diff options
Diffstat (limited to 'kernel/locking')
35 files changed, 5758 insertions, 3018 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6d11cfb9b41f..0db4093d17b8 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,16 +5,16 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) endif +obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) @@ -24,8 +24,8 @@ obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o -obj-$(CONFIG_RT_MUTEXES) += rtmutex.o -obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o +obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c new file mode 100644 index 000000000000..810b50344d35 --- /dev/null +++ b/kernel/locking/irqflag-debug.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/bug.h> +#include <linux/export.h> +#include <linux/irqflags.h> + +noinstr void warn_bogus_irq_restore(void) +{ + instrumentation_begin(); + WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n"); + instrumentation_end(); +} +EXPORT_SYMBOL(warn_bogus_irq_restore); diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c index fa2c2f951c6b..e68d82099558 100644 --- a/kernel/locking/lock_events.c +++ b/kernel/locking/lock_events.c @@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void) struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); int i; - if (!d_counts) + if (IS_ERR(d_counts)) goto out; /* @@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void) for (i = 0; i < lockevent_num; i++) { if (skip_lockevent(lockevent_names[i])) continue; - if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, - (void *)(long)i, &fops_lockevent)) + if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent))) goto fail_undo; } - if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, d_counts, (void *)(long)LOCKEVENT_reset_cnts, - &fops_lockevent)) + &fops_lockevent))) goto fail_undo; return 0; diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 8c7e7d25f09c..a6016b91803d 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -57,4 +57,8 @@ static inline void __lockevent_add(enum lock_events event, int inc) #define lockevent_cond_inc(ev, c) #endif /* CONFIG_LOCK_EVENT_COUNTS */ + +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos); + #endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 239039d0ce21..97fb6f3f840a 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -56,13 +56,11 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ -LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */ -LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */ +LOCK_EVENT(rwsem_opt_lock) /* # of opt-acquired write locks */ LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */ LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ -LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */ -LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ +LOCK_EVENT(rwsem_rlock_steal) /* # of read locks by lock stealing */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..151bd3de5936 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -54,28 +54,77 @@ #include <linux/nmi.h> #include <linux/rcupdate.h> #include <linux/kprobes.h> +#include <linux/lockdep.h> +#include <linux/context_tracking.h> #include <asm/sections.h> #include "lockdep_internals.h" -#define CREATE_TRACE_POINTS #include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING -int prove_locking = 1; +static int prove_locking = 1; module_param(prove_locking, int, 0644); #else #define prove_locking 0 #endif #ifdef CONFIG_LOCK_STAT -int lock_stat = 1; +static int lock_stat = 1; module_param(lock_stat, int, 0644); #else #define lock_stat 0 #endif +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_lockdep_table[] = { +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_LOCK_STAT */ + { } +}; + +static __init int kernel_lockdep_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_lockdep_table); + return 0; +} +late_initcall(kernel_lockdep_sysctls_init); +#endif /* CONFIG_SYSCTL */ + +DEFINE_PER_CPU(unsigned int, lockdep_recursion); +EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); + +static __always_inline bool lockdep_enabled(void) +{ + if (!debug_locks) + return false; + + if (this_cpu_read(lockdep_recursion)) + return false; + + if (current->lockdep_recursion) + return false; + + return true; +} + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -91,19 +140,21 @@ static inline void lockdep_lock(void) { DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + __this_cpu_inc(lockdep_recursion); arch_spin_lock(&__lock); __owner = current; - current->lockdep_recursion++; } static inline void lockdep_unlock(void) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) return; - current->lockdep_recursion--; __owner = NULL; arch_spin_unlock(&__lock); + __this_cpu_dec(lockdep_recursion); } static inline bool lockdep_assert_locked(void) @@ -163,11 +214,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; unsigned long nr_zapped_classes; -#ifndef CONFIG_DEBUG_LOCKDEP -static -#endif +unsigned long max_lock_class_idx; struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; -static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); +DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { @@ -318,7 +367,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock) * elements. These elements are linked together by the lock_entry member in * struct lock_class. */ -LIST_HEAD(all_lock_classes); +static LIST_HEAD(all_lock_classes); static LIST_HEAD(free_lock_classes); /** @@ -372,6 +421,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE]; static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* + * the id of held_lock + */ +static inline u16 hlock_id(struct held_lock *hlock) +{ + BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16); + + return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS)); +} + +static inline unsigned int chain_hlock_class_idx(u16 hlock_id) +{ + return hlock_id & (MAX_LOCKDEP_KEYS - 1); +} + +/* * The hash key of the lock dependency chains is a hash itself too: * it's a hash of all locks taken up to that lock, including that lock. * It's a 64-bit hash, because it's important for the keys to be @@ -393,10 +457,15 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } +static __always_inline void lockdep_recursion_inc(void) +{ + __this_cpu_inc(lockdep_recursion); +} + static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE(--current->lockdep_recursion)) - current->lockdep_recursion = 0; + if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion))) + __this_cpu_write(lockdep_recursion, 0); } void lockdep_set_selftest_task(struct task_struct *task) @@ -585,6 +654,8 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USED_READ] = "INITIAL READ USE", + /* abused as string storage for verify_lock_unused() */ [LOCK_USAGE_STATES] = "IN-NMI", }; #endif @@ -638,7 +709,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static void __print_lock_name(struct lock_class *class) +static void __print_lock_name(struct held_lock *hlock, struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; @@ -653,18 +724,20 @@ static void __print_lock_name(struct lock_class *class) printk(KERN_CONT "#%d", class->name_version); if (class->subclass) printk(KERN_CONT "/%d", class->subclass); + if (hlock && class->print_fn) + class->print_fn(hlock->instance); } } -static void print_lock_name(struct lock_class *class) +static void print_lock_name(struct held_lock *hlock, struct lock_class *class) { char usage[LOCK_USAGE_CHARS]; get_usage_chars(class, usage); printk(KERN_CONT " ("); - __print_lock_name(class); - printk(KERN_CONT "){%s}-{%hd:%hd}", usage, + __print_lock_name(hlock, class); + printk(KERN_CONT "){%s}-{%d:%d}", usage, class->wait_type_outer ?: class->wait_type_inner, class->wait_type_inner); } @@ -701,7 +774,7 @@ static void print_lock(struct held_lock *hlock) } printk(KERN_CONT "%px", hlock->instance); - print_lock_name(lock); + print_lock_name(hlock, lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -718,7 +791,7 @@ static void lockdep_print_held_locks(struct task_struct *p) * It's not reliable to print a task's held locks if it's not sleeping * and it's not the current task. */ - if (p->state == TASK_RUNNING && p != current) + if (p != current && task_is_running(p)) return; for (i = 0; i < depth; i++) { printk(" #%d: ", i); @@ -748,20 +821,24 @@ static int very_verbose(struct lock_class *class) #ifdef __KERNEL__ static int static_obj(const void *obj) { - unsigned long start = (unsigned long) &_stext, - end = (unsigned long) &_end, - addr = (unsigned long) obj; + unsigned long addr = (unsigned long) obj; - if (arch_is_kernel_initmem_freed(addr)) - return 0; + if (is_kernel_core_data(addr)) + return 1; /* - * static variable? + * keys are allowed in the __ro_after_init section. */ - if ((addr >= start) && (addr < end)) + if (is_kernel_rodata(addr)) return 1; - if (arch_is_kernel_data(addr)) + /* + * in initdata section and used during bootup only? + * NOTE: On some platforms the initdata section is + * outside of the _stext ... _end range. + */ + if (system_state < SYSTEM_FREEING_INITMEM && + init_section_contains((void *)addr, 1)) return 1; /* @@ -801,7 +878,7 @@ static int count_matching_names(struct lock_class *new_class) } /* used from NMI context -- must be lockless */ -static __always_inline struct lock_class * +static noinstr struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; @@ -809,12 +886,14 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) struct lock_class *class; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + instrumentation_begin(); debug_locks_off(); printk(KERN_ERR "BUG: looking up invalid subclass: %u\n", subclass); printk(KERN_ERR "turning off the locking correctness validator.\n"); dump_stack(); + instrumentation_end(); return NULL; } @@ -844,14 +923,16 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - hlist_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name && - lock->key != &__lockdep_no_validate__); + WARN_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__, + "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n", + lock->name, lock->key, class->name); return class; } } @@ -889,7 +970,8 @@ static bool assign_lock_key(struct lockdep_map *lock) /* Debug-check: all keys must be persistent! */ debug_locks_off(); pr_err("INFO: trying to register non-static key.\n"); - pr_err("the code is fine but needs lockdep annotation.\n"); + pr_err("The code is fine but needs lockdep annotation, or maybe\n"); + pr_err("you didn't initialize this object before use?\n"); pr_err("turning off the locking correctness validator.\n"); dump_stack(); return false; @@ -1195,6 +1277,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; + int idx; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -1249,6 +1332,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) class->name_version = count_matching_names(class); class->wait_type_inner = lock->wait_type_inner; class->wait_type_outer = lock->wait_type_outer; + class->lock_type = lock->lock_type; /* * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: @@ -1259,6 +1343,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * of classes. */ list_move_tail(&class->lock_entry, &all_lock_classes); + idx = class - lock_classes; + if (idx > max_lock_class_idx) + max_lock_class_idx = idx; if (verbose(class)) { graph_unlock(); @@ -1320,7 +1407,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, int distance, + u16 distance, u8 dep, const struct lock_trace *trace) { struct lock_list *entry; @@ -1334,6 +1421,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; + entry->dep = dep; entry->distance = distance; entry->trace = trace; /* @@ -1349,7 +1437,7 @@ static int add_lock_to_list(struct lock_class *this, /* * For good efficiency of modular, we use power of 2 */ -#define MAX_CIRCULAR_QUEUE_SIZE 4096UL +#define MAX_CIRCULAR_QUEUE_SIZE (1UL << CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS) #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) /* @@ -1421,23 +1509,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) return (cq->rear - cq->front) & CQ_MASK; } -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) +static inline void mark_lock_accessed(struct lock_list *lock) { - unsigned long nr; + lock->class->dep_gen_id = lockdep_dependency_gen_id; +} - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ +static inline void visit_lock_entry(struct lock_list *lock, + struct lock_list *parent) +{ lock->parent = parent; - lock->class->dep_gen_id = lockdep_dependency_gen_id; } static inline unsigned long lock_accessed(struct lock_list *lock) { - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1471,87 +1555,293 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) return lock_class + offset; } +/* + * Return values of a bfs search: + * + * BFS_E* indicates an error + * BFS_R* indicates a result (match or not) + * + * BFS_EINVALIDNODE: Find a invalid node in the graph. + * + * BFS_EQUEUEFULL: The queue is full while doing the bfs. + * + * BFS_RMATCH: Find the matched node in the graph, and put that node into + * *@target_entry. + * + * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry + * _unchanged_. + */ +enum bfs_result { + BFS_EINVALIDNODE = -2, + BFS_EQUEUEFULL = -1, + BFS_RMATCH = 0, + BFS_RNOMATCH = 1, +}; + +/* + * bfs_result < 0 means error + */ +static inline bool bfs_error(enum bfs_result res) +{ + return res < 0; +} /* - * Forward- or backward-dependency search, used for both circular dependency - * checking and hardirq-unsafe/softirq-unsafe checking. + * DEP_*_BIT in lock_list::dep + * + * For dependency @prev -> @next: + * + * SR: @prev is shared reader (->read != 0) and @next is recursive reader + * (->read == 2) + * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader + * SN: @prev is shared reader and @next is non-recursive locker (->read != 2) + * EN: @prev is exclusive locker and @next is non-recursive locker + * + * Note that we define the value of DEP_*_BITs so that: + * bit0 is prev->read == 0 + * bit1 is next->read != 2 */ -static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int offset) +#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */ +#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */ +#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */ +#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */ + +#define DEP_SR_MASK (1U << (DEP_SR_BIT)) +#define DEP_ER_MASK (1U << (DEP_ER_BIT)) +#define DEP_SN_MASK (1U << (DEP_SN_BIT)) +#define DEP_EN_MASK (1U << (DEP_EN_BIT)) + +static inline unsigned int +__calc_dep_bit(struct held_lock *prev, struct held_lock *next) { + return (prev->read == 0) + ((next->read != 2) << 1); +} + +static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bit(prev, next); +} + +/* + * calculate the dep_bit for backwards edges. We care about whether @prev is + * shared and whether @next is recursive. + */ +static inline unsigned int +__calc_dep_bitb(struct held_lock *prev, struct held_lock *next) +{ + return (next->read != 2) + ((prev->read == 0) << 1); +} + +static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bitb(prev, next); +} + +/* + * Initialize a lock_list entry @lock belonging to @class as the root for a BFS + * search. + */ +static inline void __bfs_init_root(struct lock_list *lock, + struct lock_class *class) +{ + lock->class = class; + lock->parent = NULL; + lock->only_xr = 0; +} + +/* + * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the + * root for a BFS search. + * + * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure + * that <prev> -> @hlock and @hlock -> <whatever __bfs() found> is not -(*R)-> + * and -(S*)->. + */ +static inline void bfs_init_root(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read == 2); +} + +/* + * Similar to bfs_init_root() but initialize the root for backwards BFS. + * + * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure + * that <next> -> @hlock and @hlock -> <whatever backwards BFS found> is not + * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->). + */ +static inline void bfs_init_rootb(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read != 0); +} + +static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset) +{ + if (!lock || !lock->parent) + return NULL; + + return list_next_or_null_rcu(get_dep_list(lock->parent, offset), + &lock->entry, struct lock_list, entry); +} + +/* + * Breadth-First Search to find a strong path in the dependency graph. + * + * @source_entry: the source of the path we are searching for. + * @data: data used for the second parameter of @match function + * @match: match function for the search + * @target_entry: pointer to the target of a matched path + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + * + * We may have multiple edges (considering different kinds of dependencies, + * e.g. ER and SN) between two nodes in the dependency graph. But + * only the strong dependency path in the graph is relevant to deadlocks. A + * strong dependency path is a dependency path that doesn't have two adjacent + * dependencies as -(*R)-> -(S*)->, please see: + * + * Documentation/locking/lockdep-design.rst + * + * for more explanation of the definition of strong dependency paths + * + * In __bfs(), we only traverse in the strong dependency path: + * + * In lock_list::only_xr, we record whether the previous dependency only + * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we + * filter out any -(S*)-> in the current dependency and after that, the + * ->only_xr is set according to whether we only have -(*R)-> left. + */ +static enum bfs_result __bfs(struct lock_list *source_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int offset) +{ + struct circular_queue *cq = &lock_cq; + struct lock_list *lock = NULL; struct lock_list *entry; - struct lock_list *lock; struct list_head *head; - struct circular_queue *cq = &lock_cq; - int ret = 1; + unsigned int cq_depth; + bool first; lockdep_assert_locked(); - if (match(source_entry, data)) { - *target_entry = source_entry; - ret = 0; - goto exit; - } - - head = get_dep_list(source_entry, offset); - if (list_empty(head)) - goto exit; - __cq_init(cq); __cq_enqueue(cq, source_entry); - while ((lock = __cq_dequeue(cq))) { + while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) { + if (!lock->class) + return BFS_EINVALIDNODE; - if (!lock->class) { - ret = -2; - goto exit; + /* + * Step 1: check whether we already finish on this one. + * + * If we have visited all the dependencies from this @lock to + * others (iow, if we have visited all lock_list entries in + * @lock->class->locks_{after,before}) we skip, otherwise go + * and visit all the dependencies in the list and mark this + * list accessed. + */ + if (lock_accessed(lock)) + continue; + else + mark_lock_accessed(lock); + + /* + * Step 2: check whether prev dependency and this form a strong + * dependency path. + */ + if (lock->parent) { /* Parent exists, check prev dependency */ + u8 dep = lock->dep; + bool prev_only_xr = lock->parent->only_xr; + + /* + * Mask out all -(S*)-> if we only have *R in previous + * step, because -(*R)-> -(S*)-> don't make up a strong + * dependency. + */ + if (prev_only_xr) + dep &= ~(DEP_SR_MASK | DEP_SN_MASK); + + /* If nothing left, we skip */ + if (!dep) + continue; + + /* If there are only -(*R)-> left, set that for the next step */ + lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); } - head = get_dep_list(lock, offset); + /* + * Step 3: we haven't visited this and there is a strong + * dependency path to this, so check with @match. + * If @skip is provide and returns true, we skip this + * lock (and any path this lock is in). + */ + if (skip && skip(lock, data)) + continue; + + if (match(lock, data)) { + *target_entry = lock; + return BFS_RMATCH; + } + /* + * Step 4: if not match, expand the path by adding the + * forward or backwards dependencies in the search + * + */ + first = true; + head = get_dep_list(lock, offset); list_for_each_entry_rcu(entry, head, entry) { - if (!lock_accessed(entry)) { - unsigned int cq_depth; - mark_lock_accessed(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = 0; - goto exit; - } + visit_lock_entry(entry, lock); - if (__cq_enqueue(cq, entry)) { - ret = -1; - goto exit; - } - cq_depth = __cq_get_elem_count(cq); - if (max_bfs_queue_depth < cq_depth) - max_bfs_queue_depth = cq_depth; - } + /* + * Note we only enqueue the first of the list into the + * queue, because we can always find a sibling + * dependency from one (see __bfs_next()), as a result + * the space of queue is saved. + */ + if (!first) + continue; + + first = false; + + if (__cq_enqueue(cq, entry)) + return BFS_EQUEUEFULL; + + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; } } -exit: - return ret; + + return BFS_RNOMATCH; } -static inline int __bfs_forwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_forwards(struct lock_list *src_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, + return __bfs(src_entry, data, match, skip, target_entry, offsetof(struct lock_class, locks_after)); } -static inline int __bfs_backwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_backwards(struct lock_list *src_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, + return __bfs(src_entry, data, match, skip, target_entry, offsetof(struct lock_class, locks_before)); } @@ -1572,7 +1862,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) if (debug_locks_silent) return; printk("\n-> #%u", depth); - print_lock_name(target->class); + print_lock_name(NULL, target->class); printk(KERN_CONT ":\n"); print_lock_trace(target->trace, 6); } @@ -1585,6 +1875,8 @@ print_circular_lock_scenario(struct held_lock *src, struct lock_class *source = hlock_class(src); struct lock_class *target = hlock_class(tgt); struct lock_class *parent = prt->class; + int src_read = src->read; + int tgt_read = tgt->read; /* * A direct locking problem where unsafe_class lock is taken @@ -1601,28 +1893,36 @@ print_circular_lock_scenario(struct held_lock *src, */ if (parent != source) { printk("Chain exists of:\n "); - __print_lock_name(source); + __print_lock_name(src, source); printk(KERN_CONT " --> "); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT " --> "); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT "\n\n"); } printk(" Possible unsafe locking scenario:\n\n"); printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); + if (tgt_read != 0) + printk(" rlock("); + else + printk(" lock("); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); + if (src_read != 0) + printk(" rlock("); + else if (src->sync) + printk(" sync("); + else + printk(" lock("); + __print_lock_name(src, source); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1659,15 +1959,72 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, print_circular_bug_entry(entry, depth); } -static inline int class_equal(struct lock_list *entry, void *data) +/* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) { - return entry->class == data; + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ +} + +/* + * We are about to add B -> A into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong + * dependency cycle, that means: + * + * Either + * + * a) B -> A is -(E*)-> + * + * or + * + * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B) + * + * as then we don't have -(*R)-> -(S*)-> in the cycle. + */ +static inline bool hlock_conflict(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 0 || /* B -> A is -(E*)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ } static noinline void print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1714,18 +2071,18 @@ static noinline void print_bfs_bug(int ret) WARN(1, "lockdep bfs error:%d\n", ret); } -static int noop_count(struct lock_list *entry, void *data) +static bool noop_count(struct lock_list *entry, void *data) { (*(unsigned long *)data)++; - return 0; + return false; } static unsigned long __lockdep_count_forward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; - __bfs_forwards(this, (void *)&count, noop_count, &target_entry); + __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry); return count; } @@ -1734,8 +2091,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1749,9 +2105,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) static unsigned long __lockdep_count_backward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; - __bfs_backwards(this, (void *)&count, noop_count, &target_entry); + __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry); return count; } @@ -1761,8 +2117,7 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1775,46 +2130,48 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) /* * Check that the dependency graph starting at <src> can lead to - * <target> or not. Print an error and return 0 if it does. + * <target> or not. */ -static noinline int -check_path(struct lock_class *target, struct lock_list *src_entry, +static noinline enum bfs_result +check_path(struct held_lock *target, struct lock_list *src_entry, + bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - int ret; + enum bfs_result ret; - ret = __bfs_forwards(src_entry, (void *)target, class_equal, - target_entry); + ret = __bfs_forwards(src_entry, target, match, skip, target_entry); - if (unlikely(ret < 0)) + if (unlikely(bfs_error(ret))) print_bfs_bug(ret); return ret; } +static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *); + /* * Prove that the dependency graph starting at <src> can not * lead to <target>. If it can, there is a circle when adding * <target> -> <src> dependency. * - * Print an error and return 0 if it does. + * Print an error and return BFS_RMATCH if it does. */ -static noinline int +static noinline enum bfs_result check_noncircular(struct held_lock *src, struct held_lock *target, struct lock_trace **const trace) { - int ret; - struct lock_list *uninitialized_var(target_entry); - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; + enum bfs_result ret; + struct lock_list *target_entry; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); debug_atomic_inc(nr_cyclic_checks); - ret = check_path(hlock_class(target), &src_entry, &target_entry); + ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry); - if (unlikely(!ret)) { + if (unlikely(ret == BFS_RMATCH)) { if (!*trace) { /* * If save_trace fails here, the printing might @@ -1824,83 +2181,144 @@ check_noncircular(struct held_lock *src, struct held_lock *target, *trace = save_trace(); } - print_circular_bug(&src_entry, target_entry, src, target); + if (src->class_idx == target->class_idx) + print_deadlock_bug(current, src, target); + else + print_circular_bug(&src_entry, target_entry, src, target); } return ret; } -#ifdef CONFIG_LOCKDEP_SMALL +#ifdef CONFIG_TRACE_IRQFLAGS + /* - * Check that the dependency graph starting at <src> can lead to - * <target> or not. If it can, <src> -> <target> dependency is already - * in the graph. + * Forwards and backwards subgraph searching, for the purposes of + * proving that two subgraphs can be connected by a new dependency + * without creating any illegal irq-safe -> irq-unsafe lock dependency. + * + * A irq safe->unsafe deadlock happens with the following conditions: + * + * 1) We have a strong dependency path A -> ... -> B * - * Print an error and return 2 if it does or 1 if it does not. + * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore + * irq can create a new dependency B -> A (consider the case that a holder + * of B gets interrupted by an irq whose handler will try to acquire A). + * + * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a + * strong circle: + * + * For the usage bits of B: + * a) if A -> B is -(*N)->, then B -> A could be any type, so any + * ENABLED_IRQ usage suffices. + * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only + * ENABLED_IRQ_*_READ usage suffices. + * + * For the usage bits of A: + * c) if A -> B is -(E*)->, then B -> A could be any type, so any + * USED_IN_IRQ usage suffices. + * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only + * USED_IN_IRQ_*_READ usage suffices. */ -static noinline int -check_redundant(struct held_lock *src, struct held_lock *target) -{ - int ret; - struct lock_list *uninitialized_var(target_entry); - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; - - debug_atomic_inc(nr_redundant_checks); - - ret = check_path(hlock_class(target), &src_entry, &target_entry); - if (!ret) { - debug_atomic_inc(nr_redundant); - ret = 2; - } else if (ret < 0) - ret = 0; - - return ret; -} -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS - -static inline int usage_accumulate(struct lock_list *entry, void *mask) +/* + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of A should be accumulated to detect + * safe->unsafe bugs. + * + * Note that usage_accumulate() is used in backwards search, so ->only_xr + * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true). + * + * As above, if only_xr is false, which means A -> B has -(E*)-> dependency + * path, any usage of A should be considered. Otherwise, we should only + * consider _READ usage. + */ +static inline bool usage_accumulate(struct lock_list *entry, void *mask) { - *(unsigned long *)mask |= entry->class->usage_mask; + if (!entry->only_xr) + *(unsigned long *)mask |= entry->class->usage_mask; + else /* Mask out _READ usage bits */ + *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ); - return 0; + return false; } /* - * Forwards and backwards subgraph searching, for the purposes of - * proving that two subgraphs can be connected by a new dependency - * without creating any illegal irq-safe -> irq-unsafe lock dependency. + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of B conflicts with the usage bits of A, + * i.e. which usage bit of B may introduce safe->unsafe deadlocks. + * + * As above, if only_xr is false, which means A -> B has -(*N)-> dependency + * path, any usage of B should be considered. Otherwise, we should only + * consider _READ usage. */ +static inline bool usage_match(struct lock_list *entry, void *mask) +{ + if (!entry->only_xr) + return !!(entry->class->usage_mask & *(unsigned long *)mask); + else /* Mask out _READ usage bits */ + return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask); +} -static inline int usage_match(struct lock_list *entry, void *mask) +static inline bool usage_skip(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & *(unsigned long *)mask; + if (entry->class->lock_type == LD_LOCK_NORMAL) + return false; + + /* + * Skip local_lock() for irq inversion detection. + * + * For !RT, local_lock() is not a real lock, so it won't carry any + * dependency. + * + * For RT, an irq inversion happens when we have lock A and B, and on + * some CPU we can have: + * + * lock(A); + * <interrupted> + * lock(B); + * + * where lock(B) cannot sleep, and we have a dependency B -> ... -> A. + * + * Now we prove local_lock() cannot exist in that dependency. First we + * have the observation for any lock chain L1 -> ... -> Ln, for any + * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise + * wait context check will complain. And since B is not a sleep lock, + * therefore B.inner_wait_type >= 2, and since the inner_wait_type of + * local_lock() is 3, which is greater than 2, therefore there is no + * way the local_lock() exists in the dependency B -> ... -> A. + * + * As a result, we will skip local_lock(), when we search for irq + * inversion bugs. + */ + if (entry->class->lock_type == LD_LOCK_PERCPU && + DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) + return false; + + /* + * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually + * a lock and only used to override the wait_type. + */ + + return true; } /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. * - * Return 0 if such a node exists in the subgraph, and put that node + * Return BFS_MATCH if such a node exists in the subgraph, and put that node * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int +static enum bfs_result find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, &usage_mask, usage_match, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry); return result; } @@ -1908,22 +2326,16 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask, /* * Find a node in the backwards-direction dependency sub-graph starting * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int +static enum bfs_result find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, &usage_mask, usage_match, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry); return result; } @@ -1933,13 +2345,13 @@ static void print_lock_class_header(struct lock_class *class, int depth) int bit; printk("%*s->", depth, ""); - print_lock_name(class); + print_lock_name(NULL, class); #ifdef CONFIG_DEBUG_LOCKDEP printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); #endif printk(KERN_CONT " {\n"); - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + for (bit = 0; bit < LOCK_TRACE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; @@ -1955,7 +2367,56 @@ static void print_lock_class_header(struct lock_class *class, int depth) } /* - * printk the shortest lock dependencies from @start to @end in reverse order: + * Dependency path printing: + * + * After BFS we get a lock dependency path (linked via ->parent of lock_list), + * printing out each lock in the dependency path will help on understanding how + * the deadlock could happen. Here are some details about dependency path + * printing: + * + * 1) A lock_list can be either forwards or backwards for a lock dependency, + * for a lock dependency A -> B, there are two lock_lists: + * + * a) lock_list in the ->locks_after list of A, whose ->class is B and + * ->links_to is A. In this case, we can say the lock_list is + * "A -> B" (forwards case). + * + * b) lock_list in the ->locks_before list of B, whose ->class is A + * and ->links_to is B. In this case, we can say the lock_list is + * "B <- A" (bacwards case). + * + * The ->trace of both a) and b) point to the call trace where B was + * acquired with A held. + * + * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't + * represent a certain lock dependency, it only provides an initial entry + * for BFS. For example, BFS may introduce a "helper" lock_list whose + * ->class is A, as a result BFS will search all dependencies starting with + * A, e.g. A -> B or A -> C. + * + * The notation of a forwards helper lock_list is like "-> A", which means + * we should search the forwards dependencies starting with "A", e.g A -> B + * or A -> C. + * + * The notation of a bacwards helper lock_list is like "<- B", which means + * we should search the backwards dependencies ending with "B", e.g. + * B <- A or B <- C. + */ + +/* + * printk the shortest lock dependencies from @root to @leaf in reverse order. + * + * We have a lock dependency path as follow: + * + * @root @leaf + * | | + * V V + * ->parent ->parent + * | lock_list | <--------- | lock_list | ... | lock_list | <--------- | lock_list | + * | -> L1 | | L1 -> L2 | ... |Ln-2 -> Ln-1| | Ln-1 -> Ln| + * + * , so it's natural that we start from @leaf and print every ->class and + * ->trace until we reach the @root. */ static void __used print_shortest_lock_dependencies(struct lock_list *leaf, @@ -1983,6 +2444,61 @@ print_shortest_lock_dependencies(struct lock_list *leaf, } while (entry && (depth >= 0)); } +/* + * printk the shortest lock dependencies from @leaf to @root. + * + * We have a lock dependency path (from a backwards search) as follow: + * + * @leaf @root + * | | + * V V + * ->parent ->parent + * | lock_list | ---------> | lock_list | ... | lock_list | ---------> | lock_list | + * | L2 <- L1 | | L3 <- L2 | ... | Ln <- Ln-1 | | <- Ln | + * + * , so when we iterate from @leaf to @root, we actually print the lock + * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order. + * + * Another thing to notice here is that ->class of L2 <- L1 is L1, while the + * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call + * trace of L1 in the dependency path, which is alright, because most of the + * time we can figure out where L1 is held from the call trace of L2. + */ +static void __used +print_shortest_lock_dependencies_backwards(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + const struct lock_trace *trace = NULL; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + if (trace) { + printk("%*s ... acquired at:\n", depth, ""); + print_lock_trace(trace, 2); + printk("\n"); + } + + /* + * Record the pointer to the trace for the next lock_list + * entry, see the comments for the function. + */ + trace = entry->trace; + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad path found in chain graph\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); +} + static void print_irq_lock_scenario(struct lock_list *safe_entry, struct lock_list *unsafe_entry, @@ -2011,11 +2527,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry, */ if (middle_class != unsafe_class) { printk("Chain exists of:\n "); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT " --> "); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT " --> "); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT "\n\n"); } @@ -2023,18 +2539,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); printk(" lock("); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT ");\n"); printk(" local_irq_disable();\n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2062,29 +2578,29 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, + lockdep_hardirqs_enabled(), curr->softirqs_enabled); print_lock(next); pr_warn("\nand this task is already holding:\n"); print_lock(prev); pr_warn("which would create a new lock dependency:\n"); - print_lock_name(hlock_class(prev)); + print_lock_name(prev, hlock_class(prev)); pr_cont(" ->"); - print_lock_name(hlock_class(next)); + print_lock_name(next, hlock_class(next)); pr_cont("\n"); pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); - print_lock_name(backwards_entry->class); + print_lock_name(NULL, backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_entry->class); + print_lock_name(NULL, forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); @@ -2097,10 +2613,7 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); - prev_root->trace = save_trace(); - if (!prev_root->trace) - return; - print_shortest_lock_dependencies(backwards_entry, prev_root); + print_shortest_lock_dependencies_backwards(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); @@ -2179,17 +2692,39 @@ static unsigned long invert_dir_mask(unsigned long mask) } /* - * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all - * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). - * And then mask out all bitnr0. + * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ + * usage may cause deadlock too, for example: + * + * P1 P2 + * <irq disabled> + * write_lock(l1); <irq enabled> + * read_lock(l2); + * write_lock(l2); + * <in irq> + * read_lock(l1); + * + * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2 + * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible + * deadlock. + * + * In fact, all of the following cases may cause deadlocks: + * + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ + * + * As a result, to calculate the "exclusive mask", first we invert the + * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with + * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all + * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*). */ static unsigned long exclusive_mask(unsigned long mask) { unsigned long excl = invert_dir_mask(mask); - /* Strip read */ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; - excl &= ~LOCKF_IRQ_READ; + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; } @@ -2206,6 +2741,7 @@ static unsigned long original_mask(unsigned long mask) unsigned long excl = invert_dir_mask(mask); /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; @@ -2220,14 +2756,24 @@ static int find_exclusive_match(unsigned long mask, enum lock_usage_bit *bitp, enum lock_usage_bit *excl_bitp) { - int bit, excl; + int bit, excl, excl_read; for_each_set_bit(bit, &mask, LOCK_USED) { + /* + * exclusive_bit() strips the read bit, however, + * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need + * to search excl | LOCK_USAGE_READ_MASK as well. + */ excl = exclusive_bit(bit); + excl_read = excl | LOCK_USAGE_READ_MASK; if (excl_mask & lock_flag(excl)) { *bitp = bit; *excl_bitp = excl; return 0; + } else if (excl_mask & lock_flag(excl_read)) { + *bitp = bit; + *excl_bitp = excl_read; + return 0; } } return -1; @@ -2244,20 +2790,19 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, { unsigned long usage_mask = 0, forward_mask, backward_mask; enum lock_usage_bit forward_bit = 0, backward_bit = 0; - struct lock_list *uninitialized_var(target_entry1); - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry1; + struct lock_list *target_entry; struct lock_list this, that; - int ret; + enum bfs_result ret; /* * Step 1: gather all hard/soft IRQs usages backward in an * accumulated usage mask. */ - this.parent = NULL; - this.class = hlock_class(prev); + bfs_init_rootb(&this, prev); - ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); - if (ret < 0) { + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL); + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } @@ -2272,30 +2817,39 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, */ forward_mask = exclusive_mask(usage_mask); - that.parent = NULL; - that.class = hlock_class(next); + bfs_init_root(&that, next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; /* * Step 3: we found a bad match! Now retrieve a lock from the backward * list whose usage mask matches the exclusive usage mask from the * lock found on the forward list. + * + * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering + * the follow case: + * + * When trying to add A -> B to the graph, we find that there is a + * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M, + * that B -> ... -> M. However M is **softirq-safe**, if we use exact + * invert bits of M's usage_mask, we will find another lock N that is + * **softirq-unsafe** and N -> ... -> A, however N -> .. -> M will not + * cause a inversion deadlock. */ - backward_mask = original_mask(target_entry1->class->usage_mask); + backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL); ret = find_usage_backwards(&this, backward_mask, &target_entry); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (DEBUG_LOCKS_WARN_ON(ret == 1)) + if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH)) return 1; /* @@ -2324,8 +2878,68 @@ static inline int check_irq_usage(struct task_struct *curr, { return 1; } + +static inline bool usage_skip(struct lock_list *entry, void *mask) +{ + return false; +} + #endif /* CONFIG_TRACE_IRQFLAGS */ +#ifdef CONFIG_LOCKDEP_SMALL +/* + * Check that the dependency graph starting at <src> can lead to + * <target> or not. If it can, <src> -> <target> dependency is already + * in the graph. + * + * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if + * any error appears in the bfs search. + */ +static noinline enum bfs_result +check_redundant(struct held_lock *src, struct held_lock *target) +{ + enum bfs_result ret; + struct lock_list *target_entry; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); + /* + * Special setup for check_redundant(). + * + * To report redundant, we need to find a strong dependency path that + * is equal to or stronger than <src> -> <target>. So if <src> is E, + * we need to let __bfs() only search for a path starting at a -(E*)->, + * we achieve this by setting the initial node's ->only_xr to true in + * that case. And if <prev> is S, we set initial ->only_xr to false + * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. + */ + src_entry.only_xr = src->read == 0; + + debug_atomic_inc(nr_redundant_checks); + + /* + * Note: we skip local_lock() for redundant check, because as the + * comment in usage_skip(), A -> local_lock() -> B and A -> B are not + * the same. + */ + ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry); + + if (ret == BFS_RMATCH) + debug_atomic_inc(nr_redundant); + + return ret; +} + +#else + +static inline enum bfs_result +check_redundant(struct held_lock *src, struct held_lock *target) +{ + return BFS_RNOMATCH; +} + +#endif + static void inc_chains(int irq_context) { if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) @@ -2356,10 +2970,10 @@ print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(prev); + __print_lock_name(prv, prev); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(next); + __print_lock_name(nxt, next); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); printk(" May be due to missing lock nesting notation\n\n"); @@ -2369,6 +2983,8 @@ static void print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { + struct lock_class *class = hlock_class(prev); + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; @@ -2383,6 +2999,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); + if (class->cmp_fn) { + pr_warn("and the lock comparison function returns %i:\n", + class->cmp_fn(prev->instance, next->instance)); + } + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); @@ -2397,11 +3018,14 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * (Note that this has to be done separately, because the graph cannot * detect such classes of deadlocks.) * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same + * lock class is held but nest_lock is also held, i.e. we rely on the + * nest_lock to avoid the deadlock. */ static int check_deadlock(struct task_struct *curr, struct held_lock *next) { + struct lock_class *class; struct held_lock *prev; struct held_lock *nest = NULL; int i; @@ -2420,7 +3044,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) * lock class (i.e. read_lock(lock)+read_lock(lock)): */ if ((next->read == 2) && prev->read) - return 2; + continue; + + class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + continue; /* * We're holding the nest_lock, which serializes this lock's @@ -2459,11 +3089,11 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, + struct held_lock *next, u16 distance, struct lock_trace **const trace) { struct lock_list *entry; - int ret; + enum bfs_result ret; if (!hlock_class(prev)->key || !hlock_class(next)->key) { /* @@ -2483,6 +3113,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 2; } + if (prev->class_idx == next->class_idx) { + struct lock_class *class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + return 2; + } + /* * Prove that the new <prev> -> <next> dependency would not * create a circular dependency in the graph. (We do this by @@ -2494,23 +3132,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * in the graph whose neighbours are to be checked. */ ret = check_noncircular(next, prev, trace); - if (unlikely(ret <= 0)) + if (unlikely(bfs_error(ret) || ret == BFS_RMATCH)) return 0; if (!check_irq_usage(curr, prev, next)) return 0; /* - * For recursive read-locks we do all the dependency checks, - * but we dont store read-triggered dependencies (only - * write-triggered dependencies). This ensures that only the - * write-side dependencies matter, and that if for example a - * write-lock never takes any other locks, then the reads are - * equivalent to a NOP. - */ - if (next->read == 2 || prev->read == 2) - return 1; - /* * Is the <prev> -> <next> dependency already present? * * (this may occur even though this is a new chain: consider @@ -2522,18 +3150,46 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 1; + entry->dep |= calc_dep(prev, next); + + /* + * Also, update the reverse dependency in @next's + * ->locks_before list. + * + * Here we reuse @entry as the cursor, which is fine + * because we won't go to the next iteration of the + * outer loop: + * + * For normal cases, we return in the inner loop. + * + * If we fail to return, we have inconsistency, i.e. + * <prev>::locks_after contains <next> while + * <next>::locks_before doesn't contain <prev>. In + * that case, we return after the inner and indicate + * something is wrong. + */ + list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) { + if (entry->class == hlock_class(prev)) { + if (distance == 1) + entry->distance = 1; + entry->dep |= calc_depb(prev, next); + return 1; + } + } + + /* <prev> is not found in <next>::locks_before */ + return 0; } } -#ifdef CONFIG_LOCKDEP_SMALL /* * Is the <prev> -> <next> link redundant? */ ret = check_redundant(prev, next); - if (ret != 1) - return ret; -#endif + if (bfs_error(ret)) + return 0; + else if (ret == BFS_RMATCH) + return 2; if (!*trace) { *trace = save_trace(); @@ -2546,15 +3202,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * to the previous lock's dependency list: */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), - &hlock_class(prev)->locks_after, - next->acquire_ip, distance, *trace); + &hlock_class(prev)->locks_after, distance, + calc_dep(prev, next), *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), - &hlock_class(next)->locks_before, - next->acquire_ip, distance, *trace); + &hlock_class(next)->locks_before, distance, + calc_depb(prev, next), *trace); if (!ret) return 0; @@ -2590,16 +3246,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) goto out_bug; for (;;) { - int distance = curr->lockdep_depth - depth + 1; + u16 distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, - &trace); + if (hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace); if (!ret) return 0; @@ -2846,7 +3497,8 @@ static int alloc_chain_hlocks(int req) size = chain_block_size(curr); if (likely(size >= req)) { del_chain_block(0, size, chain_block_next(curr)); - add_chain_block(curr + req, size - req); + if (size > req) + add_chain_block(curr + req, size - req); return curr; } } @@ -2875,7 +3527,10 @@ static inline void free_chain_hlocks(int base, int size) struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { - return lock_classes + chain_hlocks[chain->base + i]; + u16 chain_hlock = chain_hlocks[chain->base + i]; + unsigned int class_idx = chain_hlock_class_idx(chain_hlock); + + return lock_classes + class_idx; } /* @@ -2901,12 +3556,12 @@ static inline int get_first_held_lock(struct task_struct *curr, /* * Returns the next chain_key iteration */ -static u64 print_chain_key_iteration(int class_idx, u64 chain_key) +static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key) { - u64 new_chain_key = iterate_chain_key(chain_key, class_idx); + u64 new_chain_key = iterate_chain_key(chain_key, hlock_id); - printk(" class_idx:%d -> chain_key:%016Lx", - class_idx, + printk(" hlock_id:%d -> chain_key:%016Lx", + (unsigned int)hlock_id, (unsigned long long)new_chain_key); return new_chain_key; } @@ -2923,12 +3578,12 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne hlock_next->irq_context); for (; i < depth; i++) { hlock = curr->held_locks + i; - chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); + chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key); print_lock(hlock); } - print_chain_key_iteration(hlock_next->class_idx, chain_key); + print_chain_key_iteration(hlock_id(hlock_next), chain_key); print_lock(hlock_next); } @@ -2936,14 +3591,14 @@ static void print_chain_keys_chain(struct lock_chain *chain) { int i; u64 chain_key = INITIAL_CHAIN_KEY; - int class_id; + u16 hlock_id; printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { - class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id, chain_key); + hlock_id = chain_hlocks[chain->base + i]; + chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + class_id); + print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id)); printk("\n"); } } @@ -2992,7 +3647,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx; + id = hlock_id(&curr->held_locks[i]); if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -3041,7 +3696,6 @@ static inline int add_chain_cache(struct task_struct *curr, struct held_lock *hlock, u64 chain_key) { - struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; int i, j; @@ -3084,11 +3738,11 @@ static inline int add_chain_cache(struct task_struct *curr, chain->base = j; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx; + int lock_id = hlock_id(curr->held_locks + i); chain_hlocks[chain->base + j] = lock_id; } - chain_hlocks[chain->base + j] = class - lock_classes; + chain_hlocks[chain->base + j] = hlock_id(hlock); hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(chain->irq_context); @@ -3204,15 +3858,12 @@ static int validate_chain(struct task_struct *curr, if (!ret) return 0; /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: + * of the chain, and if the new lock introduces no more + * lock dependency (because we already hold a lock with the + * same lock class) nor deadlock (because the nest_lock + * serializes nesting locks), see the comments for + * check_deadlock(). */ if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) @@ -3275,7 +3926,7 @@ static void check_chain_key(struct task_struct *curr) if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = INITIAL_CHAIN_KEY; - chain_key = iterate_chain_key(chain_key, hlock->class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -3304,11 +3955,11 @@ static void print_usage_bug_scenario(struct held_lock *lock) printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -3317,7 +3968,7 @@ static void print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { - if (!debug_locks_off_graph_unlock() || debug_locks_silent) + if (!debug_locks_off() || debug_locks_silent) return; pr_warn("\n"); @@ -3331,9 +3982,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), lockdep_softirqs_enabled(curr)); print_lock(this); @@ -3358,6 +4009,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) { + graph_unlock(); print_usage_bug(curr, this, bad_bit, new_bit); return 0; } @@ -3393,7 +4045,7 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other->class); + print_lock_name(NULL, other->class); pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); pr_warn("\nother info that might help us debug this:\n"); @@ -3434,24 +4086,32 @@ print_irq_inversion_bug(struct task_struct *curr, */ static int check_usage_forwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { - int ret; + enum bfs_result ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + bfs_init_root(&root, this); + ret = find_usage_forwards(&root, usage_mask, &target_entry); + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; + + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(read_bit)); + } - print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); return 0; } @@ -3461,42 +4121,52 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, */ static int check_usage_backwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { - int ret; + enum bfs_result ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + bfs_init_rootb(&root, this); + ret = find_usage_backwards(&root, usage_mask, &target_entry); + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; + + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(read_bit)); + } - print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); return 0; } void print_irqtrace_events(struct task_struct *curr) { - printk("irq event stamp: %u\n", curr->irq_events); + const struct irqtrace_events *trace = &curr->irqtrace; + + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", - curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, - (void *)curr->hardirq_enable_ip); + trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, + (void *)trace->hardirq_enable_ip); printk("hardirqs last disabled at (%u): [<%px>] %pS\n", - curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, - (void *)curr->hardirq_disable_ip); + trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip, + (void *)trace->hardirq_disable_ip); printk("softirqs last enabled at (%u): [<%px>] %pS\n", - curr->softirq_enable_event, (void *)curr->softirq_enable_ip, - (void *)curr->softirq_enable_ip); + trace->softirq_enable_event, (void *)trace->softirq_enable_ip, + (void *)trace->softirq_enable_ip); printk("softirqs last disabled at (%u): [<%px>] %pS\n", - curr->softirq_disable_event, (void *)curr->softirq_disable_ip, - (void *)curr->softirq_disable_ip); + trace->softirq_disable_event, (void *)trace->softirq_disable_ip, + (void *)trace->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3515,8 +4185,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -#define STRICT_READ_CHECKS 1 - static int (*state_verbose_f[])(struct lock_class *class) = { #define LOCKDEP_STATE(__STATE) \ __STATE##_verbose, @@ -3542,16 +4210,6 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int dir = new_bit & LOCK_USAGE_DIR_MASK; /* - * mark USED_IN has to look forwards -- to ensure no dependency - * has ENABLED state, which would allow recursion deadlocks. - * - * mark ENABLED has to look backwards -- to ensure no dependee - * has USED_IN state, which, again, would allow recursion deadlocks. - */ - check_usage_f usage = dir ? - check_usage_backwards : check_usage_forwards; - - /* * Validate that this particular lock does not have conflicting * usage states. */ @@ -3559,23 +4217,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 0; /* - * Validate that the lock dependencies don't have conflicting usage - * states. + * Check for read in write conflicts */ - if ((!read || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) + if (!read && !valid_state(curr, this, new_bit, + excl_bit + LOCK_USAGE_READ_MASK)) return 0; + /* - * Check for read in write conflicts + * Validate that the lock dependencies don't have conflicting usage + * states. */ - if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK)) + if (dir) { + /* + * mark ENABLED has to look backwards -- to ensure no dependee + * has USED_IN state, which, again, would allow recursion deadlocks. + */ + if (!check_usage_backwards(curr, this, excl_bit)) return 0; - - if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, - state_name(new_bit + LOCK_USAGE_READ_MASK))) + } else { + /* + * mark USED_IN has to look forwards -- to ensure no dependency + * has ENABLED state, which would allow recursion deadlocks. + */ + if (!check_usage_forwards(curr, this, excl_bit)) return 0; } @@ -3637,19 +4302,27 @@ static void __trace_hardirqs_on_caller(void) /** * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts - * @ip: Caller address * * Invoked before a possible transition to RCU idle from exit to user or * guest mode. This ensures that all RCU operations are done before RCU * stops watching. After the RCU transition lockdep_hardirqs_on() has to be * invoked to set the final state. */ -void lockdep_hardirqs_on_prepare(unsigned long ip) +void lockdep_hardirqs_on_prepare(void) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * NMIs do not (and cannot) track lock dependencies, nothing to do. + */ + if (unlikely(in_nmi())) return; - if (unlikely(current->hardirqs_enabled)) { + if (unlikely(this_cpu_read(lockdep_recursion))) + return; + + if (unlikely(lockdep_hardirqs_enabled())) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3677,12 +4350,12 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. */ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())) return; current->hardirq_chain_key = current->curr_chain_key; - current->lockdep_recursion++; + lockdep_recursion_inc(); __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } @@ -3690,12 +4363,35 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); void noinstr lockdep_hardirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = ¤t->irqtrace; + + if (unlikely(!debug_locks)) + return; + + /* + * NMIs can happen in the middle of local_irq_{en,dis}able() where the + * tracking state and hardware state are out of sync. + * + * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from, + * and not rely on hardware state like normal interrupts. + */ + if (unlikely(in_nmi())) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + /* + * Skip: + * - recursion check, because NMI can hit lockdep; + * - hardware state check, because above; + * - chain_key check, see lockdep_hardirqs_on_prepare(). + */ + goto skip_checks; + } + + if (unlikely(this_cpu_read(lockdep_recursion))) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled()) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3720,10 +4416,11 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != current->curr_chain_key); +skip_checks: /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; + __this_cpu_write(hardirqs_enabled, 1); + trace->hardirq_enable_ip = ip; + trace->hardirq_enable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_on_events); } EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); @@ -3733,9 +4430,18 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); */ void noinstr lockdep_hardirqs_off(unsigned long ip) { - struct task_struct *curr = current; + if (unlikely(!debug_locks)) + return; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + /* + * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep; + * they will restore the software state. This ensures the software + * state is consistent inside NMIs as well. + */ + if (in_nmi()) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + } else if (__this_cpu_read(lockdep_recursion)) return; /* @@ -3745,13 +4451,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled()) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; + __this_cpu_write(hardirqs_enabled, 0); + trace->hardirq_disable_ip = ip; + trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); } else { debug_atomic_inc(redundant_hardirqs_off); @@ -3764,9 +4472,9 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); */ void lockdep_softirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = ¤t->irqtrace; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -3776,26 +4484,26 @@ void lockdep_softirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { debug_atomic_inc(redundant_softirqs_on); return; } - current->lockdep_recursion++; + lockdep_recursion_inc(); /* * We'll do an OFF -> ON transition: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; + current->softirqs_enabled = 1; + trace->softirq_enable_ip = ip; + trace->softirq_enable_event = ++trace->irq_events; debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the * usage bit for all held locks, if hardirqs are * enabled too: */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); + if (lockdep_hardirqs_enabled()) + mark_held_locks(current, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3804,9 +4512,7 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - struct task_struct *curr = current; - - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -3815,13 +4521,15 @@ void lockdep_softirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; + current->softirqs_enabled = 0; + trace->softirq_disable_ip = ip; + trace->softirq_disable_event = ++trace->irq_events; debug_atomic_inc(softirqs_off_events); /* * Whoops, we wanted softirqs off, so why aren't they? @@ -3843,7 +4551,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3852,7 +4560,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3860,7 +4568,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) return 0; } } - if (!hlock->hardirqs_off) { + + /* + * For lock_sync(), don't mark the ENABLED usage, since lock_sync() + * creates no critical section and no extra dependency can be introduced + * by interrupts + */ + if (!hlock->hardirqs_off && !hlock->sync) { if (hlock->read) { if (!mark_lock(curr, hlock, LOCK_ENABLED_HARDIRQ_READ)) @@ -3890,7 +4604,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -3923,13 +4637,18 @@ static int separate_irq_context(struct task_struct *curr, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - unsigned int new_mask = 1 << new_bit, ret = 1; + unsigned int new_mask, ret = 1; if (new_bit >= LOCK_USAGE_STATES) { DEBUG_LOCKS_WARN_ON(1); return 0; } + if (new_bit == LOCK_USED && this->read) + new_bit = LOCK_USED_READ; + + new_mask = 1 << new_bit; + /* * If already set then do not dirty the cacheline, * nor do any checks: @@ -3942,26 +4661,26 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Make sure we didn't race: */ - if (unlikely(hlock_class(this)->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } + if (unlikely(hlock_class(this)->usage_mask & new_mask)) + goto unlock; + + if (!hlock_class(this)->usage_mask) + debug_atomic_dec(nr_unused_locks); hlock_class(this)->usage_mask |= new_mask; - if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) - return 0; + if (new_bit < LOCK_TRACE_STATES) { + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) + return 0; + } - switch (new_bit) { - case LOCK_USED: - debug_atomic_dec(nr_unused_locks); - break; - default: + if (new_bit < LOCK_USED) { ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; } +unlock: graph_unlock(); /* @@ -3983,7 +4702,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (curr->hardirq_context) { + if (lockdep_hardirq_context()) { /* * Check if force_irqthreads will run us threaded. */ @@ -4037,7 +4756,7 @@ print_lock_invalid_wait_context(struct task_struct *curr, /* * Verify the wait_type context. * - * This check validates we takes locks in the right wait-type order; that is it + * This check validates we take locks in the right wait-type order; that is it * ensures that we do not take mutexes inside spinlocks and do not attempt to * acquire spinlocks inside raw_spinlocks and the sort. * @@ -4051,12 +4770,12 @@ print_lock_invalid_wait_context(struct task_struct *curr, */ static int check_wait_context(struct task_struct *curr, struct held_lock *next) { - short next_inner = hlock_class(next)->wait_type_inner; - short next_outer = hlock_class(next)->wait_type_outer; - short curr_inner; + u8 next_inner = hlock_class(next)->wait_type_inner; + u8 next_outer = hlock_class(next)->wait_type_outer; + u8 curr_inner; int depth; - if (!curr->lockdep_depth || !next_inner || next->trylock) + if (!next_inner || next->trylock) return 0; if (!next_outer) @@ -4076,7 +4795,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) for (; depth < curr->lockdep_depth; depth++) { struct held_lock *prev = curr->held_locks + depth; - short prev_inner = hlock_class(prev)->wait_type_inner; + struct lock_class *class = hlock_class(prev); + u8 prev_inner = class->wait_type_inner; if (prev_inner) { /* @@ -4086,6 +4806,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) * Also due to trylocks. */ curr_inner = min(curr_inner, prev_inner); + + /* + * Allow override for annotations -- this is typically + * only valid/needed for code that only exists when + * CONFIG_PREEMPT_RT=n. + */ + if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE)) + curr_inner = prev_inner; } } @@ -4125,9 +4853,9 @@ static inline int check_wait_context(struct task_struct *curr, /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, +void lockdep_init_map_type(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, - short inner, short outer) + u8 inner, u8 outer, u8 lock_type) { int i; @@ -4150,6 +4878,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, lock->wait_type_outer = outer; lock->wait_type_inner = inner; + lock->lock_type = lock_type; /* * No key, no joy, we need to hash something. @@ -4174,25 +4903,51 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, if (subclass) { unsigned long flags; - if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); register_lock_class(lock, subclass, 1); lockdep_recursion_finish(); raw_local_irq_restore(flags); } } -EXPORT_SYMBOL_GPL(lockdep_init_map_waits); +EXPORT_SYMBOL_GPL(lockdep_init_map_type); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); +#ifdef CONFIG_PROVE_LOCKING +void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn, + lock_print_fn print_fn) +{ + struct lock_class *class = lock->class_cache[0]; + unsigned long flags; + + raw_local_irq_save(flags); + lockdep_recursion_inc(); + + if (!class) + class = register_lock_class(lock, 0, 0); + + if (class) { + WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn); + WARN_ON(class->print_fn && class->print_fn != print_fn); + + class->cmp_fn = cmp_fn; + class->print_fn = print_fn; + } + + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn); +#endif + static void print_lock_nested_lock_not_held(struct task_struct *curr, - struct held_lock *hlock, - unsigned long ip) + struct held_lock *hlock) { if (!debug_locks_off()) return; @@ -4234,7 +4989,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read); static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip, - int references, int pin_count) + int references, int pin_count, int sync) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -4285,7 +5040,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes; - if (depth) { /* we're holding locks */ + if (depth && !sync) { + /* we're holding locks and the new held lock is not a sync */ hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (!references) @@ -4319,6 +5075,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->trylock = trylock; hlock->read = read; hlock->check = check; + hlock->sync = !!sync; hlock->hardirqs_off = !!hardirqs_off; hlock->references = references; #ifdef CONFIG_LOCK_STAT @@ -4365,10 +5122,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); if (nest_lock && !__lock_is_held(nest_lock, -1)) { - print_lock_nested_lock_not_held(curr, hlock, ip); + print_lock_nested_lock_not_held(curr, hlock); return 0; } @@ -4380,6 +5137,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, hlock, chain_head, chain_key)) return 0; + /* For lock_sync(), we are done here since no actual critical section */ + if (hlock->sync) + return 1; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -4521,7 +5282,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) { + hlock->references, hlock->pin_count, 0)) { case 0: return 1; case 1: @@ -4565,9 +5326,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; } - lockdep_init_map_waits(lock, name, key, 0, - lock->wait_type_inner, - lock->wait_type_outer); + lockdep_init_map_type(lock, name, key, 0, + lock->wait_type_inner, + lock->wait_type_outer, + lock->lock_type); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes; @@ -4731,14 +5493,14 @@ int __lock_is_held(const struct lockdep_map *lock, int read) struct held_lock *hlock = curr->held_locks + i; if (match_held_lock(hlock, lock)) { - if (read == -1 || hlock->read == read) - return 1; + if (read == -1 || !!hlock->read == read) + return LOCK_STATE_HELD; - return 0; + return LOCK_STATE_NOT_HELD; } } - return 0; + return LOCK_STATE_NOT_HELD; } static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) @@ -4759,7 +5521,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) * be guessable and still allows some pin nesting in * our u32 pin_count. */ - cookie.val = 1 + (prandom_u32() >> 16); + cookie.val = 1 + (sched_clock() & 0xffff); hlock->pin_count += cookie.val; return cookie; } @@ -4819,22 +5581,26 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie /* * Check whether we follow the irq-flags state precisely: */ -static void check_flags(unsigned long flags) +static noinstr void check_flags(unsigned long flags) { #if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) if (!debug_locks) return; + /* Get the warning out.. */ + instrumentation_begin(); + if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-on.\n"); } } +#ifndef CONFIG_PREEMPT_RT /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only @@ -4849,9 +5615,12 @@ static void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); + + instrumentation_end(); #endif } @@ -4861,11 +5630,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); @@ -4878,11 +5647,11 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); @@ -4896,12 +5665,20 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock { #ifdef CONFIG_PROVE_LOCKING struct lock_class *class = look_up_lock_class(lock, subclass); + unsigned long mask = LOCKF_USED; /* if it doesn't have a class (yet), it certainly hasn't been used yet */ if (!class) return; - if (!(class->usage_mask & LOCK_USED)) + /* + * READ locks only conflict with USED, such that if we only ever use + * READ locks, there is no deadlock possible -- RCU. + */ + if (!hlock->read) + mask |= LOCKF_USED_READ; + + if (!(class->usage_mask & mask)) return; hlock->class_idx = class - lock_classes; @@ -4912,7 +5689,7 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock static bool lockdep_nmi(void) { - if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + if (raw_cpu_read(lockdep_recursion)) return false; if (!in_nmi()) @@ -4922,6 +5699,20 @@ static bool lockdep_nmi(void) } /* + * read_lock() is recursive if: + * 1. We force lockdep think this way in selftests or + * 2. The implementation is not queued read/write lock or + * 3. The locker is at an in_interrupt() context. + */ +bool read_lock_is_recursive(void) +{ + return force_read_lock_recursive || + !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) || + in_interrupt(); +} +EXPORT_SYMBOL_GPL(read_lock_is_recursive); + +/* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ @@ -4931,7 +5722,12 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) { + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + + if (!debug_locks) + return; + + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { struct held_lock hlock; @@ -4954,10 +5750,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + lockdep_recursion_inc(); __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0, 0); + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0); lockdep_recursion_finish(); raw_local_irq_restore(flags); } @@ -4967,13 +5762,15 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + trace_lock_release(lock, ip); + + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; - trace_lock_release(lock, ip); + + lockdep_recursion_inc(); if (__lock_release(lock, ip)) check_chain_key(current); lockdep_recursion_finish(); @@ -4981,18 +5778,50 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); +/* + * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API. + * + * No actual critical section is created by the APIs annotated with this: these + * APIs are used to wait for one or multiple critical sections (on other CPUs + * or threads), and it means that calling these APIs inside these critical + * sections is potential deadlock. + */ +void lock_sync(struct lockdep_map *lock, unsigned subclass, int read, + int check, struct lockdep_map *nest_lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lockdep_enabled())) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + lockdep_recursion_inc(); + __lock_acquire(lock, subclass, 0, read, check, + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1); + check_chain_key(current); + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_sync); + noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; - int ret = 0; + int ret = LOCK_STATE_NOT_HELD; - if (unlikely(current->lockdep_recursion)) - return 1; /* avoid false negative lockdep_assert_held() */ + /* + * Avoid false negative lockdep_assert_held() and + * lockdep_assert_not_held(). + */ + if (unlikely(!lockdep_enabled())) + return LOCK_STATE_UNKNOWN; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); ret = __lock_is_held(lock, read); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5007,13 +5836,13 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock) struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return cookie; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); cookie = __lock_pin_lock(lock); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5026,13 +5855,13 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_repin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5043,13 +5872,13 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_unpin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5159,8 +5988,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip); - stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -5179,16 +6006,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat || !debug_locks)) - return; + trace_lock_contended(lock, ip); - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; - trace_lock_contended(lock, ip); + lockdep_recursion_inc(); __lock_contended(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5199,15 +6024,14 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat || !debug_locks)) - return; + trace_lock_acquired(lock, ip); - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_acquired(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5246,7 +6070,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, int i; for (i = chain->base; i < chain->base + chain->depth; i++) { - if (chain_hlocks[i] != class - lock_classes) + if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes) continue; /* * Each lock class occurs at most once in a lock chain so once @@ -5319,6 +6143,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) WRITE_ONCE(class->name, NULL); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); + if (class - lock_classes == max_lock_class_idx) + max_lock_class_idx--; } else { WARN_ONCE(true, "%s() failed for class %s\n", __func__, class->name); @@ -5330,13 +6156,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) static void reinit_class(struct lock_class *class) { - void *const p = class; - const unsigned int offset = offsetof(struct lock_class, key); - WARN_ON_ONCE(!class->lock_entry.next); WARN_ON_ONCE(!list_empty(&class->locks_after)); WARN_ON_ONCE(!list_empty(&class->locks_before)); - memset(p + offset, 0, sizeof(*class) - offset); + memset_startat(class, 0, key); WARN_ON_ONCE(!class->lock_entry.next); WARN_ON_ONCE(!list_empty(&class->locks_after)); WARN_ON_ONCE(!list_empty(&class->locks_before)); @@ -5609,7 +6432,13 @@ void lockdep_reset_lock(struct lockdep_map *lock) lockdep_reset_lock_reg(lock); } -/* Unregister a dynamically allocated key. */ +/* + * Unregister a dynamically allocated key. + * + * Unlike lockdep_register_key(), a search is always done to find a matching + * key irrespective of debug_locks to avoid potential invalid access to freed + * memory in lock_class entry. + */ void lockdep_unregister_key(struct lock_class_key *key) { struct hlist_head *hash_head = keyhashentry(key); @@ -5624,10 +6453,8 @@ void lockdep_unregister_key(struct lock_class_key *key) return; raw_local_irq_save(flags); - if (!graph_lock()) - goto out_irq; + lockdep_lock(); - pf = get_pending_free(); hlist_for_each_entry_rcu(k, hash_head, hash_entry) { if (k == key) { hlist_del_rcu(&k->hash_entry); @@ -5635,11 +6462,13 @@ void lockdep_unregister_key(struct lock_class_key *key) break; } } - WARN_ON_ONCE(!found); - __lockdep_free_key_range(pf, key, 1); - call_rcu_zapped(pf); - graph_unlock(); -out_irq: + WARN_ON_ONCE(!found && debug_locks); + if (found) { + pf = get_pending_free(); + __lockdep_free_key_range(pf, key, 1); + call_rcu_zapped(pf); + } + lockdep_unlock(); raw_local_irq_restore(flags); /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ @@ -5839,6 +6668,8 @@ asmlinkage __visible void lockdep_sys_exit(void) void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; + int dl = READ_ONCE(debug_locks); + bool rcu = warn_rcu_enter(); /* Note: the following can be executed concurrently, so be careful. */ pr_warn("\n"); @@ -5848,17 +6679,16 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) pr_warn("-----------------------------\n"); pr_warn("%s:%d %s!\n", file, line, s); pr_warn("\nother info that might help us debug this:\n\n"); - pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() - ? "RCU used illegally from idle CPU!\n" - : "", - rcu_scheduler_active, debug_locks); + : "", + rcu_scheduler_active, dl, + dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n"); /* * If a CPU is in the RCU-free window in idle (ie: in the section - * between rcu_idle_enter() and rcu_idle_exit(), then RCU + * between ct_idle_enter() and ct_idle_exit(), then RCU * considers that CPU to be in an "extended quiescent state", * which means that RCU will be completely ignoring that CPU. * Therefore, rcu_read_lock() and friends have absolutely no @@ -5880,5 +6710,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index baca699b94e9..bbe9000260d0 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -19,9 +19,13 @@ enum lock_usage_bit { #include "lockdep_states.h" #undef LOCKDEP_STATE LOCK_USED, - LOCK_USAGE_STATES + LOCK_USED_READ, + LOCK_USAGE_STATES, }; +/* states after LOCK_USED_READ are not traced and printed */ +static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES); + #define LOCK_USAGE_READ_MASK 1 #define LOCK_USAGE_DIR_MASK 2 #define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) @@ -40,6 +44,7 @@ enum { #include "lockdep_states.h" #undef LOCKDEP_STATE __LOCKF(USED) + __LOCKF(USED_READ) }; #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | @@ -94,16 +99,16 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_STACK_TRACE_ENTRIES 262144UL #define STACK_TRACE_HASH_SIZE 8192 #else -#define MAX_LOCKDEP_ENTRIES 32768UL +#define MAX_LOCKDEP_ENTRIES (1UL << CONFIG_LOCKDEP_BITS) -#define MAX_LOCKDEP_CHAINS_BITS 16 +#define MAX_LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ -#define MAX_STACK_TRACE_ENTRIES 524288UL -#define STACK_TRACE_HASH_SIZE 16384 +#define MAX_STACK_TRACE_ENTRIES (1UL << CONFIG_LOCKDEP_STACK_TRACE_BITS) +#define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS) #endif /* @@ -116,10 +121,9 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) -extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) +#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1) extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); @@ -146,6 +150,10 @@ extern unsigned int nr_large_chain_blocks; extern unsigned int max_lockdep_depth; extern unsigned int max_bfs_queue_depth; +extern unsigned long max_lock_class_idx; + +extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +extern unsigned long lock_classes_in_use[]; #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); @@ -200,7 +208,6 @@ struct lockdep_stats { }; DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); -extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; #define __debug_atomic_inc(ptr) \ this_cpu_inc(lockdep_stats.ptr); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 5525cd3ba0c8..e2bfb1db589d 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -24,14 +24,33 @@ #include "lockdep_internals.h" +/* + * Since iteration of lock_classes is done without holding the lockdep lock, + * it is not safe to iterate all_lock_classes list directly as the iteration + * may branch off to free_lock_classes or the zapped list. Iteration is done + * directly on the lock_classes array by checking the lock_classes_in_use + * bitmap and max_lock_class_idx. + */ +#define iterate_lock_classes(idx, class) \ + for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \ + idx++, class++) + static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &all_lock_classes, pos); + struct lock_class *class = v; + + ++class; + *pos = class - lock_classes; + return (*pos > max_lock_class_idx) ? NULL : class; } static void *l_start(struct seq_file *m, loff_t *pos) { - return seq_list_start_head(&all_lock_classes, *pos); + unsigned long idx = *pos; + + if (idx > max_lock_class_idx) + return NULL; + return lock_classes + idx; } static void l_stop(struct seq_file *m, void *v) @@ -57,39 +76,43 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - struct lock_class *class = list_entry(v, struct lock_class, lock_entry); + struct lock_class *class = v; struct lock_list *entry; char usage[LOCK_USAGE_CHARS]; + int idx = class - lock_classes; - if (v == &all_lock_classes) { + if (v == lock_classes) seq_printf(m, "all lock classes:\n"); + + if (!test_bit(idx, lock_classes_in_use)) return 0; - } seq_printf(m, "%p", class->key); #ifdef CONFIG_DEBUG_LOCKDEP seq_printf(m, " OPS:%8ld", debug_class_ops_read(class)); #endif -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); - seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); -#endif + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); + seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); - get_usage_chars(class, usage); - seq_printf(m, " %s", usage); + get_usage_chars(class, usage); + seq_printf(m, " %s", usage); + } seq_printf(m, ": "); print_name(m, class); seq_puts(m, "\n"); - list_for_each_entry(entry, &class->locks_after, entry) { - if (entry->distance == 1) { - seq_printf(m, " -> [%p] ", entry->class->key); - print_name(m, entry->class); - seq_puts(m, "\n"); + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + list_for_each_entry(entry, &class->locks_after, entry) { + if (entry->distance == 1) { + seq_printf(m, " -> [%p] ", entry->class->key); + print_name(m, entry->class); + seq_puts(m, "\n"); + } } + seq_puts(m, "\n"); } - seq_puts(m, "\n"); return 0; } @@ -218,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING struct lock_class *class; + unsigned long idx; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; if (class->usage_mask == 0) nr_unused++; @@ -252,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps += lockdep_count_forward_deps(class); } + #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif @@ -343,12 +370,14 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " max bfs queue depth: %11u\n", max_bfs_queue_depth); #endif + seq_printf(m, " max lock class index: %11lu\n", + max_lock_class_idx); lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); /* - * Zappped classes and lockdep data buffers reuse statistics. + * Zapped classes and lockdep data buffers reuse statistics. */ seq_puts(m, "\n"); seq_printf(m, " zapped classes: %11lu\n", @@ -411,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) static void seq_time(struct seq_file *m, s64 time) { - char num[15]; + char num[22]; snprint_time(num, sizeof(num), time); seq_printf(m, " %14s", num); @@ -423,7 +452,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) seq_time(m, lt->min); seq_time(m, lt->max); seq_time(m, lt->total); - seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); + seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0); } static void seq_stats(struct seq_file *m, struct lock_stat_data *data) @@ -620,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file) if (!res) { struct lock_stat_data *iter = data->stats; struct seq_file *m = file->private_data; + unsigned long idx; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; iter->class = class; iter->stats = lock_stats(class); iter++; } + data->iter_end = iter; sort(data->stats, data->iter_end - data->stats, @@ -643,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct lock_class *class; + unsigned long idx; char c; if (count) { @@ -652,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, if (c != '0') return count; - list_for_each_entry(class, &all_lock_classes, lock_entry) + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; clear_lock_stats(class); + } } return count; } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..415d81e6ce70 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -27,45 +27,105 @@ #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/slab.h> -#include <linux/percpu-rwsem.h> #include <linux/torture.h> +#include <linux/reboot.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); -torture_param(int, nwriters_stress, -1, - "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, - "Number of read-locking stress-test threads"); +torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies)."); +torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable)."); +torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); +torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); -torture_param(int, shuffle_interval, 3, - "Number of jiffies between shuffles, 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, rt_boost, 2, + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); +torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); +torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(int, verbose, 1, - "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); +torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); +/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ +#define MAX_NESTED_LOCKS 8 -static char *torture_type = "spin_lock"; +static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); +static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs. +static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs. + +// Parse a cpumask kernel parameter. If there are more users later on, +// this might need to got to a more central location. +static int param_set_cpumask(const char *val, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + int ret; + char *s; + + if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) { + s = "Out of memory"; + ret = -ENOMEM; + goto out_err; + } + ret = cpulist_parse(val, *cm_bind); + if (!ret) + return ret; + s = "Bad CPU range"; +out_err: + pr_warn("%s: %s, all CPUs set\n", kp->name, s); + cpumask_setall(*cm_bind); + return ret; +} + +// Output a cpumask kernel parameter. +static int param_get_cpumask(char *buffer, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + + return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind)); +} + +static bool cpumask_nonempty(cpumask_var_t mask) +{ + return cpumask_available(mask) && !cpumask_empty(mask); +} + +static const struct kernel_param_ops lt_bind_ops = { + .set = param_set_cpumask, + .get = param_get_cpumask, +}; + +module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0644); +module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0644); + +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); + static struct task_struct *stats_task; static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; static bool lock_is_write_held; -static bool lock_is_read_held; +static atomic_t lock_is_read_held; +static unsigned long last_lock_release; struct lock_stress_stats { long n_lock_fail; long n_lock_acquired; }; +struct call_rcu_chain { + struct rcu_head crc_rh; + bool crc_stop; +}; +struct call_rcu_chain *call_rcu_chain_list; + /* Forward reference. */ static void lock_torture_cleanup(void); @@ -74,13 +134,16 @@ static void lock_torture_cleanup(void); */ struct lock_torture_ops { void (*init)(void); - int (*writelock)(void); + void (*exit)(void); + int (*nested_lock)(int tid, u32 lockset); + int (*writelock)(int tid); void (*write_delay)(struct torture_random_state *trsp); void (*task_boost)(struct torture_random_state *trsp); - void (*writeunlock)(void); - int (*readlock)(void); + void (*writeunlock)(int tid); + void (*nested_unlock)(int tid, u32 lockset); + int (*readlock)(int tid); void (*read_delay)(struct torture_random_state *trsp); - void (*readunlock)(void); + void (*readunlock)(int tid); unsigned long flags; /* for irq spinlocks */ const char *name; @@ -90,49 +153,82 @@ struct lock_torture_cxt { int nrealwriters_stress; int nrealreaders_stress; bool debug_lock; + bool init_called; atomic_t n_lock_torture_errors; struct lock_torture_ops *cur_ops; struct lock_stress_stats *lwsa; /* writer statistics */ struct lock_stress_stats *lrsa; /* reader statistics */ }; -static struct lock_torture_cxt cxt = { 0, 0, false, +static struct lock_torture_cxt cxt = { 0, 0, false, false, ATOMIC_INIT(0), NULL, NULL}; /* * Definitions for lock torture testing. */ -static int torture_lock_busted_write_lock(void) +static int torture_lock_busted_write_lock(int tid __maybe_unused) { return 0; /* BUGGY, do not use in real life!!! */ } static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_lock_busted_write_unlock(void) +static void torture_lock_busted_write_unlock(int tid __maybe_unused) { /* BUGGY, do not use in real life!!! */ } -static void torture_boost_dummy(struct torture_random_state *trsp) +static void __torture_rt_boost(struct torture_random_state *trsp) { - /* Only rtmutexes care about priority */ + const unsigned int factor = rt_boost_factor; + + if (!rt_task(current)) { + /* + * Boost priority once every rt_boost_factor operations. When + * the task tries to take the lock, the rtmutex it will account + * for the new priority, and do any corresponding pi-dance. + */ + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { + sched_set_fifo(current); + } else /* common case, do nothing */ + return; + } else { + /* + * The task will remain boosted for another 10 * rt_boost_factor + * operations, then restored back to its original prio, and so + * forth. + * + * When @trsp is nil, we want to force-reset the task for + * stopping the kthread. + */ + if (!trsp || !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor * 2))) { + sched_set_normal(current, 0); + } else /* common case, do nothing */ + return; + } +} + +static void torture_rt_boost(struct torture_random_state *trsp) +{ + if (rt_boost != 2) + return; + + __torture_rt_boost(trsp); } static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_lock_busted_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -142,7 +238,8 @@ static struct lock_torture_ops lock_busted_ops = { static DEFINE_SPINLOCK(torture_spinlock); -static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) +static int torture_spin_lock_write_lock(int tid __maybe_unused) +__acquires(torture_spinlock) { spin_lock(&torture_spinlock); return 0; @@ -151,22 +248,24 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) { + j = jiffies; + mdelay(long_hold); + pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); + } + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) +static void torture_spin_lock_write_unlock(int tid __maybe_unused) +__releases(torture_spinlock) { spin_unlock(&torture_spinlock); } @@ -174,7 +273,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_spin_lock_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -182,7 +281,7 @@ static struct lock_torture_ops spin_lock_ops = { .name = "spin_lock" }; -static int torture_spin_lock_write_lock_irq(void) +static int torture_spin_lock_write_lock_irq(int tid __maybe_unused) __acquires(torture_spinlock) { unsigned long flags; @@ -192,7 +291,7 @@ __acquires(torture_spinlock) return 0; } -static void torture_lock_spin_write_unlock_irq(void) +static void torture_lock_spin_write_unlock_irq(int tid __maybe_unused) __releases(torture_spinlock) { spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags); @@ -201,7 +300,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_lock_spin_write_unlock_irq, .readlock = NULL, .read_delay = NULL, @@ -209,9 +308,63 @@ static struct lock_torture_ops spin_lock_irq_ops = { .name = "spin_lock_irq" }; +static DEFINE_RAW_SPINLOCK(torture_raw_spinlock); + +static int torture_raw_spin_lock_write_lock(int tid __maybe_unused) +__acquires(torture_raw_spinlock) +{ + raw_spin_lock(&torture_raw_spinlock); + return 0; +} + +static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused) +__releases(torture_raw_spinlock) +{ + raw_spin_unlock(&torture_raw_spinlock); +} + +static struct lock_torture_ops raw_spin_lock_ops = { + .writelock = torture_raw_spin_lock_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_spin_lock_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_spin_lock" +}; + +static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused) +__acquires(torture_raw_spinlock) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&torture_raw_spinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused) +__releases(torture_raw_spinlock) +{ + raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_spin_lock_irq_ops = { + .writelock = torture_raw_spin_lock_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_spin_lock_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_spin_lock_irq" +}; + static DEFINE_RWLOCK(torture_rwlock); -static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) +static int torture_rwlock_write_lock(int tid __maybe_unused) +__acquires(torture_rwlock) { write_lock(&torture_rwlock); return 0; @@ -220,24 +373,24 @@ static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } -static void torture_rwlock_write_unlock(void) __releases(torture_rwlock) +static void torture_rwlock_write_unlock(int tid __maybe_unused) +__releases(torture_rwlock) { write_unlock(&torture_rwlock); } -static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) +static int torture_rwlock_read_lock(int tid __maybe_unused) +__acquires(torture_rwlock) { read_lock(&torture_rwlock); return 0; @@ -246,19 +399,18 @@ static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) static void torture_rwlock_read_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 10; - const unsigned long longdelay_ms = 100; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealreaders_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } -static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) +static void torture_rwlock_read_unlock(int tid __maybe_unused) +__releases(torture_rwlock) { read_unlock(&torture_rwlock); } @@ -266,7 +418,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) static struct lock_torture_ops rw_lock_ops = { .writelock = torture_rwlock_write_lock, .write_delay = torture_rwlock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_rwlock_write_unlock, .readlock = torture_rwlock_read_lock, .read_delay = torture_rwlock_read_delay, @@ -274,7 +426,8 @@ static struct lock_torture_ops rw_lock_ops = { .name = "rw_lock" }; -static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) +static int torture_rwlock_write_lock_irq(int tid __maybe_unused) +__acquires(torture_rwlock) { unsigned long flags; @@ -283,13 +436,14 @@ static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) return 0; } -static void torture_rwlock_write_unlock_irq(void) +static void torture_rwlock_write_unlock_irq(int tid __maybe_unused) __releases(torture_rwlock) { write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); } -static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) +static int torture_rwlock_read_lock_irq(int tid __maybe_unused) +__acquires(torture_rwlock) { unsigned long flags; @@ -298,7 +452,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) return 0; } -static void torture_rwlock_read_unlock_irq(void) +static void torture_rwlock_read_unlock_irq(int tid __maybe_unused) __releases(torture_rwlock) { read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); @@ -307,7 +461,7 @@ __releases(torture_rwlock) static struct lock_torture_ops rw_lock_irq_ops = { .writelock = torture_rwlock_write_lock_irq, .write_delay = torture_rwlock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_rwlock_write_unlock_irq, .readlock = torture_rwlock_read_lock_irq, .read_delay = torture_rwlock_read_delay, @@ -316,8 +470,31 @@ static struct lock_torture_ops rw_lock_irq_ops = { }; static DEFINE_MUTEX(torture_mutex); +static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS]; +static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS]; + +static void torture_mutex_init(void) +{ + int i; + + for (i = 0; i < MAX_NESTED_LOCKS; i++) + __mutex_init(&torture_nested_mutexes[i], __func__, + &nested_mutex_keys[i]); +} + +static int torture_mutex_nested_lock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = 0; i < nested_locks; i++) + if (lockset & (1 << i)) + mutex_lock(&torture_nested_mutexes[i]); + return 0; +} -static int torture_mutex_lock(void) __acquires(torture_mutex) +static int torture_mutex_lock(int tid __maybe_unused) +__acquires(torture_mutex) { mutex_lock(&torture_mutex); return 0; @@ -325,28 +502,37 @@ static int torture_mutex_lock(void) __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 5); - else - mdelay(longdelay_ms / 5); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_mutex_unlock(void) __releases(torture_mutex) +static void torture_mutex_unlock(int tid __maybe_unused) +__releases(torture_mutex) { mutex_unlock(&torture_mutex); } +static void torture_mutex_nested_unlock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = nested_locks - 1; i >= 0; i--) + if (lockset & (1 << i)) + mutex_unlock(&torture_nested_mutexes[i]); +} + static struct lock_torture_ops mutex_lock_ops = { + .init = torture_mutex_init, + .nested_lock = torture_mutex_nested_lock, .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_mutex_unlock, + .nested_unlock = torture_mutex_nested_unlock, .readlock = NULL, .read_delay = NULL, .readunlock = NULL, @@ -354,12 +540,34 @@ static struct lock_torture_ops mutex_lock_ops = { }; #include <linux/ww_mutex.h> +/* + * The torture ww_mutexes should belong to the same lock class as + * torture_ww_class to avoid lockdep problem. The ww_mutex_init() + * function is called for initialization to ensure that. + */ static DEFINE_WD_CLASS(torture_ww_class); -static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); -static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); -static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); +static struct ww_mutex torture_ww_mutex_0, torture_ww_mutex_1, torture_ww_mutex_2; +static struct ww_acquire_ctx *ww_acquire_ctxs; -static int torture_ww_mutex_lock(void) +static void torture_ww_mutex_init(void) +{ + ww_mutex_init(&torture_ww_mutex_0, &torture_ww_class); + ww_mutex_init(&torture_ww_mutex_1, &torture_ww_class); + ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class); + + ww_acquire_ctxs = kmalloc_array(cxt.nrealwriters_stress, + sizeof(*ww_acquire_ctxs), + GFP_KERNEL); + if (!ww_acquire_ctxs) + VERBOSE_TOROUT_STRING("ww_acquire_ctx: Out of memory"); +} + +static void torture_ww_mutex_exit(void) +{ + kfree(ww_acquire_ctxs); +} + +static int torture_ww_mutex_lock(int tid) __acquires(torture_ww_mutex_0) __acquires(torture_ww_mutex_1) __acquires(torture_ww_mutex_2) @@ -369,7 +577,7 @@ __acquires(torture_ww_mutex_2) struct list_head link; struct ww_mutex *lock; } locks[3], *ll, *ln; - struct ww_acquire_ctx ctx; + struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid]; locks[0].lock = &torture_ww_mutex_0; list_add(&locks[0].link, &list); @@ -380,12 +588,12 @@ __acquires(torture_ww_mutex_2) locks[2].lock = &torture_ww_mutex_2; list_add(&locks[2].link, &list); - ww_acquire_init(&ctx, &torture_ww_class); + ww_acquire_init(ctx, &torture_ww_class); list_for_each_entry(ll, &list, link) { int err; - err = ww_mutex_lock(ll->lock, &ctx); + err = ww_mutex_lock(ll->lock, ctx); if (!err) continue; @@ -396,28 +604,32 @@ __acquires(torture_ww_mutex_2) if (err != -EDEADLK) return err; - ww_mutex_lock_slow(ll->lock, &ctx); + ww_mutex_lock_slow(ll->lock, ctx); list_move(&ll->link, &list); } - ww_acquire_fini(&ctx); return 0; } -static void torture_ww_mutex_unlock(void) +static void torture_ww_mutex_unlock(int tid) __releases(torture_ww_mutex_0) __releases(torture_ww_mutex_1) __releases(torture_ww_mutex_2) { + struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid]; + ww_mutex_unlock(&torture_ww_mutex_0); ww_mutex_unlock(&torture_ww_mutex_1); ww_mutex_unlock(&torture_ww_mutex_2); + ww_acquire_fini(ctx); } static struct lock_torture_ops ww_mutex_lock_ops = { + .init = torture_ww_mutex_init, + .exit = torture_ww_mutex_exit, .writelock = torture_ww_mutex_lock, .write_delay = torture_mutex_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_ww_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -427,79 +639,85 @@ static struct lock_torture_ops ww_mutex_lock_ops = { #ifdef CONFIG_RT_MUTEXES static DEFINE_RT_MUTEX(torture_rtmutex); +static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS]; +static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS]; -static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) +static void torture_rtmutex_init(void) { - rt_mutex_lock(&torture_rtmutex); - return 0; + int i; + + for (i = 0; i < MAX_NESTED_LOCKS; i++) + __rt_mutex_init(&torture_nested_rtmutexes[i], __func__, + &nested_rtmutex_keys[i]); } -static void torture_rtmutex_boost(struct torture_random_state *trsp) +static int torture_rtmutex_nested_lock(int tid __maybe_unused, + u32 lockset) { - int policy; - struct sched_param param; - const unsigned int factor = 50000; /* yes, quite arbitrary */ + int i; - if (!rt_task(current)) { - /* - * Boost priority once every ~50k operations. When the - * task tries to take the lock, the rtmutex it will account - * for the new priority, and do any corresponding pi-dance. - */ - if (trsp && !(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { - policy = SCHED_FIFO; - param.sched_priority = MAX_RT_PRIO - 1; - } else /* common case, do nothing */ - return; - } else { - /* - * The task will remain boosted for another ~500k operations, - * then restored back to its original prio, and so forth. - * - * When @trsp is nil, we want to force-reset the task for - * stopping the kthread. - */ - if (!trsp || !(torture_random(trsp) % - (cxt.nrealwriters_stress * factor * 2))) { - policy = SCHED_NORMAL; - param.sched_priority = 0; - } else /* common case, do nothing */ - return; - } + for (i = 0; i < nested_locks; i++) + if (lockset & (1 << i)) + rt_mutex_lock(&torture_nested_rtmutexes[i]); + return 0; +} - sched_setscheduler_nocheck(current, policy, ¶m); +static int torture_rtmutex_lock(int tid __maybe_unused) +__acquires(torture_rtmutex) +{ + rt_mutex_lock(&torture_rtmutex); + return 0; } static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; /* * We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) +static void torture_rtmutex_unlock(int tid __maybe_unused) +__releases(torture_rtmutex) { rt_mutex_unlock(&torture_rtmutex); } +static void torture_rt_boost_rtmutex(struct torture_random_state *trsp) +{ + if (!rt_boost) + return; + + __torture_rt_boost(trsp); +} + +static void torture_rtmutex_nested_unlock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = nested_locks - 1; i >= 0; i--) + if (lockset & (1 << i)) + rt_mutex_unlock(&torture_nested_rtmutexes[i]); +} + static struct lock_torture_ops rtmutex_lock_ops = { + .init = torture_rtmutex_init, + .nested_lock = torture_rtmutex_nested_lock, .writelock = torture_rtmutex_lock, .write_delay = torture_rtmutex_delay, - .task_boost = torture_rtmutex_boost, + .task_boost = torture_rt_boost_rtmutex, .writeunlock = torture_rtmutex_unlock, + .nested_unlock = torture_rtmutex_nested_unlock, .readlock = NULL, .read_delay = NULL, .readunlock = NULL, @@ -508,7 +726,8 @@ static struct lock_torture_ops rtmutex_lock_ops = { #endif static DECLARE_RWSEM(torture_rwsem); -static int torture_rwsem_down_write(void) __acquires(torture_rwsem) +static int torture_rwsem_down_write(int tid __maybe_unused) +__acquires(torture_rwsem) { down_write(&torture_rwsem); return 0; @@ -516,24 +735,21 @@ static int torture_rwsem_down_write(void) __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 10); - else - mdelay(longdelay_ms / 10); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_rwsem_up_write(void) __releases(torture_rwsem) +static void torture_rwsem_up_write(int tid __maybe_unused) +__releases(torture_rwsem) { up_write(&torture_rwsem); } -static int torture_rwsem_down_read(void) __acquires(torture_rwsem) +static int torture_rwsem_down_read(int tid __maybe_unused) +__acquires(torture_rwsem) { down_read(&torture_rwsem); return 0; @@ -541,19 +757,17 @@ static int torture_rwsem_down_read(void) __acquires(torture_rwsem) static void torture_rwsem_read_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealreaders_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 2); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold * 2); else - mdelay(longdelay_ms / 2); + mdelay(long_hold / 2); if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_rwsem_up_read(void) __releases(torture_rwsem) +static void torture_rwsem_up_read(int tid __maybe_unused) +__releases(torture_rwsem) { up_read(&torture_rwsem); } @@ -561,7 +775,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem) static struct lock_torture_ops rwsem_lock_ops = { .writelock = torture_rwsem_down_write, .write_delay = torture_rwsem_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_rwsem_up_write, .readlock = torture_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -572,38 +786,48 @@ static struct lock_torture_ops rwsem_lock_ops = { #include <linux/percpu-rwsem.h> static struct percpu_rw_semaphore pcpu_rwsem; -void torture_percpu_rwsem_init(void) +static void torture_percpu_rwsem_init(void) { BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } -static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) +static void torture_percpu_rwsem_exit(void) +{ + percpu_free_rwsem(&pcpu_rwsem); +} + +static int torture_percpu_rwsem_down_write(int tid __maybe_unused) +__acquires(pcpu_rwsem) { percpu_down_write(&pcpu_rwsem); return 0; } -static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) +static void torture_percpu_rwsem_up_write(int tid __maybe_unused) +__releases(pcpu_rwsem) { percpu_up_write(&pcpu_rwsem); } -static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) +static int torture_percpu_rwsem_down_read(int tid __maybe_unused) +__acquires(pcpu_rwsem) { percpu_down_read(&pcpu_rwsem); return 0; } -static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) +static void torture_percpu_rwsem_up_read(int tid __maybe_unused) +__releases(pcpu_rwsem) { percpu_up_read(&pcpu_rwsem); } static struct lock_torture_ops percpu_rwsem_lock_ops = { .init = torture_percpu_rwsem_init, + .exit = torture_percpu_rwsem_exit, .writelock = torture_percpu_rwsem_down_write, .write_delay = torture_rwsem_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_percpu_rwsem_up_write, .readlock = torture_percpu_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -617,28 +841,63 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { */ static int lock_torture_writer(void *arg) { + unsigned long j; + unsigned long j1; + u32 lockset_mask; struct lock_stress_stats *lwsp = arg; DEFINE_TORTURE_RANDOM(rand); + bool skip_main_lock; + int tid = lwsp - cxt.lwsa; VERBOSE_TOROUT_STRING("lock_torture_writer task started"); - set_user_nice(current, MAX_NICE); + if (!rt_task(current)) + set_user_nice(current, MAX_NICE); do { if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); + lockset_mask = torture_random(&rand); + /* + * When using nested_locks, we want to occasionally + * skip the main lock so we can avoid always serializing + * the lock chains on that central lock. By skipping the + * main lock occasionally, we can create different + * contention patterns (allowing for multiple disjoint + * blocked trees) + */ + skip_main_lock = (nested_locks && + !(torture_random(&rand) % 100)); + cxt.cur_ops->task_boost(&rand); - cxt.cur_ops->writelock(); - if (WARN_ON_ONCE(lock_is_write_held)) - lwsp->n_lock_fail++; - lock_is_write_held = 1; - if (WARN_ON_ONCE(lock_is_read_held)) - lwsp->n_lock_fail++; /* rare, but... */ + if (cxt.cur_ops->nested_lock) + cxt.cur_ops->nested_lock(tid, lockset_mask); + + if (!skip_main_lock) { + if (acq_writer_lim > 0) + j = jiffies; + cxt.cur_ops->writelock(tid); + if (WARN_ON_ONCE(lock_is_write_held)) + lwsp->n_lock_fail++; + lock_is_write_held = true; + if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) + lwsp->n_lock_fail++; /* rare, but... */ + if (acq_writer_lim > 0) { + j1 = jiffies; + WARN_ONCE(time_after(j1, j + acq_writer_lim), + "%s: Lock acquisition took %lu jiffies.\n", + __func__, j1 - j); + } + lwsp->n_lock_acquired++; + + cxt.cur_ops->write_delay(&rand); - lwsp->n_lock_acquired++; - cxt.cur_ops->write_delay(&rand); - lock_is_write_held = 0; - cxt.cur_ops->writeunlock(); + lock_is_write_held = false; + WRITE_ONCE(last_lock_release, jiffies); + cxt.cur_ops->writeunlock(tid); + } + if (cxt.cur_ops->nested_unlock) + cxt.cur_ops->nested_unlock(tid, lockset_mask); stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); @@ -655,6 +914,7 @@ static int lock_torture_writer(void *arg) static int lock_torture_reader(void *arg) { struct lock_stress_stats *lrsp = arg; + int tid = lrsp - cxt.lrsa; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_reader task started"); @@ -664,15 +924,15 @@ static int lock_torture_reader(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cxt.cur_ops->readlock(); - lock_is_read_held = 1; + cxt.cur_ops->readlock(tid); + atomic_inc(&lock_is_read_held); if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = 0; - cxt.cur_ops->readunlock(); + atomic_dec(&lock_is_read_held); + cxt.cur_ops->readunlock(tid); stutter_wait("lock_torture_reader"); } while (!torture_must_stop()); @@ -686,20 +946,22 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { - bool fail = 0; + long cur; + bool fail = false; int i, n_stress; - long max = 0, min = statp ? statp[0].n_lock_acquired : 0; + long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0; long long sum = 0; n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; for (i = 0; i < n_stress; i++) { - if (statp[i].n_lock_fail) + if (data_race(statp[i].n_lock_fail)) fail = true; - sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_acquired) - max = statp[i].n_lock_acquired; - if (min > statp[i].n_lock_acquired) - min = statp[i].n_lock_acquired; + cur = data_race(statp[i].n_lock_acquired); + sum += cur; + if (max < cur) + max = cur; + if (min > cur) + min = cur; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", @@ -771,16 +1033,69 @@ static int lock_torture_stats(void *arg) return 0; } + static inline void lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { + static cpumask_t cpumask_all; + cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all; + cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all; + + cpumask_setall(&cpumask_all); pr_alert("%s" TORTURE_FLAG - "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", - cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval, - verbose, shuffle_interval, stutter, shutdown_secs, - onoff_interval, onoff_holdoff); + acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp), + call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress, + cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost, + rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter, + verbose, writer_fifo); +} + +// If requested, maintain call_rcu() chains to keep a grace period always +// in flight. These increase the probability of getting an RCU CPU stall +// warning and associated diagnostics when a locking primitive stalls. + +static void call_rcu_chain_cb(struct rcu_head *rhp) +{ + struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh); + + if (!smp_load_acquire(&crcp->crc_stop)) { + (void)start_poll_synchronize_rcu(); // Start one grace period... + call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another. + } +} + +// Start the requested number of call_rcu() chains. +static int call_rcu_chain_init(void) +{ + int i; + + if (call_rcu_chains <= 0) + return 0; + call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL); + if (!call_rcu_chain_list) + return -ENOMEM; + for (i = 0; i < call_rcu_chains; i++) { + call_rcu_chain_list[i].crc_stop = false; + call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb); + } + return 0; +} + +// Stop all of the call_rcu() chains. +static void call_rcu_chain_cleanup(void) +{ + int i; + + if (!call_rcu_chain_list) + return; + for (i = 0; i < call_rcu_chains; i++) + smp_store_release(&call_rcu_chain_list[i].crc_stop, true); + rcu_barrier(); + kfree(call_rcu_chain_list); + call_rcu_chain_list = NULL; } static void lock_torture_cleanup(void) @@ -792,17 +1107,17 @@ static void lock_torture_cleanup(void) /* * Indicates early cleanup, meaning that the test has not run, - * such as when passing bogus args when loading the module. As - * such, only perform the underlying torture-specific cleanups, - * and avoid anything related to locktorture. + * such as when passing bogus args when loading the module. + * However cxt->cur_ops.init() may have been invoked, so beside + * perform the underlying torture-specific cleanups, cur_ops.exit() + * will be invoked if needed. */ if (!cxt.lwsa && !cxt.lrsa) goto end; if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) - torture_stop_kthread(lock_torture_writer, - writer_tasks[i]); + torture_stop_kthread(lock_torture_writer, writer_tasks[i]); kfree(writer_tasks); writer_tasks = NULL; } @@ -833,7 +1148,14 @@ static void lock_torture_cleanup(void) kfree(cxt.lrsa); cxt.lrsa = NULL; + call_rcu_chain_cleanup(); + end: + if (cxt.init_called) { + if (cxt.cur_ops->exit) + cxt.cur_ops->exit(); + cxt.init_called = false; + } torture_cleanup_end(); } @@ -844,6 +1166,7 @@ static int __init lock_torture_init(void) static struct lock_torture_ops *torture_ops[] = { &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + &raw_spin_lock_ops, &raw_spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, &ww_mutex_lock_ops, @@ -874,20 +1197,23 @@ static int __init lock_torture_init(void) goto unwind; } - if (nwriters_stress == 0 && nreaders_stress == 0) { + if (nwriters_stress == 0 && + (!cxt.cur_ops->readlock || nreaders_stress == 0)) { pr_alert("lock-torture: must run at least one locking thread\n"); firsterr = -EINVAL; goto unwind; } - if (cxt.cur_ops->init) - cxt.cur_ops->init(); - if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; else cxt.nrealwriters_stress = 2 * num_online_cpus(); + if (cxt.cur_ops->init) { + cxt.cur_ops->init(); + cxt.init_called = true; + } + #ifdef CONFIG_DEBUG_MUTEXES if (str_has_prefix(torture_type, "mutex")) cxt.debug_lock = true; @@ -904,7 +1230,7 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { - lock_is_write_held = 0; + lock_is_write_held = false; cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, sizeof(*cxt.lwsa), GFP_KERNEL); @@ -935,7 +1261,6 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = 0; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); @@ -954,29 +1279,33 @@ static int __init lock_torture_init(void) } } + firsterr = call_rcu_chain_init(); + if (torture_init_error(firsterr)) + goto unwind; + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ, NULL); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (shutdown_secs > 0) { firsterr = torture_shutdown_init(shutdown_secs, lock_torture_cleanup); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stutter > 0) { firsterr = torture_stutter_init(stutter, stutter); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } @@ -985,18 +1314,22 @@ static int __init lock_torture_init(void) sizeof(writer_tasks[0]), GFP_KERNEL); if (writer_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + TOROUT_ERRSTRING("writer_tasks: Out of memory"); firsterr = -ENOMEM; goto unwind; } } + /* cap nested_locks to MAX_NESTED_LOCKS */ + if (nested_locks > MAX_NESTED_LOCKS) + nested_locks = MAX_NESTED_LOCKS; + if (cxt.cur_ops->readlock) { reader_tasks = kcalloc(cxt.nrealreaders_stress, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + TOROUT_ERRSTRING("reader_tasks: Out of memory"); kfree(writer_tasks); writer_tasks = NULL; firsterr = -ENOMEM; @@ -1018,10 +1351,13 @@ static int __init lock_torture_init(void) goto create_reader; /* Create writer. */ - firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], - writer_tasks[i]); - if (firsterr) + firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i], + writer_tasks[i], + writer_fifo ? sched_set_fifo : NULL); + if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(bind_writers)) + torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers); create_reader: if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) @@ -1029,13 +1365,15 @@ static int __init lock_torture_init(void) /* Create reader. */ firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j], reader_tasks[j]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(bind_readers)) + torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers); } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, stats_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } torture_init_end(); @@ -1044,6 +1382,10 @@ static int __init lock_torture_init(void) unwind: torture_init_end(); lock_torture_cleanup(); + if (shutdown_secs) { + WARN_ON(!IS_MODULE(CONFIG_LOCK_TORTURE_TEST)); + kernel_power_off(); + } return firsterr; } diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5e10153b4d3c..85251d8771d9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -7,7 +7,7 @@ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock * with the desirable properties of being fair, and with each cpu trying * to acquire the lock spinning on a local variable. - * It avoids expensive cache bouncings that common test-and-set spin-lock + * It avoids expensive cache bounces that common test-and-set spin-lock * implementations incur. */ #ifndef __LINUX_MCS_SPINLOCK_H diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index a7276aaf2abc..bc8abb8549d2 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -1,6 +1,4 @@ /* - * kernel/mutex-debug.c - * * Debugging code for mutexes * * Started by Ingo Molnar: @@ -22,7 +20,7 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> -#include "mutex-debug.h" +#include "mutex.h" /* * Must be called with lock->wait_lock held. @@ -32,6 +30,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; INIT_LIST_HEAD(&waiter->list); + waiter->ww_ctx = MUTEX_POISON_WW_CTX; } void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) @@ -57,7 +56,7 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, task->blocked_on = waiter; } -void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, +void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); @@ -65,7 +64,7 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); task->blocked_on = NULL; - list_del_init(&waiter->list); + INIT_LIST_HEAD(&waiter->list); waiter->task = NULL; } diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h deleted file mode 100644 index 1edd3f45a4ec..000000000000 --- a/kernel/locking/mutex-debug.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Mutexes: blocking mutual exclusion locks - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * - * This file contains mutex debugging related internal declarations, - * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. - * More details are in kernel/mutex-debug.c. - */ - -/* - * This must be called with lock->wait_lock held. - */ -extern void debug_mutex_lock_common(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_wake_waiter(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); -extern void debug_mutex_add_waiter(struct mutex *lock, - struct mutex_waiter *waiter, - struct task_struct *task); -extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct task_struct *task); -extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5352ce50a97e..cbae8c0b89ab 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -30,17 +30,23 @@ #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#define CREATE_TRACE_POINTS +#include <trace/events/lock.h> + +#ifndef CONFIG_PREEMPT_RT +#include "mutex.h" + #ifdef CONFIG_DEBUG_MUTEXES -# include "mutex-debug.h" +# define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond) #else -# include "mutex.h" +# define MUTEX_WARN_ON(cond) #endif void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { atomic_long_set(&lock->owner, 0); - spin_lock_init(&lock->wait_lock); + raw_spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); @@ -86,70 +92,64 @@ bool mutex_is_locked(struct mutex *lock) } EXPORT_SYMBOL(mutex_is_locked); -__must_check enum mutex_trylock_recursive_enum -mutex_trylock_recursive(struct mutex *lock) -{ - if (unlikely(__mutex_owner(lock) == current)) - return MUTEX_TRYLOCK_RECURSIVE; - - return mutex_trylock(lock); -} -EXPORT_SYMBOL(mutex_trylock_recursive); - static inline unsigned long __owner_flags(unsigned long owner) { return owner & MUTEX_FLAGS; } /* - * Trylock variant that retuns the owning task on failure. + * Returns: __mutex_owner(lock) on failure or NULL on success. */ -static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) +static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ - unsigned long old, flags = __owner_flags(owner); + unsigned long flags = __owner_flags(owner); unsigned long task = owner & ~MUTEX_FLAGS; if (task) { - if (likely(task != curr)) - break; - - if (likely(!(flags & MUTEX_FLAG_PICKUP))) + if (flags & MUTEX_FLAG_PICKUP) { + if (task != curr) + break; + flags &= ~MUTEX_FLAG_PICKUP; + } else if (handoff) { + if (flags & MUTEX_FLAG_HANDOFF) + break; + flags |= MUTEX_FLAG_HANDOFF; + } else { break; - - flags &= ~MUTEX_FLAG_PICKUP; + } } else { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP)); + task = curr; } - /* - * We set the HANDOFF bit, we must make sure it doesn't live - * past the point where we acquire it. This would be possible - * if we (accidentally) set the bit on an unlocked mutex. - */ - flags &= ~MUTEX_FLAG_HANDOFF; - - old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); - if (old == owner) - return NULL; - - owner = old; + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) { + if (task == curr) + return NULL; + break; + } } return __owner_task(owner); } /* + * Trylock or set HANDOFF + */ +static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff) +{ + return !__mutex_trylock_common(lock, handoff); +} + +/* * Actual trylock that will work on any unlocked state. */ static inline bool __mutex_trylock(struct mutex *lock) { - return !__mutex_trylock_or_owner(lock); + return !__mutex_trylock_common(lock, false); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -178,10 +178,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) { unsigned long curr = (unsigned long)current; - if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) - return true; - - return false; + return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); } #endif @@ -204,7 +201,7 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait * Add @waiter to a given location in the lock wait_list and set the * FLAG_WAITERS flag if it's the first waiter. */ -static void __sched +static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { @@ -215,9 +212,19 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); } +static void +__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) +{ + list_del(&waiter->list); + if (likely(list_empty(&lock->wait_list))) + __mutex_clear_flag(lock, MUTEX_FLAGS); + + debug_mutex_remove_waiter(lock, waiter, current); +} + /* * Give up ownership to a specific task, when @task = NULL, this is equivalent - * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves + * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves * WAITERS. Provides RELEASE semantics like a regular unlock, the * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff. */ @@ -226,23 +233,18 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) unsigned long owner = atomic_long_read(&lock->owner); for (;;) { - unsigned long old, new; + unsigned long new; -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); - DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(__owner_task(owner) != current); + MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); new = (owner & MUTEX_FLAG_WAITERS); new |= (unsigned long)task; if (task) new |= MUTEX_FLAG_PICKUP; - old = atomic_long_cmpxchg_release(&lock->owner, owner, new); - if (old == owner) + if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new)) break; - - owner = old; } } @@ -286,218 +288,18 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -/* - * Wait-Die: - * The newer transactions are killed when: - * It (the new transaction) makes a request for a lock being held - * by an older transaction. - * - * Wound-Wait: - * The newer transactions are wounded when: - * An older transaction makes a request for a lock being held by - * the newer transaction. - */ - -/* - * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired - * it. - */ -static __always_inline void -ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) -{ -#ifdef CONFIG_DEBUG_MUTEXES - /* - * If this WARN_ON triggers, you used ww_mutex_lock to acquire, - * but released with a normal mutex_unlock in this call. - * - * This should never happen, always use ww_mutex_unlock. - */ - DEBUG_LOCKS_WARN_ON(ww->ctx); - - /* - * Not quite done after calling ww_acquire_done() ? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); - - if (ww_ctx->contending_lock) { - /* - * After -EDEADLK you tried to - * acquire a different ww_mutex? Bad! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - - /* - * You called ww_mutex_lock after receiving -EDEADLK, - * but 'forgot' to unlock everything else first? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); - ww_ctx->contending_lock = NULL; - } - - /* - * Naughty, using a different class will lead to undefined behavior! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); -#endif - ww_ctx->acquired++; - ww->ctx = ww_ctx; -} +#include "ww_mutex.h" -/* - * Determine if context @a is 'after' context @b. IOW, @a is a younger - * transaction than @b and depending on algorithm either needs to wait for - * @b or die. - */ -static inline bool __sched -__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) -{ - - return (signed long)(a->stamp - b->stamp) > 0; -} - -/* - * Wait-Die; wake a younger waiter context (when locks held) such that it can - * die. - * - * Among waiters with context, only the first one can have other locks acquired - * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and - * __ww_mutex_check_kill() wake any but the earliest context. - */ -static bool __sched -__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ww_ctx) -{ - if (!ww_ctx->is_wait_die) - return false; - - if (waiter->ww_ctx->acquired > 0 && - __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { - debug_mutex_wake_waiter(lock, waiter); - wake_up_process(waiter->task); - } - - return true; -} - -/* - * Wound-Wait; wound a younger @hold_ctx if it holds the lock. - * - * Wound the lock holder if there are waiters with older transactions than - * the lock holders. Even if multiple waiters may wound the lock holder, - * it's sufficient that only one does. - */ -static bool __ww_mutex_wound(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - struct ww_acquire_ctx *hold_ctx) -{ - struct task_struct *owner = __mutex_owner(lock); - - lockdep_assert_held(&lock->wait_lock); - - /* - * Possible through __ww_mutex_add_waiter() when we race with - * ww_mutex_set_context_fastpath(). In that case we'll get here again - * through __ww_mutex_check_waiters(). - */ - if (!hold_ctx) - return false; - - /* - * Can have !owner because of __mutex_unlock_slowpath(), but if owner, - * it cannot go away because we'll have FLAG_WAITERS set and hold - * wait_lock. - */ - if (!owner) - return false; - - if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) { - hold_ctx->wounded = 1; - - /* - * wake_up_process() paired with set_current_state() - * inserts sufficient barriers to make sure @owner either sees - * it's wounded in __ww_mutex_check_kill() or has a - * wakeup pending to re-read the wounded state. - */ - if (owner != current) - wake_up_process(owner); - - return true; - } - - return false; -} - -/* - * We just acquired @lock under @ww_ctx, if there are later contexts waiting - * behind us on the wait-list, check if they need to die, or wound us. - * - * See __ww_mutex_add_waiter() for the list-order construction; basically the - * list is ordered by stamp, smallest (oldest) first. - * - * This relies on never mixing wait-die/wound-wait on the same wait-list; - * which is currently ensured by that being a ww_class property. - * - * The current task must not be on the wait list. - */ -static void __sched -__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) -{ - struct mutex_waiter *cur; - - lockdep_assert_held(&lock->wait_lock); - - list_for_each_entry(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - if (__ww_mutex_die(lock, cur, ww_ctx) || - __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) - break; - } -} +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx - * and wake up any waiters so they can recheck. + * Trylock variant that returns the owning task on failure. */ -static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) { - ww_mutex_lock_acquired(lock, ctx); - - /* - * The lock->ctx update should be visible on all cores before - * the WAITERS check is done, otherwise contended waiters might be - * missed. The contended waiters will either see ww_ctx == NULL - * and keep spinning, or it will acquire wait_lock, add itself - * to waiter list and sleep. - */ - smp_mb(); /* See comments above and below. */ - - /* - * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS - * MB MB - * [R] MUTEX_FLAG_WAITERS [R] ww->ctx - * - * The memory barrier above pairs with the memory barrier in - * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx - * and/or !empty list. - */ - if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) - return; - - /* - * Uh oh, we raced in fastpath, check if any of the waiters need to - * die or wound us. - */ - spin_lock(&lock->base.wait_lock); - __ww_mutex_check_waiters(&lock->base, ctx); - spin_unlock(&lock->base.wait_lock); + return __mutex_trylock_common(lock, false); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - static inline bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter) @@ -552,21 +354,23 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, { bool ret = true; - rcu_read_lock(); + lockdep_assert_preemption_disabled(); + while (__mutex_owner(lock) == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. + * checking lock->owner still matches owner. And we already + * disabled preemption which is equal to the RCU read-side + * crital section in optimistic spinning code. Thus the + * task_strcut structure won't go away during the spinning + * period */ barrier(); /* * Use vcpu_is_preempted to detect lock holder preemption issue. */ - if (!owner->on_cpu || need_resched() || - vcpu_is_preempted(task_cpu(owner))) { + if (!owner_on_cpu(owner) || need_resched()) { ret = false; break; } @@ -578,7 +382,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, cpu_relax(); } - rcu_read_unlock(); return ret; } @@ -591,19 +394,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + lockdep_assert_preemption_disabled(); + if (need_resched()) return 0; - rcu_read_lock(); - owner = __mutex_owner(lock); - /* - * As lock holder preemption issue, we both skip spinning if task is not - * on cpu or its cpu is preempted + * We already disabled preemption which is equal to the RCU read-side + * crital section in optimistic spinning code. Thus the task_strcut + * structure won't go away during the spinning period. */ + owner = __mutex_owner(lock); if (owner) - retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); - rcu_read_unlock(); + retval = owner_on_cpu(owner); /* * If lock->owner is not set, the mutex has been released. Return true @@ -636,7 +439,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) */ static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, struct mutex_waiter *waiter) + struct mutex_waiter *waiter) { if (!waiter) { /* @@ -712,7 +515,7 @@ fail: #else static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, struct mutex_waiter *waiter) + struct mutex_waiter *waiter) { return false; } @@ -729,6 +532,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne * This function must not be used in interrupt context. Unlocking * of a not locked mutex is not allowed. * + * The caller must ensure that the mutex stays alive until this function has + * returned - mutex_unlock() can NOT directly be used to release an object such + * that another concurrent task can free it. + * Mutexes are different from spinlocks & refcounts in this aspect. + * * This function is similar to (but not equivalent to) up(). */ void __sched mutex_unlock(struct mutex *lock) @@ -754,192 +562,32 @@ EXPORT_SYMBOL(mutex_unlock); */ void __sched ww_mutex_unlock(struct ww_mutex *lock) { - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ - if (lock->ctx) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); -#endif - if (lock->ctx->acquired > 0) - lock->ctx->acquired--; - lock->ctx = NULL; - } - + __ww_mutex_unlock(lock); mutex_unlock(&lock->base); } EXPORT_SYMBOL(ww_mutex_unlock); - -static __always_inline int __sched -__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) -{ - if (ww_ctx->acquired > 0) { -#ifdef CONFIG_DEBUG_MUTEXES - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); - ww_ctx->contending_lock = ww; -#endif - return -EDEADLK; - } - - return 0; -} - - -/* - * Check the wound condition for the current lock acquire. - * - * Wound-Wait: If we're wounded, kill ourself. - * - * Wait-Die: If we're trying to acquire a lock already held by an older - * context, kill ourselves. - * - * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to - * look at waiters before us in the wait-list. - */ -static inline int __sched -__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ctx) -{ - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); - struct mutex_waiter *cur; - - if (ctx->acquired == 0) - return 0; - - if (!ctx->is_wait_die) { - if (ctx->wounded) - return __ww_mutex_kill(lock, ctx); - - return 0; - } - - if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) - return __ww_mutex_kill(lock, ctx); - - /* - * If there is a waiter in front of us that has a context, then its - * stamp is earlier than ours and we must kill ourself. - */ - cur = waiter; - list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - return __ww_mutex_kill(lock, ctx); - } - - return 0; -} - -/* - * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest - * first. Such that older contexts are preferred to acquire the lock over - * younger contexts. - * - * Waiters without context are interspersed in FIFO order. - * - * Furthermore, for Wait-Die kill ourself immediately when possible (there are - * older contexts already waiting) to avoid unnecessary waiting and for - * Wound-Wait ensure we wound the owning context when it is younger. - */ -static inline int __sched -__ww_mutex_add_waiter(struct mutex_waiter *waiter, - struct mutex *lock, - struct ww_acquire_ctx *ww_ctx) -{ - struct mutex_waiter *cur; - struct list_head *pos; - bool is_wait_die; - - if (!ww_ctx) { - __mutex_add_waiter(lock, waiter, &lock->wait_list); - return 0; - } - - is_wait_die = ww_ctx->is_wait_die; - - /* - * Add the waiter before the first waiter with a higher stamp. - * Waiters without a context are skipped to avoid starving - * them. Wait-Die waiters may die here. Wound-Wait waiters - * never die here, but they are sorted in stamp order and - * may wound the lock holder. - */ - pos = &lock->wait_list; - list_for_each_entry_reverse(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { - /* - * Wait-Die: if we find an older context waiting, there - * is no point in queueing behind it, as we'd have to - * die the moment it would acquire the lock. - */ - if (is_wait_die) { - int ret = __ww_mutex_kill(lock, ww_ctx); - - if (ret) - return ret; - } - - break; - } - - pos = &cur->list; - - /* Wait-Die: ensure younger waiters die. */ - __ww_mutex_die(lock, cur, ww_ctx); - } - - __mutex_add_waiter(lock, waiter, pos); - - /* - * Wound-Wait: if we're blocking on a mutex owned by a younger context, - * wound that such that we might proceed. - */ - if (!is_wait_die) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - - /* - * See ww_mutex_set_context_fastpath(). Orders setting - * MUTEX_FLAG_WAITERS vs the ww->ctx load, - * such that either we or the fastpath will wound @ww->ctx. - */ - smp_mb(); - __ww_mutex_wound(lock, ww_ctx, ww->ctx); - } - - return 0; -} - /* * Lock a mutex (possibly interruptible), slowpath: */ static __always_inline int __sched -__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, +__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct mutex_waiter waiter; - bool first = false; struct ww_mutex *ww; int ret; + if (!use_ww_ctx) + ww_ctx = NULL; + might_sleep(); -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(lock->magic != lock); -#endif + MUTEX_WARN_ON(lock->magic != lock); ww = container_of(lock, struct ww_mutex, base); - if (use_ww_ctx && ww_ctx) { + if (ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; @@ -950,44 +598,48 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ if (ww_ctx->acquired == 0) ww_ctx->wounded = 0; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + nest_lock = &ww_ctx->dep_map; +#endif } preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); if (__mutex_trylock(lock) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) { + mutex_optimistic_spin(lock, ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx && ww_ctx) + if (ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); + trace_contention_end(lock, 0); preempt_enable(); return 0; } - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); /* * After waiting to acquire the wait_lock, try again. */ if (__mutex_trylock(lock)) { - if (use_ww_ctx && ww_ctx) + if (ww_ctx) __ww_mutex_check_waiters(lock, ww_ctx); goto skip_wait; } debug_mutex_lock_common(lock, &waiter); + waiter.task = current; + if (use_ww_ctx) + waiter.ww_ctx = ww_ctx; lock_contended(&lock->dep_map, ip); if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ __mutex_add_waiter(lock, &waiter, &lock->wait_list); - - -#ifdef CONFIG_DEBUG_MUTEXES - waiter.ww_ctx = MUTEX_POISON_WW_CTX; -#endif } else { /* * Add in stamp order, waking up waiters that must kill @@ -996,14 +648,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); if (ret) goto err_early_kill; - - waiter.ww_ctx = ww_ctx; } - waiter.task = current; - set_current_state(state); + trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { + bool first; + /* * Once we hold wait_lock, we're serialized against * mutex_unlock() handing the lock off to us, do a trylock @@ -1023,24 +674,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - if (use_ww_ctx && ww_ctx) { + if (ww_ctx) { ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx); if (ret) goto err; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); schedule_preempt_disabled(); - /* - * ww_mutex needs to always recheck its position since its waiter - * list is not FIFO ordered. - */ - if ((use_ww_ctx && ww_ctx) || !first) { - first = __mutex_waiter_is_first(lock, &waiter); - if (first) - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); - } + first = __mutex_waiter_is_first(lock, &waiter); set_current_state(state); /* @@ -1048,17 +691,23 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if (__mutex_trylock(lock) || - (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) + if (__mutex_trylock_or_handoff(lock, first)) break; - spin_lock(&lock->wait_lock); + if (first) { + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) + break; + trace_contention_begin(lock, LCB_F_MUTEX); + } + + raw_spin_lock(&lock->wait_lock); } - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); acquired: __set_current_state(TASK_RUNNING); - if (use_ww_ctx && ww_ctx) { + if (ww_ctx) { /* * Wound-Wait; we stole the lock (!first_waiter), check the * waiters as anyone might want to wound us. @@ -1068,28 +717,28 @@ acquired: __ww_mutex_check_waiters(lock, ww_ctx); } - mutex_remove_waiter(lock, &waiter, current); - if (likely(list_empty(&lock->wait_list))) - __mutex_clear_flag(lock, MUTEX_FLAGS); + __mutex_remove_waiter(lock, &waiter); debug_mutex_free_waiter(&waiter); skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); + trace_contention_end(lock, 0); - if (use_ww_ctx && ww_ctx) + if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); preempt_enable(); return 0; err: __set_current_state(TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current); + __mutex_remove_waiter(lock, &waiter); err_early_kill: - spin_unlock(&lock->wait_lock); + trace_contention_end(lock, ret); + raw_spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); preempt_enable(); @@ -1097,19 +746,56 @@ err_early_kill: } static int __sched -__mutex_lock(struct mutex *lock, long state, unsigned int subclass, +__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip) { return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); } static int __sched -__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass, - struct lockdep_map *nest_lock, unsigned long ip, - struct ww_acquire_ctx *ww_ctx) +__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, + unsigned long ip, struct ww_acquire_ctx *ww_ctx) +{ + return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); +} + +/** + * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context + * @ww: mutex to lock + * @ww_ctx: optional w/w acquire context + * + * Trylocks a mutex with the optional acquire context; no deadlock detection is + * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise. + * + * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is + * specified, -EALREADY handling may happen in calls to ww_mutex_trylock. + * + * A mutex acquired with this function must be released with ww_mutex_unlock. + */ +int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { - return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true); + if (!ww_ctx) + return mutex_trylock(&ww->base); + + MUTEX_WARN_ON(ww->base.magic != &ww->base); + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + + if (__mutex_trylock(&ww->base)) { + ww_mutex_set_context_fastpath(ww, ww_ctx); + mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); + return 1; + } + + return 0; } +EXPORT_SYMBOL(ww_mutex_trylock); #ifdef CONFIG_DEBUG_LOCK_ALLOC void __sched @@ -1188,8 +874,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx); + 0, _RET_IP_, ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -1204,8 +889,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx); + 0, _RET_IP_, ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -1236,29 +920,21 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne */ owner = atomic_long_read(&lock->owner); for (;;) { - unsigned long old; - -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); - DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(__owner_task(owner) != current); + MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); if (owner & MUTEX_FLAG_HANDOFF) break; - old = atomic_long_cmpxchg_release(&lock->owner, owner, - __owner_flags(owner)); - if (old == owner) { + if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) { if (owner & MUTEX_FLAG_WAITERS) break; return; } - - owner = old; } - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -1275,7 +951,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); wake_up_q(&wake_q); } @@ -1379,7 +1055,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock) static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL, + return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, _RET_IP_, ctx); } @@ -1387,7 +1063,7 @@ static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL, + return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, _RET_IP_, ctx); } @@ -1411,9 +1087,7 @@ int __sched mutex_trylock(struct mutex *lock) { bool locked; -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(lock->magic != lock); -#endif + MUTEX_WARN_ON(lock->magic != lock); locked = __mutex_trylock(lock); if (locked) @@ -1454,7 +1128,11 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } EXPORT_SYMBOL(ww_mutex_lock_interruptible); -#endif +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* !CONFIG_PREEMPT_RT */ + +EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin); +EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end); /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 1c2287d3fa71..0b2a79c4013b 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -5,21 +5,41 @@ * started by Ingo Molnar: * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * - * This file contains mutex debugging related internal prototypes, for the - * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define mutex_remove_waiter(lock, waiter, task) \ - __list_del((waiter)->list.prev, (waiter)->list.next) - -#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) -#define debug_mutex_free_waiter(waiter) do { } while (0) -#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) -#define debug_mutex_unlock(lock) do { } while (0) -#define debug_mutex_init(lock, name, key) do { } while (0) +/* + * This is the control structure for tasks blocked on mutex, which resides + * on the blocked task's kernel stack: + */ +struct mutex_waiter { + struct list_head list; + struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; +#ifdef CONFIG_DEBUG_MUTEXES + void *magic; +#endif +}; -static inline void -debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) -{ -} +#ifdef CONFIG_DEBUG_MUTEXES +extern void debug_mutex_lock_common(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_wake_waiter(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_add_waiter(struct mutex *lock, + struct mutex_waiter *waiter, + struct task_struct *task); +extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct task_struct *task); +extern void debug_mutex_unlock(struct mutex *lock); +extern void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key); +#else /* CONFIG_DEBUG_MUTEXES */ +# define debug_mutex_lock_common(lock, waiter) do { } while (0) +# define debug_mutex_wake_waiter(lock, waiter) do { } while (0) +# define debug_mutex_free_waiter(waiter) do { } while (0) +# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) +# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) +# define debug_mutex_unlock(lock) do { } while (0) +# define debug_mutex_init(lock, name, key) do { } while (0) +#endif /* !CONFIG_DEBUG_MUTEXES */ diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 1f7734949ac8..75a6f6133866 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -11,6 +11,13 @@ * called from interrupt context and we have preemption disabled while * spinning. */ + +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ +}; + static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); /* @@ -37,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) /* * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. * Can return NULL in case we were the last queued and we updated @lock instead. + * + * If osq_lock() is being cancelled there must be a previous node + * and 'old_cpu' is its CPU #. + * For osq_unlock() there is never a previous node and old_cpu is + * set to OSQ_UNLOCKED_VAL. */ static inline struct optimistic_spin_node * osq_wait_next(struct optimistic_spin_queue *lock, struct optimistic_spin_node *node, - struct optimistic_spin_node *prev) + int old_cpu) { - struct optimistic_spin_node *next = NULL; int curr = encode_cpu(smp_processor_id()); - int old; - - /* - * If there is a prev node in queue, then the 'old' value will be - * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if - * we're currently last in queue, then the queue will then become empty. - */ - old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; for (;;) { if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { + atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its * unlock()/unqueue(). */ - break; + return NULL; } /* @@ -76,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock, * wait for a new @node->next from its Step-C. */ if (node->next) { + struct optimistic_spin_node *next; + next = xchg(&node->next, NULL); if (next) - break; + return next; } cpu_relax(); } - - return next; } bool osq_lock(struct optimistic_spin_queue *lock) @@ -135,7 +138,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) */ /* - * Wait to acquire the lock or cancelation. Note that need_resched() + * Wait to acquire the lock or cancellation. Note that need_resched() * will come with an IPI, which will wake smp_cond_load_relaxed() if it * is implemented with a monitor-wait. vcpu_is_preempted() relies on * polling, be careful. @@ -154,13 +157,17 @@ bool osq_lock(struct optimistic_spin_queue *lock) */ for (;;) { - if (prev->next == node && + /* + * cpu_relax() below implies a compiler barrier which would + * prevent this comparison being optimized away. + */ + if (data_race(prev->next) == node && cmpxchg(&prev->next, node, NULL) == node) break; /* * We can only fail the cmpxchg() racing against an unlock(), - * in which case we should observe @node->locked becomming + * in which case we should observe @node->locked becoming * true. */ if (smp_load_acquire(&node->locked)) @@ -182,7 +189,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) * back to @prev. */ - next = osq_wait_next(lock, node, prev); + next = osq_wait_next(lock, node, prev->cpu); if (!next) return false; @@ -222,7 +229,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) return; } - next = osq_wait_next(lock, node, NULL); + next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL); if (next) WRITE_ONCE(next->locked, 1); } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 8bbafe3e5203..6083883c4fe0 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,7 +7,9 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> +#include <linux/sched/debug.h> #include <linux/errno.h> +#include <trace/events/lock.h> int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *key) @@ -45,7 +47,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem); static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); /* * Due to having preemption disabled the decrement happens on @@ -71,7 +73,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) if (likely(!atomic_read_acquire(&sem->block))) return true; - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count); /* Prod writer to re-evaluate readers_active_check() */ rcuwait_wake_up(&sem->writer); @@ -162,7 +164,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) { if (__percpu_down_read_trylock(sem)) return true; @@ -170,9 +172,11 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) if (try) return false; + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); percpu_rwsem_wait(sem, /* .reader = */ true); preempt_disable(); + trace_contention_end(sem, 0); return true; } @@ -188,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); __sum; \ }) +bool percpu_is_read_locked(struct percpu_rw_semaphore *sem) +{ + return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block); +} +EXPORT_SYMBOL_GPL(percpu_is_read_locked); + /* * Return true if the modular sum of the sem->read_count per-CPU variable is * zero. If this sum is zero, then it is stable due to the fact that if any @@ -211,8 +221,10 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) return true; } -void percpu_down_write(struct percpu_rw_semaphore *sem) +void __sched percpu_down_write(struct percpu_rw_semaphore *sem) { + bool contended = false; + might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -223,8 +235,11 @@ void percpu_down_write(struct percpu_rw_semaphore *sem) * Try set sem->block; this provides writer-writer exclusion. * Having sem->block set makes new readers block. */ - if (!__percpu_down_write_trylock(sem)) + if (!__percpu_down_write_trylock(sem)) { + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); percpu_rwsem_wait(sem, /* .reader = */ false); + contended = true; + } /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */ @@ -236,6 +251,8 @@ void percpu_down_write(struct percpu_rw_semaphore *sem) /* Wait for all active readers to complete. */ rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); + if (contended) + trace_contention_end(sem, 0); } EXPORT_SYMBOL_GPL(percpu_down_write); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fe9ca92faa2a..d2ef312a8611 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -12,13 +12,13 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/spinlock.h> -#include <asm/qrwlock.h> +#include <trace/events/lock.h> /** - * queued_read_lock_slowpath - acquire read lock of a queue rwlock - * @lock: Pointer to queue rwlock structure + * queued_read_lock_slowpath - acquire read lock of a queued rwlock + * @lock: Pointer to queued rwlock structure */ -void queued_read_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock) { /* * Readers come here when they cannot get the lock without waiting @@ -35,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock) } atomic_sub(_QR_BIAS, &lock->cnts); + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ); + /* * Put the reader into the wait queue */ @@ -52,32 +54,39 @@ void queued_read_lock_slowpath(struct qrwlock *lock) * Signal the next one in queue to become queue head */ arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queued_write_lock_slowpath - acquire write lock of a queue rwlock - * @lock : Pointer to queue rwlock structure + * queued_write_lock_slowpath - acquire write lock of a queued rwlock + * @lock : Pointer to queued rwlock structure */ -void queued_write_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock) { + int cnts; + + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE); + /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); /* Try to acquire the lock directly if no reader is present */ - if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) + if (!(cnts = atomic_read(&lock->cnts)) && + atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)) goto unlock; /* Set the waiting flag to notify readers that a writer is pending */ - atomic_add(_QW_WAITING, &lock->cnts); + atomic_or(_QW_WAITING, &lock->cnts); /* When no more readers or writers, set the locked flag */ do { - atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); - } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, - _QW_LOCKED) != _QW_WAITING); + cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING); + } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)); unlock: arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b9515fcc9b29..ebe6b8ec7cb3 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -22,6 +22,7 @@ #include <linux/prefetch.h> #include <asm/byteorder.h> #include <asm/qspinlock.h> +#include <trace/events/lock.h> /* * Include queued spinlock statistics code @@ -312,7 +313,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : * queue : ^--' : */ -void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; u32 old, tail; @@ -370,7 +371,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) /* * We're pending, wait for the owner to go away. * - * 0,1,1 -> 0,1,0 + * 0,1,1 -> *,1,0 * * this wait loop must be a load-acquire such that we match the * store-release that clears the locked bit and create lock @@ -379,7 +380,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * barriers. */ if (val & _Q_LOCKED_MASK) - atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK)); + smp_cond_load_acquire(&lock->locked, !VAL); /* * take ownership and clear the pending bit. @@ -401,6 +402,8 @@ pv_queue: idx = node->count++; tail = encode_tail(smp_processor_id(), idx); + trace_contention_begin(lock, LCB_F_SPIN); + /* * 4 nodes are allocated based on the assumption that there will * not be nested NMIs taking spinlocks. That may not be true in @@ -554,6 +557,8 @@ locked: pv_kick_node(lock, next); release: + trace_contention_end(lock, 0); + /* * release the node */ @@ -581,4 +586,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" +bool nopvspin __initdata; +static __init int parse_nopvspin(char *arg) +{ + nopvspin = true; + return 0; +} +early_param("nopvspin", parse_nopvspin); #endif diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e84d21aa0722..ae2b12f68b90 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -294,8 +294,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; struct pv_node *pp = (struct pv_node *)prev; + bool __maybe_unused wait_early; int loop; - bool wait_early; for (;;) { for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { @@ -486,10 +486,20 @@ gotlock: } /* + * Include the architecture specific callee-save thunk of the + * __pv_queued_spin_unlock(). This thunk is put together with + * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, architecture specific version of __pv_queued_spin_unlock() + * can be defined. + */ +#include <asm/qspinlock_paravirt.h> + +/* * PV versions of the unlock fastpath and slowpath functions to be used * instead of queued_spin_unlock(). */ -__visible void +__visible __lockfunc void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct pv_node *node; @@ -533,18 +543,8 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) pv_kick(node->cpu); } -/* - * Include the architecture specific callee-save thunk of the - * __pv_queued_spin_unlock(). This thunk is put together with - * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock - * function close to each other sharing consecutive instruction cachelines. - * Alternatively, architecture specific version of __pv_queued_spin_unlock() - * can be defined. - */ -#include <asm/qspinlock_paravirt.h> - #ifndef __pv_queued_spin_unlock -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock) { u8 locked; diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c deleted file mode 100644 index 36e69100e8e0..000000000000 --- a/kernel/locking/rtmutex-debug.c +++ /dev/null @@ -1,182 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * This code is based on the rt.c implementation in the preempt-rt tree. - * Portions of said code are - * - * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Copyright (C) 2006 Esben Nielsen - * Copyright (C) 2006 Kihon Technologies Inc., - * Steven Rostedt <rostedt@goodmis.org> - * - * See rt.c in preempt-rt for proper credits and further information - */ -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/sched/debug.h> -#include <linux/delay.h> -#include <linux/export.h> -#include <linux/spinlock.h> -#include <linux/kallsyms.h> -#include <linux/syscalls.h> -#include <linux/interrupt.h> -#include <linux/rbtree.h> -#include <linux/fs.h> -#include <linux/debug_locks.h> - -#include "rtmutex_common.h" - -static void printk_task(struct task_struct *p) -{ - if (p) - printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); - else - printk("<none>"); -} - -static void printk_lock(struct rt_mutex *lock, int print_owner) -{ - if (lock->name) - printk(" [%p] {%s}\n", - lock, lock->name); - else - printk(" [%p] {%s:%d}\n", - lock, lock->file, lock->line); - - if (print_owner && rt_mutex_owner(lock)) { - printk(".. ->owner: %p\n", lock->owner); - printk(".. held by: "); - printk_task(rt_mutex_owner(lock)); - printk("\n"); - } -} - -void rt_mutex_debug_task_free(struct task_struct *task) -{ - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); - DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); -} - -/* - * We fill out the fields in the waiter to store the information about - * the deadlock. We print when we return. act_waiter can be NULL in - * case of a remove waiter operation. - */ -void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *act_waiter, - struct rt_mutex *lock) -{ - struct task_struct *task; - - if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) - return; - - task = rt_mutex_owner(act_waiter->lock); - if (task && task != current) { - act_waiter->deadlock_task_pid = get_pid(task_pid(task)); - act_waiter->deadlock_lock = lock; - } -} - -void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) -{ - struct task_struct *task; - - if (!waiter->deadlock_lock || !debug_locks) - return; - - rcu_read_lock(); - task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); - if (!task) { - rcu_read_unlock(); - return; - } - - if (!debug_locks_off()) { - rcu_read_unlock(); - return; - } - - pr_warn("\n"); - pr_warn("============================================\n"); - pr_warn("WARNING: circular locking deadlock detected!\n"); - pr_warn("%s\n", print_tainted()); - pr_warn("--------------------------------------------\n"); - printk("%s/%d is deadlocking current task %s/%d\n\n", - task->comm, task_pid_nr(task), - current->comm, task_pid_nr(current)); - - printk("\n1) %s/%d is trying to acquire this lock:\n", - current->comm, task_pid_nr(current)); - printk_lock(waiter->lock, 1); - - printk("\n2) %s/%d is blocked on this lock:\n", - task->comm, task_pid_nr(task)); - printk_lock(waiter->deadlock_lock, 1); - - debug_show_held_locks(current); - debug_show_held_locks(task); - - printk("\n%s/%d's [blocked] stackdump:\n\n", - task->comm, task_pid_nr(task)); - show_stack(task, NULL, KERN_DEFAULT); - printk("\n%s/%d's [current] stackdump:\n\n", - current->comm, task_pid_nr(current)); - dump_stack(); - debug_show_all_locks(); - rcu_read_unlock(); - - printk("[ turning off deadlock detection." - "Please report this trace. ]\n\n"); -} - -void debug_rt_mutex_lock(struct rt_mutex *lock) -{ -} - -void debug_rt_mutex_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); -} - -void -debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) -{ -} - -void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); -} - -void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) -{ - memset(waiter, 0x11, sizeof(*waiter)); - waiter->deadlock_task_pid = NULL; -} - -void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) -{ - put_pid(waiter->deadlock_task_pid); - memset(waiter, 0x22, sizeof(*waiter)); -} - -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) -{ - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lock->name = name; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif -} - diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h deleted file mode 100644 index fc549713bba3..000000000000 --- a/kernel/locking/rtmutex-debug.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * This file contains macros used solely by rtmutex.c. Debug version. - */ - -extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); -extern void debug_rt_mutex_lock(struct rt_mutex *lock); -extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); -extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *waiter, - struct rt_mutex *lock); -extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ - do { (w)->deadlock_lock = NULL; } while (0) - -static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - enum rtmutex_chainwalk walk) -{ - return (waiter != NULL); -} - -static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) -{ - debug_rt_mutex_print_deadlock(w); -} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index cfdd5b93264d..88d08eeb8bc0 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,20 +8,60 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> * * See Documentation/locking/rt-mutex-design.rst for details. */ -#include <linux/spinlock.h> -#include <linux/export.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/deadline.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> -#include <linux/sched/deadline.h> #include <linux/sched/wake_q.h> -#include <linux/sched/debug.h> -#include <linux/timer.h> +#include <linux/ww_mutex.h> + +#include <trace/events/lock.h> #include "rtmutex_common.h" +#ifndef WW_RT +# define build_ww_mutex() (false) +# define ww_container_of(rtm) NULL + +static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter, + struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + return 0; +} + +static inline void __ww_mutex_check_waiters(struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ +} + +static inline void ww_mutex_lock_acquired(struct ww_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ +} + +static inline int __ww_mutex_check_kill(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + return 0; +} + +#else +# define build_ww_mutex() (true) +# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base) +# include "ww_mutex.h" +#endif + /* * lock->owner state tracking: * @@ -49,24 +89,41 @@ * set this bit before looking at the lock. */ -static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) +static __always_inline struct task_struct * +rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) { unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; - WRITE_ONCE(lock->owner, (struct task_struct *)val); + return (struct task_struct *)val; } -static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void +rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) +{ + /* + * lock->wait_lock is held but explicit acquire semantics are needed + * for a new lock owner so WRITE_ONCE is insufficient. + */ + xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner)); +} + +static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock) +{ + /* lock->wait_lock is held so the unlock provides release semantics. */ + WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL)); +} + +static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } -static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void +fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -132,8 +189,21 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * still set. */ owner = READ_ONCE(*p); - if (owner & RT_MUTEX_HAS_WAITERS) - WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + if (owner & RT_MUTEX_HAS_WAITERS) { + /* + * See rt_mutex_set_owner() and rt_mutex_clear_owner() on + * why xchg_acquire() is used for updating owner for + * locking and WRITE_ONCE() for unlocking. + * + * WRITE_ONCE() would work for the acquire case too, but + * in case that the lock acquisition failed it might + * force other lockers into the slow path unnecessarily. + */ + if (acquire_lock) + xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS); + else + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + } } /* @@ -141,22 +211,46 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) -# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return try_cmpxchg_acquire(&lock->owner, &old, new); +} + +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + return rt_mutex_cmpxchg_acquire(lock, NULL, current); +} + +static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return try_cmpxchg_release(&lock->owner, &old, new); +} /* * Callers must hold the ->wait_lock -- which is the whole purpose as we force * all future threads that attempt to [Rmw] the lock to the slowpath. As such * relaxed semantics suffice. */ -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) { - unsigned long owner, *p = (unsigned long *) &lock->owner; + unsigned long *p = (unsigned long *) &lock->owner; + unsigned long owner, new; + owner = READ_ONCE(*p); do { - owner = *p; - } while (cmpxchg_relaxed(p, owner, - owner | RT_MUTEX_HAS_WAITERS) != owner); + new = owner | RT_MUTEX_HAS_WAITERS; + } while (!try_cmpxchg_relaxed(p, &owner, new)); + + /* + * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE + * operations in the event of contention. Ensure the successful + * cmpxchg is visible. + */ + smp_mb__after_atomic(); } /* @@ -165,8 +259,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - unsigned long flags) +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); @@ -201,10 +295,36 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_acquire(l,c,n) (0) -# define rt_mutex_cmpxchg_release(l,c,n) (0) +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return false; + +} + +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock); + +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + /* + * With debug enabled rt_mutex_cmpxchg trylock() will always fail. + * + * Avoid unconditionally taking the slow path by using + * rt_mutex_slow_trylock() which is covered by the debug code and can + * acquire a non-contended rtmutex. + */ + return rt_mutex_slowtrylock(lock); +} + +static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return false; +} -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); @@ -213,8 +333,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - unsigned long flags) +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; @@ -223,15 +343,53 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #endif +static __always_inline int __waiter_prio(struct task_struct *task) +{ + int prio = task->prio; + + if (!rt_prio(prio)) + return DEFAULT_PRIO; + + return prio; +} + +/* + * Update the waiter->tree copy of the sort keys. + */ +static __always_inline void +waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry)); + + waiter->tree.prio = __waiter_prio(task); + waiter->tree.deadline = task->dl.deadline; +} + +/* + * Update the waiter->pi_tree copy of the sort keys (from the tree copy). + */ +static __always_inline void +waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert_held(&task->pi_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry)); + + waiter->pi_tree.prio = waiter->tree.prio; + waiter->pi_tree.deadline = waiter->tree.deadline; +} + /* - * Only use with rt_mutex_waiter_{less,equal}() + * Only use with rt_waiter_node_{less,equal}() */ +#define task_to_waiter_node(p) \ + &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) } -static inline int -rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio < right->prio) return 1; @@ -248,9 +406,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, return 0; } -static inline int -rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio != right->prio) return 0; @@ -267,76 +424,110 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, return 1; } -static void -rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, + struct rt_mutex_waiter *top_waiter) { - struct rb_node **link = &lock->waiters.rb_root.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - bool leftmost = true; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } + if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree)) + return true; + +#ifdef RT_MUTEX_BUILD_SPINLOCKS + /* + * Note that RT tasks are excluded from same priority (lateral) + * steals to prevent the introduction of an unbounded latency. + */ + if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) + return false; + + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); +#else + return false; +#endif +} + +#define __node_2_waiter(node) \ + rb_entry((node), struct rt_mutex_waiter, tree.entry) + +static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) +{ + struct rt_mutex_waiter *aw = __node_2_waiter(a); + struct rt_mutex_waiter *bw = __node_2_waiter(b); + + if (rt_waiter_node_less(&aw->tree, &bw->tree)) + return 1; + + if (!build_ww_mutex()) + return 0; + + if (rt_waiter_node_less(&bw->tree, &aw->tree)) + return 0; + + /* NOTE: relies on waiter->ww_ctx being set before insertion */ + if (aw->ww_ctx) { + if (!bw->ww_ctx) + return 1; + + return (signed long)(aw->ww_ctx->stamp - + bw->ww_ctx->stamp) < 0; } - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); + return 0; } -static void -rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +static __always_inline void +rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->tree_entry)) + lockdep_assert_held(&lock->wait_lock); + + rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less); +} + +static __always_inline void +rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) +{ + lockdep_assert_held(&lock->wait_lock); + + if (RB_EMPTY_NODE(&waiter->tree.entry)) return; - rb_erase_cached(&waiter->tree_entry, &lock->waiters); - RB_CLEAR_NODE(&waiter->tree_entry); + rb_erase_cached(&waiter->tree.entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree.entry); +} + +#define __node_2_rt_node(node) \ + rb_entry((node), struct rt_waiter_node, entry) + +static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) +{ + return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b)); } -static void +static __always_inline void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_root.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - bool leftmost = true; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } + lockdep_assert_held(&task->pi_lock); - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); + rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less); } -static void +static __always_inline void rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) + lockdep_assert_held(&task->pi_lock); + + if (RB_EMPTY_NODE(&waiter->pi_tree.entry)) return; - rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); - RB_CLEAR_NODE(&waiter->pi_tree_entry); + rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree.entry); } -static void rt_mutex_adjust_prio(struct task_struct *p) +static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock, + struct task_struct *p) { struct task_struct *pi_task = NULL; + lockdep_assert_held(&lock->wait_lock); + lockdep_assert(rt_mutex_owner(lock) == p); lockdep_assert_held(&p->pi_lock); if (task_has_pi_waiters(p)) @@ -345,6 +536,42 @@ static void rt_mutex_adjust_prio(struct task_struct *p) rt_mutex_setprio(p, pi_task); } +/* RT mutex specific wake_q wrappers */ +static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh, + struct task_struct *task, + unsigned int wake_state) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) { + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(wqh->rtlock_task); + get_task_struct(task); + wqh->rtlock_task = task; + } else { + wake_q_add(&wqh->head, task); + } +} + +static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, + struct rt_mutex_waiter *w) +{ + rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state); +} + +static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) { + wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT); + put_task_struct(wqh->rtlock_task); + wqh->rtlock_task = NULL; + } + + if (!wake_q_empty(&wqh->head)) + wake_up_q(&wqh->head); + + /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ + preempt_enable(); +} + /* * Deadlock detection is conditional: * @@ -358,25 +585,16 @@ static void rt_mutex_adjust_prio(struct task_struct *p) * deadlock detection is disabled independent of the detect argument * and the config settings. */ -static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, - enum rtmutex_chainwalk chwalk) +static __always_inline bool +rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk chwalk) { - /* - * This is just a wrapper function for the following call, - * because debug_rt_mutex_detect_deadlock() smells like a magic - * debug feature and I wanted to keep the cond function in the - * main source file along with the comments instead of having - * two of the same in the headers. - */ - return debug_rt_mutex_detect_deadlock(waiter, chwalk); + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + return waiter != NULL; + return chwalk == RT_MUTEX_FULL_CHAINWALK; } -/* - * Max number of times we'll walk the boosting chain: - */ -int max_lock_depth = 1024; - -static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p) { return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; } @@ -405,9 +623,14 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * Chain walk basics and protection scope * * [R] refcount on task - * [P] task->pi_lock held + * [Pn] task->pi_lock held * [L] rtmutex->wait_lock held * + * Normal locking order: + * + * rtmutex->wait_lock + * task->pi_lock + * * Step Description Protected by * function arguments: * @task [R] @@ -422,39 +645,44 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * again: * loop_sanity_check(); * retry: - * [1] lock(task->pi_lock); [R] acquire [P] - * [2] waiter = task->pi_blocked_on; [P] - * [3] check_exit_conditions_1(); [P] - * [4] lock = waiter->lock; [P] - * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] - * unlock(task->pi_lock); release [P] + * [1] lock(task->pi_lock); [R] acquire [P1] + * [2] waiter = task->pi_blocked_on; [P1] + * [3] check_exit_conditions_1(); [P1] + * [4] lock = waiter->lock; [P1] + * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L] + * unlock(task->pi_lock); release [P1] * goto retry; * } - * [6] check_exit_conditions_2(); [P] + [L] - * [7] requeue_lock_waiter(lock, waiter); [P] + [L] - * [8] unlock(task->pi_lock); release [P] + * [6] check_exit_conditions_2(); [P1] + [L] + * [7] requeue_lock_waiter(lock, waiter); [P1] + [L] + * [8] unlock(task->pi_lock); release [P1] * put_task_struct(task); release [R] * [9] check_exit_conditions_3(); [L] * [10] task = owner(lock); [L] * get_task_struct(task); [L] acquire [R] - * lock(task->pi_lock); [L] acquire [P] - * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] - * [12] check_exit_conditions_4(); [P] + [L] - * [13] unlock(task->pi_lock); release [P] + * lock(task->pi_lock); [L] acquire [P2] + * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L] + * [12] check_exit_conditions_4(); [P2] + [L] + * [13] unlock(task->pi_lock); release [P2] * unlock(lock->wait_lock); release [L] * goto again; + * + * Where P1 is the blocking task and P2 is the lock owner; going up one step + * the owner becomes the next blocked task etc.. + * +* */ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, - enum rtmutex_chainwalk chwalk, - struct rt_mutex *orig_lock, - struct rt_mutex *next_lock, - struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) +static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, + enum rtmutex_chainwalk chwalk, + struct rt_mutex_base *orig_lock, + struct rt_mutex_base *next_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task) { struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; struct rt_mutex_waiter *prerequeue_top_waiter; int ret = 0, depth = 0; - struct rt_mutex *lock; + struct rt_mutex_base *lock; bool detect_deadlock; bool requeue = true; @@ -537,6 +765,31 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; /* + * There could be 'spurious' loops in the lock graph due to ww_mutex, + * consider: + * + * P1: A, ww_A, ww_B + * P2: ww_B, ww_A + * P3: A + * + * P3 should not return -EDEADLK because it gets trapped in the cycle + * created by P1 and P2 (which will resolve -- and runs into + * max_lock_depth above). Therefore disable detect_deadlock such that + * the below termination condition can trigger once all relevant tasks + * are boosted. + * + * Even when we start with ww_mutex we can disable deadlock detection, + * since we would supress a ww_mutex induced deadlock at [6] anyway. + * Supressing it here however is not sufficient since we might still + * hit [6] due to adjustment driven iteration. + * + * NOTE: if someone were to create a deadlock between 2 ww_classes we'd + * utterly fail to report it; lockdep should. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock) + detect_deadlock = false; + + /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting * mode! @@ -565,7 +818,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. */ - if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -573,13 +826,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* - * [4] Get the next lock + * [4] Get the next lock; per holding task->pi_lock we can't unblock + * and guarantee @lock's existence. */ lock = waiter->lock; /* * [5] We need to trylock here as we are holding task->pi_lock, * which is the reverse lock order versus the other rtmutex * operations. + * + * Per the above, holding task->pi_lock guarantees lock exists, so + * inverting this lock order is infeasible from a life-time + * perspective. */ if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irq(&task->pi_lock); @@ -597,9 +855,21 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * walk, we detected a deadlock. */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); - raw_spin_unlock(&lock->wait_lock); ret = -EDEADLK; + + /* + * When the deadlock is due to ww_mutex; also see above. Don't + * report the deadlock and instead let the ww_mutex wound/die + * logic pick which of the contending threads gets -EDEADLK. + * + * NOTE: assumes the cycle only contains a single ww_class; any + * other configuration and we fail to report; also, see + * lockdep. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx) + ret = 0; + + raw_spin_unlock(&lock->wait_lock); goto out_unlock_pi; } @@ -671,18 +941,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * or * * DL CBS enforcement advancing the effective deadline. - * - * Even though pi_waiters also uses these fields, and that tree is only - * updated in [11], we can do this here, since we hold [L], which - * serializes all pi_waiters access and rb_erase() does not care about - * the values of the node being removed. */ - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); rt_mutex_enqueue(lock, waiter); - /* [8] Release the task */ + /* + * [8] Release the (blocking) task in preparation for + * taking the owner task in [10]. + * + * Since we hold lock->waiter_lock, task cannot unblock, even if we + * release task->pi_lock. + */ raw_spin_unlock(&task->pi_lock); put_task_struct(task); @@ -699,13 +969,19 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * then we need to wake the new top waiter up to try * to get the lock. */ - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); + top_waiter = rt_mutex_top_waiter(lock); + if (prerequeue_top_waiter != top_waiter) + wake_up_state(top_waiter->task, top_waiter->wake_state); raw_spin_unlock_irq(&lock->wait_lock); return 0; } - /* [10] Grab the next task, i.e. the owner of @lock */ + /* + * [10] Grab the next task, i.e. the owner of @lock + * + * Per holding lock->wait_lock and checking for !owner above, there + * must be an owner and it cannot go away. + */ task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); @@ -718,13 +994,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * and adjust the priority of the owner. */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else if (prerequeue_top_waiter == waiter) { /* * The waiter was the top waiter on the lock, but is - * no longer the top prority waiter. Replace waiter in + * no longer the top priority waiter. Replace waiter in * the owner tasks pi waiters tree with the new top * (highest priority) waiter and adjust the priority * of the owner. @@ -734,8 +1011,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else { /* * Nothing changed. No need to do any priority @@ -802,8 +1080,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * @waiter: The waiter that is queued to the lock's wait tree if the * callsite called task_blocked_on_lock(), otherwise NULL */ -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) +static int __sched +try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { lockdep_assert_held(&lock->wait_lock); @@ -838,19 +1117,21 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * trylock attempt. */ if (waiter) { - /* - * If waiter is not the highest priority waiter of - * @lock, give up. - */ - if (waiter != rt_mutex_top_waiter(lock)) - return 0; + struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock); /* - * We can acquire the lock. Remove the waiter from the - * lock waiters tree. + * If waiter is the highest priority waiter of @lock, + * or allowed to steal it, take it over. */ - rt_mutex_dequeue(lock, waiter); - + if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) { + /* + * We can acquire the lock. Remove the waiter from the + * lock waiters tree. + */ + rt_mutex_dequeue(lock, waiter); + } else { + return 0; + } } else { /* * If the lock has waiters already we check whether @task is @@ -861,13 +1142,9 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * not need to be dequeued. */ if (rt_mutex_has_waiters(lock)) { - /* - * If @task->prio is greater than or equal to - * the top waiter priority (kernel view), - * @task lost. - */ - if (!rt_mutex_waiter_less(task_to_waiter(task), - rt_mutex_top_waiter(lock))) + /* Check whether the trylock can steal it. */ + if (!rt_mutex_steal(task_to_waiter(task), + rt_mutex_top_waiter(lock))) return 0; /* @@ -904,9 +1181,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, raw_spin_unlock(&task->pi_lock); takeit: - /* We got the lock. */ - debug_rt_mutex_lock(lock); - /* * This either preserves the RT_MUTEX_HAS_WAITERS bit if there * are still waiters or clears it. @@ -923,14 +1197,15 @@ takeit: * * This must be called with lock->wait_lock held and interrupts disabled */ -static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task, - enum rtmutex_chainwalk chwalk) +static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + struct ww_acquire_ctx *ww_ctx, + enum rtmutex_chainwalk chwalk) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - struct rt_mutex *next_lock; + struct rt_mutex_base *next_lock; int chain_walk = 0, res; lockdep_assert_held(&lock->wait_lock); @@ -943,15 +1218,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * the other will detect the deadlock and return -EDEADLOCK, * which is wrong, as the other waiter is not in a deadlock * situation. + * + * Except for ww_mutex, in that case the chain walk must already deal + * with spurious cycles, see the comments at [3] and [6]. */ - if (owner == task) + if (owner == task && !(build_ww_mutex() && ww_ctx)) return -EDEADLK; raw_spin_lock(&task->pi_lock); waiter->task = task; waiter->lock = lock; - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); + waiter_clone_prio(waiter, task); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -962,6 +1240,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&task->pi_lock); + if (build_ww_mutex() && ww_ctx) { + struct rt_mutex *rtm; + + /* Check whether the waiter should back out immediately */ + rtm = container_of(lock, struct rt_mutex, rtmutex); + res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx); + if (res) { + raw_spin_lock(&task->pi_lock); + rt_mutex_dequeue(lock, waiter); + task->pi_blocked_on = NULL; + raw_spin_unlock(&task->pi_lock); + return res; + } + } + if (!owner) return 0; @@ -970,7 +1263,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1012,11 +1305,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * * Called with lock->wait_lock held and interrupts disabled. */ -static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, - struct rt_mutex *lock) +static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, + struct rt_mutex_base *lock) { struct rt_mutex_waiter *waiter; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1029,7 +1324,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, * task unblocks. */ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_adjust_prio(current); + rt_mutex_adjust_prio(lock, current); /* * As we are waking up the top waiter, and the waiter stays @@ -1049,244 +1344,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, * deboost but before waking our donor task, hence the preempt_disable() * before unlock. * - * Pairs with preempt_enable() in rt_mutex_postunlock(); + * Pairs with preempt_enable() in rt_mutex_wake_up_q(); */ preempt_disable(); - wake_q_add(wake_q, waiter->task); + rt_mutex_wake_q_add(wqh, waiter); raw_spin_unlock(¤t->pi_lock); } -/* - * Remove a waiter from a lock and give up - * - * Must be called with lock->wait_lock held and interrupts disabled. I must - * have just failed to try_to_take_rt_mutex(). - */ -static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) -{ - bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); - struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex *next_lock; - - lockdep_assert_held(&lock->wait_lock); - - raw_spin_lock(¤t->pi_lock); - rt_mutex_dequeue(lock, waiter); - current->pi_blocked_on = NULL; - raw_spin_unlock(¤t->pi_lock); - - /* - * Only update priority if the waiter was the highest priority - * waiter of the lock and there is an owner to update. - */ - if (!owner || !is_top_waiter) - return; - - raw_spin_lock(&owner->pi_lock); - - rt_mutex_dequeue_pi(owner, waiter); - - if (rt_mutex_has_waiters(lock)) - rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - - rt_mutex_adjust_prio(owner); - - /* Store the lock on which owner is blocked or NULL */ - next_lock = task_blocked_on_lock(owner); - - raw_spin_unlock(&owner->pi_lock); - - /* - * Don't walk the chain, if the owner task is not blocked - * itself. - */ - if (!next_lock) - return; - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - - raw_spin_unlock_irq(&lock->wait_lock); - - rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, - next_lock, NULL, current); - - raw_spin_lock_irq(&lock->wait_lock); -} - -/* - * Recheck the pi chain, in case we got a priority setting - * - * Called from sched_setscheduler - */ -void rt_mutex_adjust_pi(struct task_struct *task) -{ - struct rt_mutex_waiter *waiter; - struct rt_mutex *next_lock; - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return; - } - next_lock = waiter->lock; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); - - rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, - next_lock, NULL, task); -} - -void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) -{ - debug_rt_mutex_init_waiter(waiter); - RB_CLEAR_NODE(&waiter->pi_tree_entry); - RB_CLEAR_NODE(&waiter->tree_entry); - waiter->task = NULL; -} - -/** - * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop - * @lock: the rt_mutex to take - * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) - * @timeout: the pre-initialized and started timer, or NULL for none - * @waiter: the pre-initialized rt_mutex_waiter - * - * Must be called with lock->wait_lock held and interrupts disabled - */ -static int __sched -__rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) -{ - int ret = 0; - - for (;;) { - /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) - break; - - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (likely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? */ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - if (ret) - break; - } - - raw_spin_unlock_irq(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(waiter); - - schedule(); - - raw_spin_lock_irq(&lock->wait_lock); - set_current_state(state); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -static void rt_mutex_handle_deadlock(int res, int detect_deadlock, - struct rt_mutex_waiter *w) -{ - /* - * If the result is not -EDEADLOCK or the caller requested - * deadlock detection, nothing to do here. - */ - if (res != -EDEADLOCK || detect_deadlock) - return; - - /* - * Yell lowdly and stop the task right here. - */ - rt_mutex_print_deadlock(w); - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - } -} - -/* - * Slow path lock function: - */ -static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk) -{ - struct rt_mutex_waiter waiter; - unsigned long flags; - int ret = 0; - - rt_mutex_init_waiter(&waiter); - - /* - * Technically we could use raw_spin_[un]lock_irq() here, but this can - * be called in early boot if the cmpxchg() fast path is disabled - * (debug, no architecture support). In this case we will acquire the - * rtmutex with lock->wait_lock held. But we cannot unconditionally - * enable interrupts in that early boot case. So we need to use the - * irqsave/restore variants. - */ - raw_spin_lock_irqsave(&lock->wait_lock, flags); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - - ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); - - if (likely(!ret)) - /* sleep on the mutex */ - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); - - if (unlikely(ret)) { - __set_current_state(TASK_RUNNING); - remove_waiter(lock, &waiter); - rt_mutex_handle_deadlock(ret, chwalk, &waiter); - } - - /* - * try_to_take_rt_mutex() sets the waiter bit - * unconditionally. We might have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - - /* Remove pending timer: */ - if (unlikely(timeout)) - hrtimer_cancel(&timeout->timer); - - debug_rt_mutex_free_waiter(&waiter); - - return ret; -} - -static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) +static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) { int ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1294,7 +1359,7 @@ static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) * try_to_take_rt_mutex() sets the lock waiters bit * unconditionally. Clean this up. */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, true); return ret; } @@ -1302,7 +1367,7 @@ static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) /* * Slow path try-lock function: */ -static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock) { unsigned long flags; int ret; @@ -1328,14 +1393,20 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) return ret; } +static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock) +{ + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 1; + + return rt_mutex_slowtrylock(lock); +} + /* * Slow path to release a rt-mutex. - * - * Return whether the current task needs to call rt_mutex_postunlock(). */ -static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) +static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) { + DEFINE_RT_WAKE_Q(wqh); unsigned long flags; /* irqsave required to support early boot calls */ @@ -1377,7 +1448,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ if (unlock_rt_mutex_safe(lock, flags) == true) - return false; + return; /* Relock the rtmutex and try again */ raw_spin_lock_irqsave(&lock->wait_lock, flags); } @@ -1388,534 +1459,407 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * * Queue the next waiter for wakeup once we release the wait_lock. */ - mark_wakeup_next_waiter(wake_q, lock); + mark_wakeup_next_waiter(&wqh, lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - return true; /* call rt_mutex_postunlock() */ + rt_mutex_wake_up_q(&wqh); } -/* - * debug aware fast / slowpath lock,trylock,unlock - * - * The atomic acquire/release ops are compiled away, when either the - * architecture does not support cmpxchg or when debugging is enabled. - */ -static inline int -rt_mutex_fastlock(struct rt_mutex *lock, int state, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) +static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 0; + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); + rt_mutex_slowunlock(lock); } -static inline int -rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) +#ifdef CONFIG_SMP +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) { - if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 0; + bool res = true; - return slowfn(lock, state, timeout, chwalk); + rcu_read_lock(); + for (;;) { + /* If owner changed, trylock again. */ + if (owner != rt_mutex_owner(lock)) + break; + /* + * Ensure that @owner is dereferenced after checking that + * the lock owner still matches @owner. If that fails, + * @owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + /* + * Stop spinning when: + * - the lock owner has been scheduled out + * - current is not longer the top waiter + * - current is requested to reschedule (redundant + * for CONFIG_PREEMPT_RCU=y) + * - the VCPU on which owner runs is preempted + */ + if (!owner_on_cpu(owner) || need_resched() || + !rt_mutex_waiter_is_top_waiter(lock, waiter)) { + res = false; + break; + } + cpu_relax(); + } + rcu_read_unlock(); + return res; } - -static inline int -rt_mutex_fasttrylock(struct rt_mutex *lock, - int (*slowfn)(struct rt_mutex *lock)) +#else +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 1; - - return slowfn(lock); + return false; } +#endif +#ifdef RT_MUTEX_BUILD_MUTEX /* - * Performs the wakeup of the the top-waiter and re-enables preemption. + * Functions required for: + * - rtmutex, futex on all kernels + * - mutex and rwsem substitutions on RT kernels */ -void rt_mutex_postunlock(struct wake_q_head *wake_q) + +/* + * Remove a waiter from a lock and give up + * + * Must be called with lock->wait_lock held and interrupts disabled. It must + * have just failed to try_to_take_rt_mutex(). + */ +static void __sched remove_waiter(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) { - wake_up_q(wake_q); + bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); + struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex_base *next_lock; - /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ - preempt_enable(); -} + lockdep_assert_held(&lock->wait_lock); -static inline void -rt_mutex_fastunlock(struct rt_mutex *lock, - bool (*slowfn)(struct rt_mutex *lock, - struct wake_q_head *wqh)) -{ - DEFINE_WAKE_Q(wake_q); + raw_spin_lock(¤t->pi_lock); + rt_mutex_dequeue(lock, waiter); + current->pi_blocked_on = NULL; + raw_spin_unlock(¤t->pi_lock); - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + /* + * Only update priority if the waiter was the highest priority + * waiter of the lock and there is an owner to update. + */ + if (!owner || !is_top_waiter) return; - if (slowfn(lock, &wake_q)) - rt_mutex_postunlock(&wake_q); -} - -static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) -{ - might_sleep(); + raw_spin_lock(&owner->pi_lock); - mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); -} + rt_mutex_dequeue_pi(owner, waiter); -#ifdef CONFIG_DEBUG_LOCK_ALLOC -/** - * rt_mutex_lock_nested - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * @subclass: the lockdep subclass - */ -void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) -{ - __rt_mutex_lock(lock, subclass); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); + if (rt_mutex_has_waiters(lock)) + rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); -#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + rt_mutex_adjust_prio(lock, owner); -/** - * rt_mutex_lock - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - */ -void __sched rt_mutex_lock(struct rt_mutex *lock) -{ - __rt_mutex_lock(lock, 0); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock); -#endif + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); -/** - * rt_mutex_lock_interruptible - lock a rt_mutex interruptible - * - * @lock: the rt_mutex to be locked - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) -{ - int ret; + raw_spin_unlock(&owner->pi_lock); - might_sleep(); + /* + * Don't walk the chain, if the owner task is not blocked + * itself. + */ + if (!next_lock) + return; - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); - if (ret) - mutex_release(&lock->dep_map, _RET_IP_); + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); - return ret; -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + raw_spin_unlock_irq(&lock->wait_lock); -/* - * Futex variant, must not use fastpath. - */ -int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) -{ - return rt_mutex_slowtrylock(lock); -} + rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, + next_lock, NULL, current); -int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) -{ - return __rt_mutex_slowtrylock(lock); + raw_spin_lock_irq(&lock->wait_lock); } /** - * rt_mutex_timed_lock - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller - * - * @lock: the rt_mutex to be locked - * @timeout: timeout structure or NULL (no timeout) + * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @ww_ctx: WW mutex context pointer + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -ETIMEDOUT when the timeout expired + * Must be called with lock->wait_lock held and interrupts disabled */ -int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) +static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter) { - int ret; + struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); + struct task_struct *owner; + int ret = 0; - might_sleep(); + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock, current, waiter)) + break; - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - RT_MUTEX_MIN_CHAINWALK, - rt_mutex_slowlock); - if (ret) - mutex_release(&lock->dep_map, _RET_IP_); + if (timeout && !timeout->task) { + ret = -ETIMEDOUT; + break; + } + if (signal_pending_state(state, current)) { + ret = -EINTR; + break; + } - return ret; -} -EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + if (build_ww_mutex() && ww_ctx) { + ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx); + if (ret) + break; + } -/** - * rt_mutex_trylock - try to lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * - * This function can only be called in thread context. It's safe to - * call it from atomic regions, but not from hard interrupt or soft - * interrupt context. - * - * Returns 1 on success and 0 on contention - */ -int __sched rt_mutex_trylock(struct rt_mutex *lock) -{ - int ret; + if (waiter == rt_mutex_top_waiter(lock)) + owner = rt_mutex_owner(lock); + else + owner = NULL; + raw_spin_unlock_irq(&lock->wait_lock); - if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) - return 0; + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + rt_mutex_schedule(); - ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + raw_spin_lock_irq(&lock->wait_lock); + set_current_state(state); + } + __set_current_state(TASK_RUNNING); return ret; } -EXPORT_SYMBOL_GPL(rt_mutex_trylock); -/** - * rt_mutex_unlock - unlock a rt_mutex - * - * @lock: the rt_mutex to be unlocked - */ -void __sched rt_mutex_unlock(struct rt_mutex *lock) +static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_waiter *w) { - mutex_release(&lock->dep_map, _RET_IP_); - rt_mutex_fastunlock(lock, rt_mutex_slowunlock); + /* + * If the result is not -EDEADLOCK or the caller requested + * deadlock detection, nothing to do here. + */ + if (res != -EDEADLOCK || detect_deadlock) + return; + + if (build_ww_mutex() && w->ww_ctx) + return; + + /* + * Yell loudly and stop the task right here. + */ + WARN(1, "rtmutex deadlock detected\n"); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + rt_mutex_schedule(); + } } -EXPORT_SYMBOL_GPL(rt_mutex_unlock); /** - * Futex variant, that since futex variants do not use the fast-path, can be - * simple and will not need to retry. + * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held + * @lock: The rtmutex to block lock + * @ww_ctx: WW mutex context pointer + * @state: The task state for sleeping + * @chwalk: Indicator whether full or partial chainwalk is requested + * @waiter: Initializer waiter for blocking */ -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) +static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state, + enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *waiter) { + struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); + struct ww_mutex *ww = ww_container_of(rtm); + int ret; + lockdep_assert_held(&lock->wait_lock); - debug_rt_mutex_unlock(lock); + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock, current, NULL)) { + if (build_ww_mutex() && ww_ctx) { + __ww_mutex_check_waiters(rtm, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); + } + return 0; + } + + set_current_state(state); + + trace_contention_begin(lock, LCB_F_RT); - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - return false; /* done */ + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk); + if (likely(!ret)) + ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); + + if (likely(!ret)) { + /* acquired the lock */ + if (build_ww_mutex() && ww_ctx) { + if (!ww_ctx->is_wait_die) + __ww_mutex_check_waiters(rtm, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); + } + } else { + __set_current_state(TASK_RUNNING); + remove_waiter(lock, waiter); + rt_mutex_handle_deadlock(ret, chwalk, waiter); } /* - * We've already deboosted, mark_wakeup_next_waiter() will - * retain preempt_disabled when we drop the wait_lock, to - * avoid inversion prior to the wakeup. preempt_disable() - * therein pairs with rt_mutex_postunlock(). + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. */ - mark_wakeup_next_waiter(wake_q, lock); + fixup_rt_mutex_waiters(lock, true); + + trace_contention_end(lock, ret); - return true; /* call postunlock() */ + return ret; } -void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state) { - DEFINE_WAKE_Q(wake_q); - unsigned long flags; - bool postunlock; + struct rt_mutex_waiter waiter; + int ret; - raw_spin_lock_irqsave(&lock->wait_lock, flags); - postunlock = __rt_mutex_futex_unlock(lock, &wake_q); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + rt_mutex_init_waiter(&waiter); + waiter.ww_ctx = ww_ctx; - if (postunlock) - rt_mutex_postunlock(&wake_q); -} + ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK, + &waiter); -/** - * rt_mutex_destroy - mark a mutex unusable - * @lock: the mutex to be destroyed - * - * This function marks the mutex uninitialized, and any subsequent - * use of the mutex is forbidden. The mutex must not be locked when - * this function is called. - */ -void rt_mutex_destroy(struct rt_mutex *lock) -{ - WARN_ON(rt_mutex_is_locked(lock)); -#ifdef CONFIG_DEBUG_RT_MUTEXES - lock->magic = NULL; -#endif + debug_rt_mutex_free_waiter(&waiter); + return ret; } -EXPORT_SYMBOL_GPL(rt_mutex_destroy); -/** - * __rt_mutex_init - initialize the rt lock - * - * @lock: the rt lock to be initialized - * - * Initialize the rt lock to unlocked state. - * - * Initializing of a locked rt lock is not allowed +/* + * rt_mutex_slowlock - Locking slowpath invoked when fast path fails + * @lock: The rtmutex to block lock + * @ww_ctx: WW mutex context pointer + * @state: The task state for sleeping */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name, - struct lock_class_key *key) +static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state) { - lock->owner = NULL; - raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT_CACHED; + unsigned long flags; + int ret; + + /* + * Do all pre-schedule work here, before we queue a waiter and invoke + * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would + * otherwise recurse back into task_blocks_on_rt_mutex() through + * rtlock_slowlock() and will then enqueue a second waiter for this + * same task and things get really confusing real fast. + */ + rt_mutex_pre_schedule(); - if (name && key) - debug_rt_mutex_init(lock, name, key); + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); + ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + rt_mutex_post_schedule(); + + return ret; } -EXPORT_SYMBOL_GPL(__rt_mutex_init); -/** - * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a - * proxy owner - * - * @lock: the rt_mutex to be locked - * @proxy_owner:the task to set as owner - * - * No locking. Caller has to do serializing itself - * - * Special API call for PI-futex support. This initializes the rtmutex and - * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not - * possible at this point because the pi_state which contains the rtmutex - * is not yet visible to other tasks. - */ -void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner) +static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, + unsigned int state) { - __rt_mutex_init(lock, NULL, NULL); - debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner); + lockdep_assert(!current->pi_blocked_on); + + if (likely(rt_mutex_try_acquire(lock))) + return 0; + + return rt_mutex_slowlock(lock, NULL, state); } +#endif /* RT_MUTEX_BUILD_MUTEX */ -/** - * rt_mutex_proxy_unlock - release a lock on behalf of owner - * - * @lock: the rt_mutex to be locked - * - * No locking. Caller has to do serializing itself - * - * Special API call for PI-futex support. This merrily cleans up the rtmutex - * (debugging) state. Concurrent operations on this rt_mutex are not - * possible because it belongs to the pi_state which is about to be freed - * and it is not longer visible to other tasks. +#ifdef RT_MUTEX_BUILD_SPINLOCKS +/* + * Functions required for spin/rw_lock substitution on RT kernels */ -void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL); -} /** - * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock - * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. - * - * NOTE: does _NOT_ remove the @waiter on failure; must either call - * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for PI-futex support. + * rtlock_slowlock_locked - Slow path lock acquisition for RT locks + * @lock: The underlying RT mutex */ -int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) +static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) { - int ret; + struct rt_mutex_waiter waiter; + struct task_struct *owner; lockdep_assert_held(&lock->wait_lock); - if (try_to_take_rt_mutex(lock, task, NULL)) - return 1; + if (try_to_take_rt_mutex(lock, current, NULL)) + return; - /* We enforce deadlock detection for futexes */ - ret = task_blocks_on_rt_mutex(lock, waiter, task, - RT_MUTEX_FULL_CHAINWALK); + rt_mutex_init_rtlock_waiter(&waiter); - if (ret && !rt_mutex_owner(lock)) { - /* - * Reset the return value. We might have - * returned with -EDEADLK and the owner - * released the lock while we were walking the - * pi chain. Let the waiter sort it out. - */ - ret = 0; - } + /* Save current state and set state to TASK_RTLOCK_WAIT */ + current_save_and_set_rtlock_wait_state(); - debug_rt_mutex_print_deadlock(waiter); + trace_contention_begin(lock, LCB_F_RT); - return ret; -} + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK); -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock - * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. - * - * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter - * on failure. - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for PI-futex support. - */ -int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) -{ - int ret; - - raw_spin_lock_irq(&lock->wait_lock); - ret = __rt_mutex_start_proxy_lock(lock, waiter, task); - if (unlikely(ret)) - remove_waiter(lock, waiter); - raw_spin_unlock_irq(&lock->wait_lock); + for (;;) { + /* Try to acquire the lock again */ + if (try_to_take_rt_mutex(lock, current, &waiter)) + break; - return ret; -} + if (&waiter == rt_mutex_top_waiter(lock)) + owner = rt_mutex_owner(lock); + else + owner = NULL; + raw_spin_unlock_irq(&lock->wait_lock); -/** - * rt_mutex_next_owner - return the next owner of the lock - * - * @lock: the rt lock query - * - * Returns the next owner of the lock or NULL - * - * Caller has to serialize against other accessors to the lock - * itself. - * - * Special API call for PI-futex support - */ -struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - return NULL; + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + schedule_rtlock(); - return rt_mutex_top_waiter(lock)->task; -} + raw_spin_lock_irq(&lock->wait_lock); + set_current_state(TASK_RTLOCK_WAIT); + } -/** - * rt_mutex_wait_proxy_lock() - Wait for lock acquisition - * @lock: the rt_mutex we were woken on - * @to: the timeout, null if none. hrtimer should already have - * been started. - * @waiter: the pre-initialized rt_mutex_waiter - * - * Wait for the the lock acquisition started on our behalf by - * rt_mutex_start_proxy_lock(). Upon failure, the caller must call - * rt_mutex_cleanup_proxy_lock(). - * - * Returns: - * 0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT - * - * Special API call for PI-futex support - */ -int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter) -{ - int ret; + /* Restore the task state */ + current_restore_rtlock_saved_state(); - raw_spin_lock_irq(&lock->wait_lock); - /* sleep on the mutex */ - set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. + * try_to_take_rt_mutex() sets the waiter bit unconditionally. + * We might have to fix that up: */ - fixup_rt_mutex_waiters(lock); - raw_spin_unlock_irq(&lock->wait_lock); + fixup_rt_mutex_waiters(lock, true); + debug_rt_mutex_free_waiter(&waiter); - return ret; + trace_contention_end(lock, 0); } -/** - * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition - * @lock: the rt_mutex we were woken on - * @waiter: the pre-initialized rt_mutex_waiter - * - * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or - * rt_mutex_wait_proxy_lock(). - * - * Unless we acquired the lock; we're still enqueued on the wait-list and can - * in fact still be granted ownership until we're removed. Therefore we can - * find we are in fact the owner and must disregard the - * rt_mutex_wait_proxy_lock() failure. - * - * Returns: - * true - did the cleanup, we done. - * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, - * caller should disregards its return value. - * - * Special API call for PI-futex support - */ -bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) +static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) { - bool cleanup = false; - - raw_spin_lock_irq(&lock->wait_lock); - /* - * Do an unconditional try-lock, this deals with the lock stealing - * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() - * sets a NULL owner. - * - * We're not interested in the return value, because the subsequent - * test on rt_mutex_owner() will infer that. If the trylock succeeded, - * we will own the lock and it will have removed the waiter. If we - * failed the trylock, we're still not owner and we need to remove - * ourselves. - */ - try_to_take_rt_mutex(lock, current, waiter); - /* - * Unless we're the owner; we're still enqueued on the wait_list. - * So check if we became owner, if not, take us off the wait_list. - */ - if (rt_mutex_owner(lock) != current) { - remove_waiter(lock, waiter); - cleanup = true; - } - /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock_irq(&lock->wait_lock); + unsigned long flags; - return cleanup; + raw_spin_lock_irqsave(&lock->wait_lock, flags); + rtlock_slowlock_locked(lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); } + +#endif /* RT_MUTEX_BUILD_SPINLOCKS */ diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h deleted file mode 100644 index 732f96abf462..000000000000 --- a/kernel/locking/rtmutex.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * This file contains macros used solely by rtmutex.c. - * Non-debug version. - */ - -#define rt_mutex_deadlock_check(l) (0) -#define debug_rt_mutex_init_waiter(w) do { } while (0) -#define debug_rt_mutex_free_waiter(w) do { } while (0) -#define debug_rt_mutex_lock(l) do { } while (0) -#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) -#define debug_rt_mutex_proxy_unlock(l) do { } while (0) -#define debug_rt_mutex_unlock(l) do { } while (0) -#define debug_rt_mutex_init(m, n, k) do { } while (0) -#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) -#define debug_rt_mutex_print_deadlock(w) do { } while (0) -#define debug_rt_mutex_reset_waiter(w) do { } while (0) - -static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) -{ - WARN(1, "rtmutex deadlock detected\n"); -} - -static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, - enum rtmutex_chainwalk walk) -{ - return walk == RT_MUTEX_FULL_CHAINWALK; -} diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c new file mode 100644 index 000000000000..a6974d044593 --- /dev/null +++ b/kernel/locking/rtmutex_api.c @@ -0,0 +1,612 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * rtmutex API + */ +#include <linux/spinlock.h> +#include <linux/export.h> + +#define RT_MUTEX_BUILD_MUTEX +#include "rtmutex.c" + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. + */ +static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, + unsigned int state, + struct lockdep_map *nest_lock, + unsigned int subclass) +{ + int ret; + + might_sleep(); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_); + ret = __rt_mutex_lock(&lock->rtmutex, state); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; +} + +void rt_mutex_base_init(struct rt_mutex_base *rtb) +{ + __rt_mutex_base_init(rtb); +} +EXPORT_SYMBOL(rt_mutex_base_init); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/** + * rt_mutex_lock_nested - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * @subclass: the lockdep subclass + */ +void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); + +void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0); +} +EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); +#endif + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * This function can only be called in thread context. It's safe to call it + * from atomic regions, but not from hard or soft interrupt context. + * + * Returns: + * 1 on success + * 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + mutex_release(&lock->dep_map, _RET_IP_); + __rt_mutex_unlock(&lock->rtmutex); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/* + * Futex variants, must not use fastpath. + */ +int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock) +{ + return rt_mutex_slowtrylock(lock); +} + +int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + +/** + * __rt_mutex_futex_unlock - Futex variant, that since futex variants + * do not use the fast-path, can be simple and will not need to retry. + * + * @lock: The rt_mutex to be unlocked + * @wqh: The wake queue head from which to get the next lock waiter + */ +bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock, + struct rt_wake_q_head *wqh) +{ + lockdep_assert_held(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + return false; /* done */ + } + + /* + * We've already deboosted, mark_wakeup_next_waiter() will + * retain preempt_disabled when we drop the wait_lock, to + * avoid inversion prior to the wakeup. preempt_disable() + * therein pairs with rt_mutex_postunlock(). + */ + mark_wakeup_next_waiter(wqh, lock); + + return true; /* call postunlock() */ +} + +void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock) +{ + DEFINE_RT_WAKE_Q(wqh); + unsigned long flags; + bool postunlock; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + postunlock = __rt_mutex_futex_unlock(lock, &wqh); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (postunlock) + rt_mutex_postunlock(&wqh); +} + +/** + * __rt_mutex_init - initialize the rt_mutex + * + * @lock: The rt_mutex to be initialized + * @name: The lock name used for debugging + * @key: The lock class key used for debugging + * + * Initialize the rt_mutex to unlocked state. + * + * Initializing of a locked rt_mutex is not allowed + */ +void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + __rt_mutex_base_init(&lock->rtmutex); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); + +/** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a + * proxy owner + * + * @lock: the rt_mutex to be locked + * @proxy_owner:the task to set as owner + * + * No locking. Caller has to do serializing itself + * + * Special API call for PI-futex support. This initializes the rtmutex and + * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not + * possible at this point because the pi_state which contains the rtmutex + * is not yet visible to other tasks. + */ +void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, + struct task_struct *proxy_owner) +{ + static struct lock_class_key pi_futex_key; + + __rt_mutex_base_init(lock); + /* + * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping' + * and rtmutex based. That causes a lockdep false positive, because + * some of the futex functions invoke spin_unlock(&hb->lock) with + * the wait_lock of the rtmutex associated to the pi_futex held. + * spin_unlock() in turn takes wait_lock of the rtmutex on which + * the spinlock is based, which makes lockdep notice a lock + * recursion. Give the futex/rtmutex wait_lock a separate key. + */ + lockdep_set_class(&lock->wait_lock, &pi_futex_key); + rt_mutex_set_owner(lock, proxy_owner); +} + +/** + * rt_mutex_proxy_unlock - release a lock on behalf of owner + * + * @lock: the rt_mutex to be locked + * + * No locking. Caller has to do serializing itself + * + * Special API call for PI-futex support. This just cleans up the rtmutex + * (debugging) state. Concurrent operations on this rt_mutex are not + * possible because it belongs to the pi_state which is about to be freed + * and it is not longer visible to other tasks. + */ +void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) +{ + debug_rt_mutex_proxy_unlock(lock); + rt_mutex_clear_owner(lock); +} + +/** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: does _NOT_ remove the @waiter on failure; must either call + * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ +int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + + lockdep_assert_held(&lock->wait_lock); + + if (try_to_take_rt_mutex(lock, task, NULL)) + return 1; + + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL, + RT_MUTEX_FULL_CHAINWALK); + + if (ret && !rt_mutex_owner(lock)) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + + return ret; +} + +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter + * on failure. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ +int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + if (unlikely(ret)) + remove_waiter(lock, waiter); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * + * Wait for the lock acquisition started on our behalf by + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call + * rt_mutex_cleanup_proxy_lock(). + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT + * + * Special API call for PI-futex support + */ +int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); + ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter); + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock, true); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition + * @lock: the rt_mutex we were woken on + * @waiter: the pre-initialized rt_mutex_waiter + * + * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or + * rt_mutex_wait_proxy_lock(). + * + * Unless we acquired the lock; we're still enqueued on the wait-list and can + * in fact still be granted ownership until we're removed. Therefore we can + * find we are in fact the owner and must disregard the + * rt_mutex_wait_proxy_lock() failure. + * + * Returns: + * true - did the cleanup, we done. + * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, + * caller should disregards its return value. + * + * Special API call for PI-futex support + */ +bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) +{ + bool cleanup = false; + + raw_spin_lock_irq(&lock->wait_lock); + /* + * Do an unconditional try-lock, this deals with the lock stealing + * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() + * sets a NULL owner. + * + * We're not interested in the return value, because the subsequent + * test on rt_mutex_owner() will infer that. If the trylock succeeded, + * we will own the lock and it will have removed the waiter. If we + * failed the trylock, we're still not owner and we need to remove + * ourselves. + */ + try_to_take_rt_mutex(lock, current, waiter); + /* + * Unless we're the owner; we're still enqueued on the wait_list. + * So check if we became owner, if not, take us off the wait_list. + */ + if (rt_mutex_owner(lock) != current) { + remove_waiter(lock, waiter); + cleanup = true; + } + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock, false); + + raw_spin_unlock_irq(&lock->wait_lock); + + return cleanup; +} + +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void __sched rt_mutex_adjust_pi(struct task_struct *task) +{ + struct rt_mutex_waiter *waiter; + struct rt_mutex_base *next_lock; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + next_lock = waiter->lock; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + + rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, + next_lock, NULL, task); +} + +/* + * Performs the wakeup of the top-waiter and re-enables preemption. + */ +void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh) +{ + rt_mutex_wake_up_q(wqh); +} + +#ifdef CONFIG_DEBUG_RT_MUTEXES +void rt_mutex_debug_task_free(struct task_struct *task) +{ + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); +} +#endif + +#ifdef CONFIG_PREEMPT_RT +/* Mutexes */ +void __mutex_rt_init(struct mutex *mutex, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(__mutex_rt_init); + +static __always_inline int __mutex_lock_common(struct mutex *lock, + unsigned int state, + unsigned int subclass, + struct lockdep_map *nest_lock, + unsigned long ip) +{ + int ret; + + might_sleep(); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + ret = __rt_mutex_lock(&lock->rtmutex, state); + if (ret) + mutex_release(&lock->dep_map, ip); + else + lock_acquired(&lock->dep_map, ip); + return ret; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_nested); + +void __sched _mutex_lock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_); +} +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); + +int __sched mutex_lock_interruptible_nested(struct mutex *lock, + unsigned int subclass) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + +int __sched mutex_lock_killable_nested(struct mutex *lock, + unsigned int subclass) +{ + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); + +void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) +{ + int token; + + might_sleep(); + + token = io_schedule_prepare(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io_nested); + +#else /* CONFIG_DEBUG_LOCK_ALLOC */ + +void __sched mutex_lock(struct mutex *lock) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock); + +int __sched mutex_lock_interruptible(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock_interruptible); + +int __sched mutex_lock_killable(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock_killable); + +void __sched mutex_lock_io(struct mutex *lock) +{ + int token = io_schedule_prepare(); + + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + io_schedule_finish(token); +} +EXPORT_SYMBOL(mutex_lock_io); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + +int __sched mutex_trylock(struct mutex *lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(mutex_trylock); + +void __sched mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, _RET_IP_); + __rt_mutex_unlock(&lock->rtmutex); +} +EXPORT_SYMBOL(mutex_unlock); + +#endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index d1d62f942be2..1162e07cdaea 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -13,50 +13,127 @@ #ifndef __KERNEL_RTMUTEX_COMMON_H #define __KERNEL_RTMUTEX_COMMON_H +#include <linux/debug_locks.h> #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> + +/* + * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two + * separate trees and they need their own copy of the sort keys because of + * different locking requirements. + * + * @entry: rbtree node to enqueue into the waiters tree + * @prio: Priority of the waiter + * @deadline: Deadline of the waiter if applicable + * + * See rt_waiter_node_less() and waiter_*_prio(). + */ +struct rt_waiter_node { + struct rb_node entry; + int prio; + u64 deadline; +}; + /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * - * @tree_entry: pi node to enqueue into the mutex waiters tree - * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree + * @tree: node to enqueue into the mutex waiters tree + * @pi_tree: node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task + * @lock: Pointer to the rt_mutex on which the waiter blocks + * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT) + * @ww_ctx: WW context pointer + * + * @tree is ordered by @lock->wait_lock + * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock */ struct rt_mutex_waiter { - struct rb_node tree_entry; - struct rb_node pi_tree_entry; + struct rt_waiter_node tree; + struct rt_waiter_node pi_tree; struct task_struct *task; - struct rt_mutex *lock; -#ifdef CONFIG_DEBUG_RT_MUTEXES - unsigned long ip; - struct pid *deadlock_task_pid; - struct rt_mutex *deadlock_lock; -#endif - int prio; - u64 deadline; + struct rt_mutex_base *lock; + unsigned int wake_state; + struct ww_acquire_ctx *ww_ctx; +}; + +/** + * rt_wake_q_head - Wrapper around regular wake_q_head to support + * "sleeping" spinlocks on RT + * @head: The regular wake_q_head for sleeping lock variants + * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups + */ +struct rt_wake_q_head { + struct wake_q_head head; + struct task_struct *rtlock_task; }; +#define DEFINE_RT_WAKE_Q(name) \ + struct rt_wake_q_head name = { \ + .head = WAKE_Q_HEAD_INITIALIZER(name.head), \ + .rtlock_task = NULL, \ + } + /* - * Various helpers to access the waiters-tree: + * PI-futex support (proxy locking functions, etc.): */ +extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); +extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter); +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter); -#ifdef CONFIG_RT_MUTEXES +extern int rt_mutex_futex_trylock(struct rt_mutex_base *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l); + +extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock); +extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock, + struct rt_wake_q_head *wqh); -static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh); + +/* + * Must be guarded because this header is included from rcu/tree_plugin.h + * unconditionally. + */ +#ifdef CONFIG_RT_MUTEXES +static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } -static inline struct rt_mutex_waiter * -rt_mutex_top_waiter(struct rt_mutex *lock) +/* + * Lockless speculative check whether @waiter is still the top waiter on + * @lock. This is solely comparing pointers and not derefencing the + * leftmost entry which might be about to vanish. + */ +static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) +{ + struct rb_node *leftmost = rb_first_cached(&lock->waiters); + + return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter; +} + +static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; + lockdep_assert_held(&lock->wait_lock); + if (leftmost) { - w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); + w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry); BUG_ON(w->lock != lock); } return w; @@ -67,45 +144,17 @@ static inline int task_has_pi_waiters(struct task_struct *p) return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } -static inline struct rt_mutex_waiter * -task_top_pi_waiter(struct task_struct *p) -{ - return rb_entry(p->pi_waiters.rb_leftmost, - struct rt_mutex_waiter, pi_tree_entry); -} - -#else - -static inline int rt_mutex_has_waiters(struct rt_mutex *lock) -{ - return false; -} - -static inline struct rt_mutex_waiter * -rt_mutex_top_waiter(struct rt_mutex *lock) +static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) { - return NULL; -} + lockdep_assert_held(&p->pi_lock); -static inline int task_has_pi_waiters(struct task_struct *p) -{ - return false; + return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, + pi_tree.entry); } -static inline struct rt_mutex_waiter * -task_top_pi_waiter(struct task_struct *p) -{ - return NULL; -} - -#endif - -/* - * lock->owner state tracking: - */ #define RT_MUTEX_HAS_WAITERS 1UL -static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); @@ -127,40 +176,59 @@ enum rtmutex_chainwalk { RT_MUTEX_FULL_CHAINWALK, }; -/* - * PI-futex support (proxy locking functions, etc.): - */ -extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); -extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); -extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task); -extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task); -extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter); -extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter); +static inline void __rt_mutex_base_init(struct rt_mutex_base *lock) +{ + raw_spin_lock_init(&lock->wait_lock); + lock->waiters = RB_ROOT_CACHED; + lock->owner = NULL; +} -extern int rt_mutex_futex_trylock(struct rt_mutex *l); -extern int __rt_mutex_futex_trylock(struct rt_mutex *l); +/* Debug functions */ +static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); +} -extern void rt_mutex_futex_unlock(struct rt_mutex *lock); -extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh); +static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); +} -extern void rt_mutex_postunlock(struct wake_q_head *wake_q); +static inline void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + memset(waiter, 0x11, sizeof(*waiter)); +} -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif +static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + memset(waiter, 0x22, sizeof(*waiter)); +} + +static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree.entry); + RB_CLEAR_NODE(&waiter->tree.entry); + waiter->wake_state = TASK_NORMAL; + waiter->task = NULL; +} + +static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter) +{ + rt_mutex_init_waiter(waiter); + waiter->wake_state = TASK_RTLOCK_WAIT; +} + +#else /* CONFIG_RT_MUTEXES */ +/* Used in rcu/tree_plugin.h */ +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) +{ + return NULL; +} +#endif /* !CONFIG_RT_MUTEXES */ #endif diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c new file mode 100644 index 000000000000..34a59569db6b --- /dev/null +++ b/kernel/locking/rwbase_rt.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * RT-specific reader/writer semaphores and reader/writer locks + * + * down_write/write_lock() + * 1) Lock rtmutex + * 2) Remove the reader BIAS to force readers into the slow path + * 3) Wait until all readers have left the critical section + * 4) Mark it write locked + * + * up_write/write_unlock() + * 1) Remove the write locked marker + * 2) Set the reader BIAS, so readers can use the fast path again + * 3) Unlock rtmutex, to release blocked readers + * + * down_read/read_lock() + * 1) Try fast path acquisition (reader BIAS is set) + * 2) Take tmutex::wait_lock, which protects the writelocked flag + * 3) If !writelocked, acquire it for read + * 4) If writelocked, block on tmutex + * 5) unlock rtmutex, goto 1) + * + * up_read/read_unlock() + * 1) Try fast path release (reader count != 1) + * 2) Wake the writer waiting in down_write()/write_lock() #3 + * + * down_read/read_lock()#3 has the consequence, that rw semaphores and rw + * locks on RT are not writer fair, but writers, which should be avoided in + * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL + * inheritance mechanism. + * + * It's possible to make the rw primitives writer fair by keeping a list of + * active readers. A blocked writer would force all newly incoming readers + * to block on the rtmutex, but the rtmutex would have to be proxy locked + * for one reader after the other. We can't use multi-reader inheritance + * because there is no way to support that with SCHED_DEADLINE. + * Implementing the one by one reader boosting/handover mechanism is a + * major surgery for a very dubious value. + * + * The risk of writer starvation is there, but the pathological use cases + * which trigger it are not necessarily the typical RT workloads. + * + * Fast-path orderings: + * The lock/unlock of readers can run in fast paths: lock and unlock are only + * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE + * semantics of rwbase_rt. Atomic ops should thus provide _acquire() + * and _release() (or stronger). + * + * Common code shared between RT rw_semaphore and rwlock + */ + +static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) +{ + int r; + + /* + * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is + * set. + */ + for (r = atomic_read(&rwb->readers); r < 0;) { + if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1))) + return 1; + } + return 0; +} + +static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + int ret; + + rwbase_pre_schedule(); + raw_spin_lock_irq(&rtm->wait_lock); + + /* + * Call into the slow lock path with the rtmutex->wait_lock + * held, so this can't result in the following race: + * + * Reader1 Reader2 Writer + * down_read() + * down_write() + * rtmutex_lock(m) + * wait() + * down_read() + * unlock(m->wait_lock) + * up_read() + * wake(Writer) + * lock(m->wait_lock) + * sem->writelocked=true + * unlock(m->wait_lock) + * + * up_write() + * sem->writelocked=false + * rtmutex_unlock(m) + * down_read() + * down_write() + * rtmutex_lock(m) + * wait() + * rtmutex_lock(m) + * + * That would put Reader1 behind the writer waiting on + * Reader2 to call up_read(), which might be unbound. + */ + + trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ); + + /* + * For rwlocks this returns 0 unconditionally, so the below + * !ret conditionals are optimized out. + */ + ret = rwbase_rtmutex_slowlock_locked(rtm, state); + + /* + * On success the rtmutex is held, so there can't be a writer + * active. Increment the reader count and immediately drop the + * rtmutex again. + * + * rtmutex->wait_lock has to be unlocked in any case of course. + */ + if (!ret) + atomic_inc(&rwb->readers); + raw_spin_unlock_irq(&rtm->wait_lock); + if (!ret) + rwbase_rtmutex_unlock(rtm); + + trace_contention_end(rwb, ret); + rwbase_post_schedule(); + return ret; +} + +static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + lockdep_assert(!current->pi_blocked_on); + + if (rwbase_read_trylock(rwb)) + return 0; + + return __rwbase_read_lock(rwb, state); +} + +static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + struct task_struct *owner; + DEFINE_RT_WAKE_Q(wqh); + + raw_spin_lock_irq(&rtm->wait_lock); + /* + * Wake the writer, i.e. the rtmutex owner. It might release the + * rtmutex concurrently in the fast path (due to a signal), but to + * clean up rwb->readers it needs to acquire rtm->wait_lock. The + * worst case which can happen is a spurious wakeup. + */ + owner = rt_mutex_owner(rtm); + if (owner) + rt_mutex_wake_q_add_task(&wqh, owner, state); + + /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */ + preempt_disable(); + raw_spin_unlock_irq(&rtm->wait_lock); + rt_mutex_wake_up_q(&wqh); +} + +static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, + unsigned int state) +{ + /* + * rwb->readers can only hit 0 when a writer is waiting for the + * active readers to leave the critical section. + * + * dec_and_test() is fully ordered, provides RELEASE. + */ + if (unlikely(atomic_dec_and_test(&rwb->readers))) + __rwbase_read_unlock(rwb, state); +} + +static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, + unsigned long flags) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + + /* + * _release() is needed in case that reader is in fast path, pairing + * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock(). + */ + (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers); + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_rtmutex_unlock(rtm); +} + +static inline void rwbase_write_unlock(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + __rwbase_write_unlock(rwb, WRITER_BIAS, flags); +} + +static inline void rwbase_write_downgrade(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + /* Release it and account current as reader */ + __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags); +} + +static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb) +{ + /* Can do without CAS because we're serialized by wait_lock. */ + lockdep_assert_held(&rwb->rtmutex.wait_lock); + + /* + * _acquire is needed in case the reader is in the fast path, pairing + * with rwbase_read_unlock(), provides ACQUIRE. + */ + if (!atomic_read_acquire(&rwb->readers)) { + atomic_set(&rwb->readers, WRITER_BIAS); + return 1; + } + + return 0; +} + +static int __sched rwbase_write_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + /* Take the rtmutex as a first step */ + if (rwbase_rtmutex_lock_state(rtm, state)) + return -EINTR; + + /* Force readers into slow path */ + atomic_sub(READER_BIAS, &rwb->readers); + + rwbase_pre_schedule(); + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + if (__rwbase_write_trylock(rwb)) + goto out_unlock; + + rwbase_set_and_save_current_state(state); + trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE); + for (;;) { + /* Optimized out for rwlocks */ + if (rwbase_signal_pending_state(state, current)) { + rwbase_restore_current_state(); + __rwbase_write_unlock(rwb, 0, flags); + rwbase_post_schedule(); + trace_contention_end(rwb, -EINTR); + return -EINTR; + } + + if (__rwbase_write_trylock(rwb)) + break; + + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_schedule(); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + + set_current_state(state); + } + rwbase_restore_current_state(); + trace_contention_end(rwb, 0); + +out_unlock: + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_post_schedule(); + return 0; +} + +static inline int rwbase_write_trylock(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + if (!rwbase_rtmutex_trylock(rtm)) + return 0; + + atomic_sub(READER_BIAS, &rwb->readers); + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + if (__rwbase_write_trylock(rwb)) { + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + return 1; + } + __rwbase_write_unlock(rwb, 0, flags); + return 0; +} diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f11b9bd3431d..c6d17aee4209 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,23 +27,19 @@ #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <trace/events/lock.h> +#ifndef CONFIG_PREEMPT_RT #include "lock_events.h" /* - * The least significant 3 bits of the owner value has the following + * The least significant 2 bits of the owner value has the following * meanings when set. - * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers - * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock. - * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock. + * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint) + * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock * - * When the rwsem is either owned by an anonymous writer, or it is - * reader-owned, but a spinning writer has timed out, both nonspinnable - * bits will be set to disable optimistic spinning by readers and writers. - * In the later case, the last unlocking reader should then check the - * writer nonspinnable bit and clear it only to give writers preference - * to acquire the lock via optimistic spinning, but not readers. Similar - * action is also done in the reader slowpath. + * When the rwsem is reader-owned and a spinning writer has timed out, + * the nonspinnable bit will be set to disable optimistic spinning. * When a writer acquires a rwsem, it puts its task_struct pointer * into the owner field. It is cleared after an unlock. @@ -59,46 +55,13 @@ * is involved. Ideally we would like to track all the readers that own * a rwsem, but the overhead is simply too big. * - * Reader optimistic spinning is helpful when the reader critical section - * is short and there aren't that many readers around. It makes readers - * relatively more preferred than writers. When a writer times out spinning - * on a reader-owned lock and set the nospinnable bits, there are two main - * reasons for that. - * - * 1) The reader critical section is long, perhaps the task sleeps after - * acquiring the read lock. - * 2) There are just too many readers contending the lock causing it to - * take a while to service all of them. - * - * In the former case, long reader critical section will impede the progress - * of writers which is usually more important for system performance. In - * the later case, reader optimistic spinning tends to make the reader - * groups that contain readers that acquire the lock together smaller - * leading to more of them. That may hurt performance in some cases. In - * other words, the setting of nonspinnable bits indicates that reader - * optimistic spinning may not be helpful for those workloads that cause - * it. - * - * Therefore, any writers that had observed the setting of the writer - * nonspinnable bit for a given rwsem after they fail to acquire the lock - * via optimistic spinning will set the reader nonspinnable bit once they - * acquire the write lock. Similarly, readers that observe the setting - * of reader nonspinnable bit at slowpath entry will set the reader - * nonspinnable bits when they acquire the read lock via the wakeup path. - * - * Once the reader nonspinnable bit is on, it will only be reset when - * a writer is able to acquire the rwsem in the fast path or somehow a - * reader or writer in the slowpath doesn't observe the nonspinable bit. - * - * This is to discourage reader optmistic spinning on that particular - * rwsem and make writers more preferred. This adaptive disabling of reader - * optimistic spinning will alleviate the negative side effect of this - * feature. + * A fast path reader optimistic lock stealing is supported when the rwsem + * is previously owned by a writer and the following conditions are met: + * - rwsem is not currently writer owned + * - the handoff isn't set. */ #define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_RD_NONSPINNABLE (1UL << 1) -#define RWSEM_WR_NONSPINNABLE (1UL << 2) -#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE) +#define RWSEM_NONSPINNABLE (1UL << 1) #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) #ifdef CONFIG_DEBUG_RWSEMS @@ -143,9 +106,9 @@ * atomic_long_cmpxchg() will be used to obtain writer lock. * * There are three places where the lock handoff bit may be set or cleared. - * 1) rwsem_mark_wake() for readers. - * 2) rwsem_try_write_lock() for writers. - * 3) Error path of rwsem_down_write_slowpath(). + * 1) rwsem_mark_wake() for readers -- set, clear + * 2) rwsem_try_write_lock() for writers -- set, clear + * 3) rwsem_del_waiter() -- clear * * For all the above cases, wait_lock will be held. A writer must also * be the first one in the wait_list to be eligible for setting the handoff @@ -170,14 +133,19 @@ * the owner value concurrently without lock. Read from owner, however, * may not need READ_ONCE() as long as the pointer value is only used * for comparison and isn't being dereferenced. + * + * Both rwsem_{set,clear}_owner() functions should be in the same + * preempt disable section as the atomic op that changes sem->count. */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, (long)current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, 0); } @@ -203,7 +171,7 @@ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | - (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); + (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE); atomic_long_set(&sem->owner, val); } @@ -270,12 +238,31 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) owner | RWSEM_NONSPINNABLE)); } -static inline bool rwsem_read_trylock(struct rw_semaphore *sem) +static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) { - long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); - if (WARN_ON_ONCE(cnt < 0)) + *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + + if (WARN_ON_ONCE(*cntp < 0)) rwsem_set_nonspinnable(sem); - return !(cnt & RWSEM_READ_FAILED_MASK); + + if (!(*cntp & RWSEM_READ_FAILED_MASK)) { + rwsem_set_reader_owned(sem); + return true; + } + + return false; +} + +static inline bool rwsem_write_trylock(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + return true; + } + + return false; } /* @@ -353,7 +340,7 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; - unsigned long last_rowner; + bool handoff_set; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -364,12 +351,6 @@ enum rwsem_wake_type { RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ }; -enum writer_wait_state { - WRITER_NOT_FIRST, /* Writer is not first in wait list */ - WRITER_FIRST, /* Writer is first in wait list */ - WRITER_HANDOFF /* Writer is first & handoff needed */ -}; - /* * The typical HZ value is either 250 or 1000. So set the minimum waiting * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait @@ -385,6 +366,34 @@ enum writer_wait_state { */ #define MAX_READERS_WAKEUP 0x100 +static inline void +rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_add_tail(&waiter->list, &sem->wait_list); + /* caller will set RWSEM_FLAG_WAITERS */ +} + +/* + * Remove a waiter from the wait_list and clear flags. + * + * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of + * this function. Modify with care. + * + * Return: true if wait_list isn't empty and false otherwise + */ +static inline bool +rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_del(&waiter->list); + if (likely(!list_empty(&sem->wait_list))) + return true; + + atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); + return false; +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -396,6 +405,8 @@ enum writer_wait_state { * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false + * + * Implies rwsem_del_waiter() for all woken readers. */ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, @@ -451,10 +462,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * to give up the lock), request a HANDOFF to * force the issue. */ - if (!(oldcount & RWSEM_FLAG_HANDOFF) && - time_after(jiffies, waiter->timeout)) { - adjustment -= RWSEM_FLAG_HANDOFF; - lockevent_inc(rwsem_rlock_handoff); + if (time_after(jiffies, waiter->timeout)) { + if (!(oldcount & RWSEM_FLAG_HANDOFF)) { + adjustment -= RWSEM_FLAG_HANDOFF; + lockevent_inc(rwsem_rlock_handoff); + } + waiter->handoff_set = true; } atomic_long_add(-adjustment, &sem->count); @@ -467,10 +480,6 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * the reader is copied over. */ owner = waiter->task; - if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { - owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); - lockevent_inc(rwsem_opt_norspin); - } __rwsem_set_reader_owned(sem, owner); } @@ -508,24 +517,31 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, /* * Limit # of readers that can be woken up per wakeup call. */ - if (woken >= MAX_READERS_WAKEUP) + if (unlikely(woken >= MAX_READERS_WAKEUP)) break; } adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); + + oldcount = atomic_long_read(&sem->count); if (list_empty(&sem->wait_list)) { - /* hit end of list above */ + /* + * Combined with list_move_tail() above, this implies + * rwsem_del_waiter(). + */ adjustment -= RWSEM_FLAG_WAITERS; + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } else if (woken) { + /* + * When we've woken a reader, we no longer need to force + * writers to give up the lock and we can clear HANDOFF. + */ + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; } - /* - * When we've woken a reader, we no longer need to force writers - * to give up the lock and we can clear HANDOFF. - */ - if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) - adjustment -= RWSEM_FLAG_HANDOFF; - if (adjustment) atomic_long_add(adjustment, &sem->count); @@ -552,16 +568,43 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, } /* + * Remove a waiter and try to wake up other waiters in the wait queue + * This function is called from the out_nolock path of both the reader and + * writer slowpaths with wait_lock held. It releases the wait_lock and + * optionally wake up waiters before it returns. + */ +static inline void +rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, + struct wake_q_head *wake_q) + __releases(&sem->wait_lock) +{ + bool first = rwsem_first_waiter(sem) == waiter; + + wake_q_init(wake_q); + + /* + * If the wait_list isn't empty and the waiter to be deleted is + * the first waiter, we wake up the remaining waiters as they may + * be eligible to acquire or spin on the lock. + */ + if (rwsem_del_waiter(sem, waiter) && first) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + if (!wake_q_empty(wake_q)) + wake_up_q(wake_q); +} + +/* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. * - * If wstate is WRITER_HANDOFF, it will make sure that either the handoff - * bit is set or the lock is acquired with handoff bit cleared. + * Implies rwsem_del_waiter() on success. */ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, - enum writer_wait_state wstate) + struct rwsem_waiter *waiter) { + struct rwsem_waiter *first = rwsem_first_waiter(sem); long count, new; lockdep_assert_held(&sem->wait_lock); @@ -570,13 +613,26 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); - if (has_handoff && wstate == WRITER_NOT_FIRST) - return false; + if (has_handoff) { + /* + * Honor handoff bit and yield only when the first + * waiter is the one that set it. Otherwisee, we + * still try to acquire the rwsem. + */ + if (first->handoff_set && (waiter != first)) + return false; + } new = count; if (count & RWSEM_LOCK_MASK) { - if (has_handoff || (wstate != WRITER_HANDOFF)) + /* + * A waiter (first or not) can set the handoff bit + * if it is an RT task or wait in the wait queue + * for too long. + */ + if (has_handoff || (!rt_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) return false; new |= RWSEM_FLAG_HANDOFF; @@ -590,41 +646,44 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); /* - * We have either acquired the lock with handoff bit cleared or - * set the handoff bit. + * We have either acquired the lock with handoff bit cleared or set + * the handoff bit. Only the first waiter can have its handoff_set + * set here to enable optimistic spinning in slowpath loop. */ - if (new & RWSEM_FLAG_HANDOFF) + if (new & RWSEM_FLAG_HANDOFF) { + first->handoff_set = true; + lockevent_inc(rwsem_wlock_handoff); return false; + } + /* + * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on + * success. + */ + list_del(&waiter->list); rwsem_set_owner(sem); return true; } -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* - * Try to acquire read lock before the reader is put on wait queue. - * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff - * is ongoing. + * The rwsem_spin_on_owner() function returns the following 4 values + * depending on the lock owner state. + * OWNER_NULL : owner is currently NULL + * OWNER_WRITER: when owner changes and is a writer + * OWNER_READER: when owner changes and the new owner may be a reader. + * OWNER_NONSPINNABLE: + * when optimistic spinning has to stop because either the + * owner stops running, is unknown, or its timeslice has + * been used up. */ -static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) -{ - long count = atomic_long_read(&sem->count); - - if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) - return false; - - count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); - if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { - rwsem_set_reader_owned(sem); - lockevent_inc(rwsem_opt_rlock); - return true; - } - - /* Back out the change */ - atomic_long_add(-RWSEM_READER_BIAS, &sem->count); - return false; -} +enum owner_state { + OWNER_NULL = 1 << 0, + OWNER_WRITER = 1 << 1, + OWNER_READER = 1 << 2, + OWNER_NONSPINNABLE = 1 << 3, +}; +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * Try to acquire write lock before the writer has been put on wait queue. */ @@ -636,24 +695,14 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, count | RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); - lockevent_inc(rwsem_opt_wlock); + lockevent_inc(rwsem_opt_lock); return true; } } return false; } -static inline bool owner_on_cpu(struct task_struct *owner) -{ - /* - * As lock holder preemption issue, we both skip spinning if - * task is not on cpu or its cpu is preempted - */ - return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -} - -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, - unsigned long nonspinnable) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; unsigned long flags; @@ -664,45 +713,28 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, return false; } - preempt_disable(); - rcu_read_lock(); + /* + * Disable preemption is equal to the RCU read-side crital section, + * thus the task_strcut structure won't go away. + */ owner = rwsem_owner_flags(sem, &flags); /* * Don't check the read-owner as the entry may be stale. */ - if ((flags & nonspinnable) || + if ((flags & RWSEM_NONSPINNABLE) || (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; - rcu_read_unlock(); - preempt_enable(); lockevent_cond_inc(rwsem_opt_fail, !ret); return ret; } -/* - * The rwsem_spin_on_owner() function returns the folowing 4 values - * depending on the lock owner state. - * OWNER_NULL : owner is currently NULL - * OWNER_WRITER: when owner changes and is a writer - * OWNER_READER: when owner changes and the new owner may be a reader. - * OWNER_NONSPINNABLE: - * when optimistic spinning has to stop because either the - * owner stops running, is unknown, or its timeslice has - * been used up. - */ -enum owner_state { - OWNER_NULL = 1 << 0, - OWNER_WRITER = 1 << 1, - OWNER_READER = 1 << 2, - OWNER_NONSPINNABLE = 1 << 3, -}; #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) static inline enum owner_state -rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) +rwsem_owner_state(struct task_struct *owner, unsigned long flags) { - if (flags & nonspinnable) + if (flags & RWSEM_NONSPINNABLE) return OWNER_NONSPINNABLE; if (flags & RWSEM_READER_OWNED) @@ -712,18 +744,19 @@ rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long } static noinline enum owner_state -rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +rwsem_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; + lockdep_assert_preemption_disabled(); + owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags, nonspinnable); + state = rwsem_owner_state(owner, flags); if (state != OWNER_WRITER) return state; - rcu_read_lock(); for (;;) { /* * When a waiting writer set the handoff flag, it may spin @@ -733,7 +766,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) */ new = rwsem_owner_flags(sem, &new_flags); if ((new != owner) || (new_flags != flags)) { - state = rwsem_owner_state(new, new_flags, nonspinnable); + state = rwsem_owner_state(new, new_flags); break; } @@ -741,7 +774,9 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) * Ensure we emit the owner->on_cpu, dereference _after_ * checking sem->owner still matches owner, if that fails, * owner might point to free()d memory, if it still matches, - * the rcu_read_lock() ensures the memory stays valid. + * our spinning context already disabled preemption which is + * equal to RCU read-side crital section ensures the memory + * stays valid. */ barrier(); @@ -752,7 +787,6 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) cpu_relax(); } - rcu_read_unlock(); return state; } @@ -782,16 +816,12 @@ static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) return sched_clock() + delta; } -static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { bool taken = false; int prev_owner_state = OWNER_NULL; int loop = 0; u64 rspin_threshold = 0; - unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE - : RWSEM_RD_NONSPINNABLE; - - preempt_disable(); /* sem->wait_lock should not be held when doing optimistic spinning */ if (!osq_lock(&sem->osq)) @@ -806,15 +836,14 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) for (;;) { enum owner_state owner_state; - owner_state = rwsem_spin_on_owner(sem, nonspinnable); + owner_state = rwsem_spin_on_owner(sem); if (!(owner_state & OWNER_SPINNABLE)) break; /* * Try to acquire the lock */ - taken = wlock ? rwsem_try_write_lock_unqueued(sem) - : rwsem_try_read_lock_unqueued(sem); + taken = rwsem_try_write_lock_unqueued(sem); if (taken) break; @@ -822,7 +851,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) /* * Time-based reader-owned rwsem optimistic spinning */ - if (wlock && (owner_state == OWNER_READER)) { + if (owner_state == OWNER_READER) { /* * Re-initialize rspin_threshold every time when * the owner state changes from non-reader to reader. @@ -831,7 +860,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) * the beginning of the 2nd reader phase. */ if (prev_owner_state != OWNER_READER) { - if (rwsem_test_oflags(sem, nonspinnable)) + if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) break; rspin_threshold = rwsem_rspin_threshold(sem); loop = 0; @@ -871,7 +900,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) * we try to get it. The new owner may be a spinnable * writer. * - * To take advantage of two scenarios listed agove, the RT + * To take advantage of two scenarios listed above, the RT * task is made to retry one more time to see if it can * acquire the lock or continue spinning on the new owning * writer. Of course, if the time lag is long enough or the @@ -901,124 +930,97 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) } osq_unlock(&sem->osq); done: - preempt_enable(); lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } /* - * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should + * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should * only be called when the reader count reaches 0. - * - * This give writers better chance to acquire the rwsem first before - * readers when the rwsem was being held by readers for a relatively long - * period of time. Race can happen that an optimistic spinner may have - * just stolen the rwsem and set the owner, but just clearing the - * RWSEM_WR_NONSPINNABLE bit will do no harm anyway. */ -static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) +static inline void clear_nonspinnable(struct rw_semaphore *sem) { - if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) - atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); + if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))) + atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner); } -/* - * This function is called when the reader fails to acquire the lock via - * optimistic spinning. In this case we will still attempt to do a trylock - * when comparing the rwsem state right now with the state when entering - * the slowpath indicates that the reader is still in a valid reader phase. - * This happens when the following conditions are true: - * - * 1) The lock is currently reader owned, and - * 2) The lock is previously not reader-owned or the last read owner changes. - * - * In the former case, we have transitioned from a writer phase to a - * reader-phase while spinning. In the latter case, it means the reader - * phase hasn't ended when we entered the optimistic spinning loop. In - * both cases, the reader is eligible to acquire the lock. This is the - * secondary path where a read lock is acquired optimistically. - * - * The reader non-spinnable bit wasn't set at time of entry or it will - * not be here at all. - */ -static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, - unsigned long last_rowner) -{ - unsigned long owner = atomic_long_read(&sem->owner); - - if (!(owner & RWSEM_READER_OWNED)) - return false; - - if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && - rwsem_try_read_lock_unqueued(sem)) { - lockevent_inc(rwsem_opt_rlock2); - lockevent_add(rwsem_opt_fail, -1); - return true; - } - return false; -} #else -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, - unsigned long nonspinnable) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { return false; } -static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem) { return false; } -static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } +static inline void clear_nonspinnable(struct rw_semaphore *sem) { } -static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, - unsigned long last_rowner) +static inline enum owner_state +rwsem_spin_on_owner(struct rw_semaphore *sem) { - return false; + return OWNER_NONSPINNABLE; } +#endif + +/* + * Prepare to wake up waiter(s) in the wait queue by putting them into the + * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely + * reader-owned, wake up read lock waiters in queue front or wake up any + * front waiter otherwise. -static inline int -rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + * This is being called from both reader and writer slow paths. + */ +static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count, + struct wake_q_head *wake_q) { - return 0; + enum rwsem_wake_type wake_type; + + if (count & RWSEM_WRITER_MASK) + return; + + if (count & RWSEM_READER_MASK) { + wake_type = RWSEM_WAKE_READERS; + } else { + wake_type = RWSEM_WAKE_ANY; + clear_nonspinnable(sem); + } + rwsem_mark_wake(sem, wake_type, wake_q); } -#define OWNER_NULL 1 -#endif /* * Wait for the read lock to be granted */ static struct rw_semaphore __sched * -rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) +rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state) { - long count, adjustment = -RWSEM_READER_BIAS; + long adjustment = -RWSEM_READER_BIAS; + long rcnt = (count >> RWSEM_READER_SHIFT); struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); - bool wake = false; /* - * Save the current read-owner of rwsem, if available, and the - * reader nonspinnable bit. + * To prevent a constant stream of readers from starving a sleeping + * writer, don't attempt optimistic lock stealing if the lock is + * very likely owned by readers. */ - waiter.last_rowner = atomic_long_read(&sem->owner); - if (!(waiter.last_rowner & RWSEM_READER_OWNED)) - waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; - - if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) + if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) && + (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED)) goto queue; /* - * Undo read bias from down_read() and do optimistic spinning. + * Reader optimistic lock stealing. */ - atomic_long_add(-RWSEM_READER_BIAS, &sem->count); - adjustment = 0; - if (rwsem_optimistic_spin(sem, false)) { - /* rwsem_optimistic_spin() implies ACQUIRE on success */ + if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) { + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_steal); + /* - * Wake up other readers in the wait list if the front - * waiter is a reader. + * Wake up other readers in the wait queue if it is + * the first reader. */ - if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { + if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) { raw_spin_lock_irq(&sem->wait_lock); if (!list_empty(&sem->wait_list)) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, @@ -1027,26 +1029,23 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) wake_up_q(&wake_q); } return sem; - } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { - /* rwsem_reader_phase_trylock() implies ACQUIRE on success */ - return sem; } queue: waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { /* * In case the wait queue is empty and the lock isn't owned - * by a writer or has the handoff bit set, this reader can - * exit the slowpath and return immediately as its - * RWSEM_READER_BIAS has already been set in the count. + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_READER_BIAS has already been set + * in the count. */ - if (adjustment && !(atomic_long_read(&sem->count) & - (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { /* Provide lock ACQUIRE */ smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); @@ -1056,30 +1055,18 @@ queue: } adjustment += RWSEM_FLAG_WAITERS; } - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ - if (adjustment) - count = atomic_long_add_return(adjustment, &sem->count); - else - count = atomic_long_read(&sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (!(count & RWSEM_LOCK_MASK)) { - clear_wr_nonspinnable(sem); - wake = true; - } - if (wake || (!(count & RWSEM_WRITER_MASK) && - (adjustment & RWSEM_FLAG_WAITERS))) - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + count = atomic_long_add_return(adjustment, &sem->count); + rwsem_cond_wake_waiter(sem, count, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); + + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); + + trace_contention_begin(sem, LCB_F_READ); /* wait to be given the lock */ for (;;) { @@ -1096,100 +1083,54 @@ queue: /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ break; } - schedule(); + schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_reader); } __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); + trace_contention_end(sem, 0); return sem; out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) { - atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, - &sem->count); - } - raw_spin_unlock_irq(&sem->wait_lock); + rwsem_del_wake_waiter(sem, &waiter, &wake_q); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); + trace_contention_end(sem, -EINTR); return ERR_PTR(-EINTR); } /* - * This function is called by the a write lock owner. So the owner value - * won't get changed by others. - */ -static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, - bool disable) -{ - if (unlikely(disable)) { - atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); - lockevent_inc(rwsem_opt_norspin); - } -} - -/* * Wait until we successfully acquire the write lock */ -static struct rw_semaphore * +static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { - long count; - bool disable_rspin; - enum writer_wait_state wstate; struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ - if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && - rwsem_optimistic_spin(sem, true)) { + if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) { /* rwsem_optimistic_spin() implies ACQUIRE on success */ return sem; } /* - * Disable reader optimistic spinning for this rwsem after - * acquiring the write lock when the setting of the nonspinnable - * bits are observed. - */ - disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; - - /* * Optimistic spinning failed, proceed to the slowpath * and block until we can acquire the sem. */ waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; - - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock */ - if (wstate == WRITER_NOT_FIRST) { - count = atomic_long_read(&sem->count); - - /* - * If there were already threads queued before us and: - * 1) there are no no active locks, wake the front - * queued process(es) as the handoff bit might be set. - * 2) there are no active writers and some readers, the lock - * must be read owned; so we try to wake any read lock - * waiters that were queued ahead of us. - */ - if (count & RWSEM_WRITER_MASK) - goto wait; - - rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) - ? RWSEM_WAKE_READERS - : RWSEM_WAKE_ANY, &wake_q); - + if (rwsem_first_waiter(sem) != &waiter) { + rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), + &wake_q); if (!wake_q_empty(&wake_q)) { /* * We want to minimize wait_lock hold time especially @@ -1197,24 +1138,27 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) */ raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); - wake_q_init(&wake_q); /* Used again, reinit */ raw_spin_lock_irq(&sem->wait_lock); } } else { atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } -wait: /* wait until we successfully acquire the lock */ set_current_state(state); + trace_contention_begin(sem, LCB_F_WRITE); + for (;;) { - if (rwsem_try_write_lock(sem, wstate)) { + if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ break; } raw_spin_unlock_irq(&sem->wait_lock); + if (signal_pending_state(state, current)) + goto out_nolock; + /* * After setting the handoff bit and failing to acquire * the lock, attempt to spin on owner to accelerate lock @@ -1223,71 +1167,32 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if (wstate == WRITER_HANDOFF && - rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL) - goto trylock_again; - - /* Block until there are no active lockers. */ - for (;;) { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - /* - * If HANDOFF bit is set, unconditionally do - * a trylock. - */ - if (wstate == WRITER_HANDOFF) - break; - - if ((wstate == WRITER_NOT_FIRST) && - (rwsem_first_waiter(sem) == &waiter)) - wstate = WRITER_FIRST; - - count = atomic_long_read(&sem->count); - if (!(count & RWSEM_LOCK_MASK)) - break; + if (waiter.handoff_set) { + enum owner_state owner_state; - /* - * The setting of the handoff bit is deferred - * until rwsem_try_write_lock() is called. - */ - if ((wstate == WRITER_FIRST) && (rt_task(current) || - time_after(jiffies, waiter.timeout))) { - wstate = WRITER_HANDOFF; - lockevent_inc(rwsem_wlock_handoff); - break; - } + owner_state = rwsem_spin_on_owner(sem); + if (owner_state == OWNER_NULL) + goto trylock_again; } + + schedule_preempt_disabled(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); - list_del(&waiter.list); - rwsem_disable_reader_optspin(sem, disable_rspin); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); - - return ret; + trace_contention_end(sem, 0); + return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - - if (unlikely(wstate == WRITER_HANDOFF)) - atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); - - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - else - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); + rwsem_del_wake_waiter(sem, &waiter, &wake_q); lockevent_inc(rwsem_wlock_fail); - + trace_contention_end(sem, -EINTR); return ERR_PTR(-EINTR); } @@ -1295,7 +1200,7 @@ out_nolock: * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; DEFINE_WAKE_Q(wake_q); @@ -1335,89 +1240,96 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static __always_inline int __down_read_common(struct rw_semaphore *sem, int state) { - if (!rwsem_read_trylock(sem)) { - rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); + int ret = 0; + long count; + + preempt_disable(); + if (!rwsem_read_trylock(sem, &count)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) { + ret = -EINTR; + goto out; + } DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); } +out: + preempt_enable(); + return ret; } -static inline int __down_read_killable(struct rw_semaphore *sem) +static __always_inline void __down_read(struct rw_semaphore *sem) { - if (!rwsem_read_trylock(sem)) { - if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) - return -EINTR; - DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); - } - return 0; + __down_read_common(sem, TASK_UNINTERRUPTIBLE); +} + +static __always_inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_INTERRUPTIBLE); +} + +static __always_inline int __down_read_killable(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_KILLABLE); } static inline int __down_read_trylock(struct rw_semaphore *sem) { + int ret = 0; long tmp; DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); - /* - * Optimize for the case when the rwsem is not locked at all. - */ - tmp = RWSEM_UNLOCKED_VALUE; - do { + preempt_disable(); + tmp = atomic_long_read(&sem->count); + while (!(tmp & RWSEM_READ_FAILED_MASK)) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_READER_BIAS)) { + tmp + RWSEM_READER_BIAS)) { rwsem_set_reader_owned(sem); - return 1; + ret = 1; + break; } - } while (!(tmp & RWSEM_READ_FAILED_MASK)); - return 0; + } + preempt_enable(); + return ret; } /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline int __down_write_common(struct rw_semaphore *sem, int state) { - long tmp = RWSEM_UNLOCKED_VALUE; + int ret = 0; - if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED))) - rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); - else - rwsem_set_owner(sem); + preempt_disable(); + if (unlikely(!rwsem_write_trylock(sem))) { + if (IS_ERR(rwsem_down_write_slowpath(sem, state))) + ret = -EINTR; + } + preempt_enable(); + return ret; } -static inline int __down_write_killable(struct rw_semaphore *sem) +static inline void __down_write(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} - if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED))) { - if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) - return -EINTR; - } else { - rwsem_set_owner(sem); - } - return 0; +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); } static inline int __down_write_trylock(struct rw_semaphore *sem) { - long tmp; + int ret; + preempt_disable(); DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + ret = rwsem_write_trylock(sem); + preempt_enable(); - tmp = RWSEM_UNLOCKED_VALUE; - if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED)) { - rwsem_set_owner(sem); - return true; - } - return false; + return ret; } /* @@ -1430,14 +1342,16 @@ static inline void __up_read(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + preempt_disable(); rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) { - clear_wr_nonspinnable(sem); - rwsem_wake(sem, tmp); + clear_nonspinnable(sem); + rwsem_wake(sem); } + preempt_enable(); } /* @@ -1455,10 +1369,12 @@ static inline void __up_write(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + preempt_disable(); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) - rwsem_wake(sem, tmp); + rwsem_wake(sem); + preempt_enable(); } /* @@ -1476,13 +1392,131 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * write side. As such, rely on RELEASE semantics. */ DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); + preempt_disable(); tmp = atomic_long_fetch_add_release( -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); rwsem_set_reader_owned(sem); if (tmp & RWSEM_FLAG_WAITERS) rwsem_downgrade_wake(sem); + preempt_enable(); } +#else /* !CONFIG_PREEMPT_RT */ + +#define RT_MUTEX_BUILD_MUTEX +#include "rtmutex.c" + +#define rwbase_set_and_save_current_state(state) \ + set_current_state(state) + +#define rwbase_restore_current_state() \ + __set_current_state(TASK_RUNNING) + +#define rwbase_rtmutex_lock_state(rtm, state) \ + __rt_mutex_lock(rtm, state) + +#define rwbase_rtmutex_slowlock_locked(rtm, state) \ + __rt_mutex_slowlock_locked(rtm, NULL, state) + +#define rwbase_rtmutex_unlock(rtm) \ + __rt_mutex_unlock(rtm) + +#define rwbase_rtmutex_trylock(rtm) \ + __rt_mutex_trylock(rtm) + +#define rwbase_signal_pending_state(state, current) \ + signal_pending_state(state, current) + +#define rwbase_pre_schedule() \ + rt_mutex_pre_schedule() + +#define rwbase_schedule() \ + rt_mutex_schedule() + +#define rwbase_post_schedule() \ + rt_mutex_post_schedule() + +#include "rwbase_rt.c" + +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + init_rwbase_rt(&(sem)->rwbase); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); +#endif +} +EXPORT_SYMBOL(__init_rwsem); + +static inline void __down_read(struct rw_semaphore *sem) +{ + rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); +} + +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE); +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE); +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + return rwbase_read_trylock(&sem->rwbase); +} + +static inline void __up_read(struct rw_semaphore *sem) +{ + rwbase_read_unlock(&sem->rwbase, TASK_NORMAL); +} + +static inline void __sched __down_write(struct rw_semaphore *sem) +{ + rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); +} + +static inline int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE); +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + return rwbase_write_trylock(&sem->rwbase); +} + +static inline void __up_write(struct rw_semaphore *sem) +{ + rwbase_write_unlock(&sem->rwbase); +} + +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + rwbase_write_downgrade(&sem->rwbase); +} + +/* Debug stubs for the common API */ +#define DEBUG_RWSEMS_WARN_ON(c, sem) + +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ +} + +static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +{ + int count = atomic_read(&sem->rwbase.readers); + + return count < 0 && count != READER_BIAS; +} + +#endif /* CONFIG_PREEMPT_RT */ + /* * lock for reading */ @@ -1495,6 +1529,20 @@ void __sched down_read(struct rw_semaphore *sem) } EXPORT_SYMBOL(down_read); +int __sched down_read_interruptible(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_interruptible); + int __sched down_read_killable(struct rw_semaphore *sem) { might_sleep(); @@ -1605,6 +1653,20 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) } EXPORT_SYMBOL(down_read_nested); +int down_read_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_killable_nested); + void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); @@ -1617,6 +1679,12 @@ void down_read_non_owner(struct rw_semaphore *sem) { might_sleep(); __down_read(sem); + /* + * The owner value for a reader-owned lock is mostly for debugging + * purpose only and is not critical to the correct functioning of + * rwsem. So it is perfectly fine to set it in a preempt-enabled + * context here. + */ __rwsem_set_reader_owned(sem, NULL); } EXPORT_SYMBOL(down_read_non_owner); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/kernel/locking/rwsem.h +++ /dev/null diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index d9dd94defc0a..34bfae72f295 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -32,6 +32,7 @@ #include <linux/semaphore.h> #include <linux/spinlock.h> #include <linux/ftrace.h> +#include <trace/events/lock.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -50,10 +51,11 @@ static noinline void __up(struct semaphore *sem); * Use of this function is deprecated, please use down_interruptible() or * down_killable() instead. */ -void down(struct semaphore *sem) +void __sched down(struct semaphore *sem) { unsigned long flags; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -72,11 +74,12 @@ EXPORT_SYMBOL(down); * If the sleep is interrupted by a signal, this function will return -EINTR. * If the semaphore is successfully acquired, this function returns 0. */ -int down_interruptible(struct semaphore *sem) +int __sched down_interruptible(struct semaphore *sem) { unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -98,11 +101,12 @@ EXPORT_SYMBOL(down_interruptible); * -EINTR. If the semaphore is successfully acquired, this function returns * 0. */ -int down_killable(struct semaphore *sem) +int __sched down_killable(struct semaphore *sem) { unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -119,7 +123,7 @@ EXPORT_SYMBOL(down_killable); * @sem: the semaphore to be acquired * * Try to acquire the semaphore atomically. Returns 0 if the semaphore has - * been acquired successfully or 1 if it it cannot be acquired. + * been acquired successfully or 1 if it cannot be acquired. * * NOTE: This return value is inverted from both spin_trylock and * mutex_trylock! Be careful about this when converting code. @@ -127,7 +131,7 @@ EXPORT_SYMBOL(down_killable); * Unlike mutex_trylock, this function can be used from interrupt context, * and the semaphore can be released by any task or interrupt. */ -int down_trylock(struct semaphore *sem) +int __sched down_trylock(struct semaphore *sem) { unsigned long flags; int count; @@ -152,11 +156,12 @@ EXPORT_SYMBOL(down_trylock); * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. */ -int down_timeout(struct semaphore *sem, long timeout) +int __sched down_timeout(struct semaphore *sem, long timeout) { unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; @@ -175,7 +180,7 @@ EXPORT_SYMBOL(down_timeout); * Release the semaphore. Unlike mutexes, up() may be called from any * context and even by tasks which have never called down(). */ -void up(struct semaphore *sem) +void __sched up(struct semaphore *sem) { unsigned long flags; @@ -201,7 +206,7 @@ struct semaphore_waiter { * constant, and thus optimised away by the compiler. Likewise the * 'timeout' parameter for the cases without timeouts. */ -static inline int __sched __down_common(struct semaphore *sem, long state, +static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { struct semaphore_waiter waiter; @@ -232,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state, return -EINTR; } +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + int ret; + + trace_contention_begin(sem, 0); + ret = ___down_common(sem, state, timeout); + trace_contention_end(sem, ret); + + return ret; +} + static noinline void __sched __down(struct semaphore *sem) { __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 0ff08380f531..8475a0794f8c 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -58,10 +58,10 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state); /* * We build the __lock_function inlines here. They are too large for * inlining all over the place, but here is only one user per function - * which embedds them into the calling _lock_function below. + * which embeds them into the calling _lock_function below. * * This could be a long-held lock. We both prepare to spin for a long - * time (making _this_ CPU preemptable if possible), and we also signal + * time (making _this_ CPU preemptible if possible), and we also signal * towards that other CPU that it should break the lock ASAP. */ #define BUILD_LOCK_OPS(op, locktype) \ @@ -124,13 +124,16 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); + +#ifndef CONFIG_PREEMPT_RT BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); +#endif #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { return __raw_spin_trylock(lock); } @@ -138,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { return __raw_spin_trylock_bh(lock); } @@ -146,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh); #endif #ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { __raw_spin_lock(lock); } @@ -154,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) { return __raw_spin_lock_irqsave(lock); } @@ -162,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { __raw_spin_lock_irq(lock); } @@ -170,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { __raw_spin_lock_bh(lock); } @@ -178,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh); #endif #ifdef CONFIG_UNINLINE_SPIN_UNLOCK -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { __raw_spin_unlock(lock); } @@ -186,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { __raw_spin_unlock_irqrestore(lock, flags); } @@ -194,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { __raw_spin_unlock_irq(lock); } @@ -202,15 +205,17 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { __raw_spin_unlock_bh(lock); } EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif +#ifndef CONFIG_PREEMPT_RT + #ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _raw_read_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_read_trylock(rwlock_t *lock) { return __raw_read_trylock(lock); } @@ -218,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock); #endif #ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _raw_read_lock(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock(rwlock_t *lock) { __raw_read_lock(lock); } @@ -226,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) { return __raw_read_lock_irqsave(lock); } @@ -234,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock) { __raw_read_lock_irq(lock); } @@ -242,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq); #endif #ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock) { __raw_read_lock_bh(lock); } @@ -250,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh); #endif #ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _raw_read_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock(rwlock_t *lock) { __raw_read_unlock(lock); } @@ -258,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_read_unlock_irqrestore(lock, flags); } @@ -266,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) { __raw_read_unlock_irq(lock); } @@ -274,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) { __raw_read_unlock_bh(lock); } @@ -282,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh); #endif #ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _raw_write_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_write_trylock(rwlock_t *lock) { return __raw_write_trylock(lock); } @@ -290,15 +295,25 @@ EXPORT_SYMBOL(_raw_write_trylock); #endif #ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _raw_write_lock(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock(rwlock_t *lock) { __raw_write_lock(lock); } EXPORT_SYMBOL(_raw_write_lock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock))) +#endif + +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + __raw_write_lock_nested(lock, subclass); +} +EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) { return __raw_write_lock_irqsave(lock); } @@ -306,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock) { __raw_write_lock_irq(lock); } @@ -314,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock) { __raw_write_lock_bh(lock); } @@ -322,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _raw_write_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock(rwlock_t *lock) { __raw_write_unlock(lock); } @@ -330,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_write_unlock_irqrestore(lock, flags); } @@ -338,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) { __raw_write_unlock_irq(lock); } @@ -346,13 +361,15 @@ EXPORT_SYMBOL(_raw_write_unlock_irq); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) { __raw_write_unlock_bh(lock); } EXPORT_SYMBOL(_raw_write_unlock_bh); #endif +#endif /* !CONFIG_PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) @@ -371,8 +388,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, - do_raw_spin_lock_flags, &flags); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); return flags; } EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index b9d93087ee66..87b03d2e41db 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -12,6 +12,7 @@ #include <linux/debug_locks.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/pid.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner) @@ -31,6 +32,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, EXPORT_SYMBOL(__raw_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -48,6 +50,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, } EXPORT_SYMBOL(__rwlock_init); +#endif static void spin_dump(raw_spinlock_t *lock, const char *msg) { @@ -139,6 +142,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) arch_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT static void rwlock_bug(rwlock_t *lock, const char *msg) { if (!debug_locks_off()) @@ -228,3 +232,5 @@ void do_raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); arch_write_unlock(&lock->raw_lock); } + +#endif /* !CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c new file mode 100644 index 000000000000..38e292454fcc --- /dev/null +++ b/kernel/locking/spinlock_rt.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * PREEMPT_RT substitution for spin/rw_locks + * + * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to + * resemble the non RT semantics: + * + * - Contrary to plain rtmutexes, spinlocks and rwlocks are state + * preserving. The task state is saved before blocking on the underlying + * rtmutex, and restored when the lock has been acquired. Regular wakeups + * during that time are redirected to the saved state so no wake up is + * missed. + * + * - Non RT spin/rwlocks disable preemption and eventually interrupts. + * Disabling preemption has the side effect of disabling migration and + * preventing RCU grace periods. + * + * The RT substitutions explicitly disable migration and take + * rcu_read_lock() across the lock held section. + */ +#include <linux/spinlock.h> +#include <linux/export.h> + +#define RT_MUTEX_BUILD_SPINLOCKS +#include "rtmutex.c" + +/* + * __might_resched() skips the state check as rtlocks are state + * preserving. Take RCU nesting into account as spin/read/write_lock() can + * legitimately nest into an RCU read side critical section. + */ +#define RTLOCK_RESCHED_OFFSETS \ + (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT) + +#define rtlock_might_resched() \ + __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS) + +static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) +{ + lockdep_assert(!current->pi_blocked_on); + + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + rtlock_slowlock(rtm); +} + +static __always_inline void __rt_spin_lock(spinlock_t *lock) +{ + rtlock_might_resched(); + rtlock_lock(&lock->lock); + rcu_read_lock(); + migrate_disable(); +} + +void __sched rt_spin_lock(spinlock_t *lock) +{ + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +void __sched rt_spin_lock_nest_lock(spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_nest_lock); +#endif + +void __sched rt_spin_unlock(spinlock_t *lock) +{ + spin_release(&lock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + + if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL))) + rt_mutex_slowunlock(&lock->lock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __sched rt_spin_lock_unlock(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_unlock); + +static __always_inline int __rt_spin_trylock(spinlock_t *lock) +{ + int ret = 1; + + if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current))) + ret = rt_mutex_slowtrylock(&lock->lock); + + if (ret) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} + +int __sched rt_spin_trylock(spinlock_t *lock) +{ + return __rt_spin_trylock(lock); +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __sched rt_spin_trylock_bh(spinlock_t *lock) +{ + int ret; + + local_bh_disable(); + ret = __rt_spin_trylock(lock); + if (!ret) + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_bh); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rt_spin_lock_init(spinlock_t *lock, const char *name, + struct lock_class_key *key, bool percpu) +{ + u8 type = percpu ? LD_LOCK_PERCPU : LD_LOCK_NORMAL; + + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG, + LD_WAIT_INV, type); +} +EXPORT_SYMBOL(__rt_spin_lock_init); +#endif + +/* + * RT-specific reader/writer locks + */ +#define rwbase_set_and_save_current_state(state) \ + current_save_and_set_rtlock_wait_state() + +#define rwbase_restore_current_state() \ + current_restore_rtlock_saved_state() + +static __always_inline int +rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state) +{ + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + rtlock_slowlock(rtm); + return 0; +} + +static __always_inline int +rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state) +{ + rtlock_slowlock_locked(rtm); + return 0; +} + +static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm) +{ + if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL))) + return; + + rt_mutex_slowunlock(rtm); +} + +static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm) +{ + if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + return 1; + + return rt_mutex_slowtrylock(rtm); +} + +#define rwbase_signal_pending_state(state, current) (0) + +#define rwbase_pre_schedule() + +#define rwbase_schedule() \ + schedule_rtlock() + +#define rwbase_post_schedule() + +#include "rwbase_rt.c" +/* + * The common functions which get wrapped into the rwlock API. + */ +int __sched rt_read_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = rwbase_read_trylock(&rwlock->rwbase); + if (ret) { + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +int __sched rt_write_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = rwbase_write_trylock(&rwlock->rwbase); + if (ret) { + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +void __sched rt_read_lock(rwlock_t *rwlock) +{ + rtlock_might_resched(); + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_read_lock); + +void __sched rt_write_lock(rwlock_t *rwlock) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock_nested); +#endif + +void __sched rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT); +} +EXPORT_SYMBOL(rt_read_unlock); + +void __sched rt_write_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + rcu_read_unlock(); + migrate_enable(); + rwbase_write_unlock(&rwlock->rwbase); +} +EXPORT_SYMBOL(rt_write_unlock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rt_rwlock_init(rwlock_t *rwlock, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG); +} +EXPORT_SYMBOL(__rt_rwlock_init); +#endif diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 3e82f449b4ff..78719e1ef1b1 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -9,13 +9,22 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/module.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/slab.h> #include <linux/ww_mutex.h> static DEFINE_WD_CLASS(ww_class); struct workqueue_struct *wq; +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH +#define ww_acquire_init_noinject(a, b) do { \ + ww_acquire_init((a), (b)); \ + (a)->deadlock_inject_countdown = ~0U; \ + } while (0) +#else +#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b)) +#endif + struct test_mutex { struct work_struct work; struct ww_mutex mutex; @@ -36,7 +45,7 @@ static void test_mutex_work(struct work_struct *work) wait_for_completion(&mtx->go); if (mtx->flags & TEST_MTX_TRY) { - while (!ww_mutex_trylock(&mtx->mutex)) + while (!ww_mutex_trylock(&mtx->mutex, NULL)) cond_resched(); } else { ww_mutex_lock(&mtx->mutex, NULL); @@ -109,19 +118,39 @@ static int test_mutex(void) return 0; } -static int test_aa(void) +static int test_aa(bool trylock) { struct ww_mutex mutex; struct ww_acquire_ctx ctx; int ret; + const char *from = trylock ? "trylock" : "lock"; ww_mutex_init(&mutex, &ww_class); ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&mutex, &ctx); + if (!trylock) { + ret = ww_mutex_lock(&mutex, &ctx); + if (ret) { + pr_err("%s: initial lock failed!\n", __func__); + goto out; + } + } else { + ret = !ww_mutex_trylock(&mutex, &ctx); + if (ret) { + pr_err("%s: initial trylock failed!\n", __func__); + goto out; + } + } - if (ww_mutex_trylock(&mutex)) { - pr_err("%s: trylocked itself!\n", __func__); + if (ww_mutex_trylock(&mutex, NULL)) { + pr_err("%s: trylocked itself without context from %s!\n", __func__, from); + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + if (ww_mutex_trylock(&mutex, &ctx)) { + pr_err("%s: trylocked itself with context from %s!\n", __func__, from); ww_mutex_unlock(&mutex); ret = -EINVAL; goto out; @@ -129,17 +158,17 @@ static int test_aa(void) ret = ww_mutex_lock(&mutex, &ctx); if (ret != -EALREADY) { - pr_err("%s: missed deadlock for recursing, ret=%d\n", - __func__, ret); + pr_err("%s: missed deadlock for recursing, ret=%d from %s\n", + __func__, ret, from); if (!ret) ww_mutex_unlock(&mutex); ret = -EINVAL; goto out; } + ww_mutex_unlock(&mutex); ret = 0; out: - ww_mutex_unlock(&mutex); ww_acquire_fini(&ctx); return ret; } @@ -150,7 +179,7 @@ struct test_abba { struct ww_mutex b_mutex; struct completion a_ready; struct completion b_ready; - bool resolve; + bool resolve, trylock; int result; }; @@ -160,8 +189,13 @@ static void test_abba_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err; - ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&abba->b_mutex, &ctx); + ww_acquire_init_noinject(&ctx, &ww_class); + if (!abba->trylock) + ww_mutex_lock(&abba->b_mutex, &ctx); + else + WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx)); + + WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx); complete(&abba->b_ready); wait_for_completion(&abba->a_ready); @@ -181,7 +215,7 @@ static void test_abba_work(struct work_struct *work) abba->result = err; } -static int test_abba(bool resolve) +static int test_abba(bool trylock, bool resolve) { struct test_abba abba; struct ww_acquire_ctx ctx; @@ -192,12 +226,18 @@ static int test_abba(bool resolve) INIT_WORK_ONSTACK(&abba.work, test_abba_work); init_completion(&abba.a_ready); init_completion(&abba.b_ready); + abba.trylock = trylock; abba.resolve = resolve; schedule_work(&abba.work); - ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&abba.a_mutex, &ctx); + ww_acquire_init_noinject(&ctx, &ww_class); + if (!trylock) + ww_mutex_lock(&abba.a_mutex, &ctx); + else + WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx)); + + WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx); complete(&abba.a_ready); wait_for_completion(&abba.b_ready); @@ -249,7 +289,7 @@ static void test_cycle_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err, erra = 0; - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, &ww_class); ww_mutex_lock(&cycle->a_mutex, &ctx); complete(cycle->a_signal); @@ -346,6 +386,19 @@ struct stress { int nlocks; }; +struct rnd_state rng; +DEFINE_SPINLOCK(rng_lock); + +static inline u32 prandom_u32_below(u32 ceil) +{ + u32 ret; + + spin_lock(&rng_lock); + ret = prandom_u32_state(&rng) % ceil; + spin_unlock(&rng_lock); + return ret; +} + static int *get_random_order(int count) { int *order; @@ -359,7 +412,7 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = get_random_int() % (n + 1); + r = prandom_u32_below(n + 1); if (r != n) { tmp = order[n]; order[n] = order[r]; @@ -412,21 +465,21 @@ retry: ww_mutex_unlock(&locks[order[n]]); if (err == -EDEADLK) { - ww_mutex_lock_slow(&locks[order[contended]], &ctx); - goto retry; + if (!time_after(jiffies, stress->timeout)) { + ww_mutex_lock_slow(&locks[order[contended]], &ctx); + goto retry; + } } + ww_acquire_fini(&ctx); if (err) { pr_err_once("stress (%s) failed with %d\n", __func__, err); break; } - - ww_acquire_fini(&ctx); } while (!time_after(jiffies, stress->timeout)); kfree(order); - kfree(stress); } struct reorder_lock { @@ -491,14 +544,13 @@ out: list_for_each_entry_safe(ll, ln, &locks, link) kfree(ll); kfree(order); - kfree(stress); } static void stress_one_work(struct work_struct *work) { struct stress *stress = container_of(work, typeof(*stress), work); const int nlocks = stress->nlocks; - struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks); int err; do { @@ -512,8 +564,6 @@ static void stress_one_work(struct work_struct *work) break; } } while (!time_after(jiffies, stress->timeout)); - - kfree(stress); } #define STRESS_INORDER BIT(0) @@ -524,15 +574,24 @@ static void stress_one_work(struct work_struct *work) static int stress(int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; - int n; + struct stress *stress_array; + int n, count; locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); if (!locks) return -ENOMEM; + stress_array = kmalloc_array(nthreads, sizeof(*stress_array), + GFP_KERNEL); + if (!stress_array) { + kfree(locks); + return -ENOMEM; + } + for (n = 0; n < nlocks; n++) ww_mutex_init(&locks[n], &ww_class); + count = 0; for (n = 0; nthreads; n++) { struct stress *stress; void (*fn)(struct work_struct *work); @@ -556,9 +615,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) if (!fn) continue; - stress = kmalloc(sizeof(*stress), GFP_KERNEL); - if (!stress) - break; + stress = &stress_array[count++]; INIT_WORK(&stress->work, fn); stress->locks = locks; @@ -573,6 +630,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) for (n = 0; n < nlocks; n++) ww_mutex_destroy(&locks[n]); + kfree(stress_array); kfree(locks); return 0; @@ -581,7 +639,11 @@ static int stress(int nlocks, int nthreads, unsigned int flags) static int __init test_ww_mutex_init(void) { int ncpus = num_online_cpus(); - int ret; + int ret, i; + + printk(KERN_INFO "Beginning ww mutex selftests\n"); + + prandom_seed_state(&rng, get_random_u64()); wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); if (!wq) @@ -591,17 +653,19 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = test_aa(); + ret = test_aa(false); if (ret) return ret; - ret = test_abba(false); + ret = test_aa(true); if (ret) return ret; - ret = test_abba(true); - if (ret) - return ret; + for (i = 0; i < 4; i++) { + ret = test_abba(i & 1, i & 2); + if (ret) + return ret; + } ret = test_cycle(ncpus); if (ret) @@ -615,10 +679,11 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; + printk(KERN_INFO "All ww mutex selftests passed\n"); return 0; } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h new file mode 100644 index 000000000000..3ad2cc4823e5 --- /dev/null +++ b/kernel/locking/ww_mutex.h @@ -0,0 +1,569 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef WW_RT + +#define MUTEX mutex +#define MUTEX_WAITER mutex_waiter + +static inline struct mutex_waiter * +__ww_waiter_first(struct mutex *lock) +{ + struct mutex_waiter *w; + + w = list_first_entry(&lock->wait_list, struct mutex_waiter, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) +{ + w = list_next_entry(w, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) +{ + w = list_prev_entry(w, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_last(struct mutex *lock) +{ + struct mutex_waiter *w; + + w = list_last_entry(&lock->wait_list, struct mutex_waiter, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline void +__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) +{ + struct list_head *p = &lock->wait_list; + if (pos) + p = &pos->list; + __mutex_add_waiter(lock, waiter, p); +} + +static inline struct task_struct * +__ww_mutex_owner(struct mutex *lock) +{ + return __mutex_owner(lock); +} + +static inline bool +__ww_mutex_has_waiters(struct mutex *lock) +{ + return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS; +} + +static inline void lock_wait_lock(struct mutex *lock) +{ + raw_spin_lock(&lock->wait_lock); +} + +static inline void unlock_wait_lock(struct mutex *lock) +{ + raw_spin_unlock(&lock->wait_lock); +} + +static inline void lockdep_assert_wait_lock_held(struct mutex *lock) +{ + lockdep_assert_held(&lock->wait_lock); +} + +#else /* WW_RT */ + +#define MUTEX rt_mutex +#define MUTEX_WAITER rt_mutex_waiter + +static inline struct rt_mutex_waiter * +__ww_waiter_first(struct rt_mutex *lock) +{ + struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree.entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w) +{ + struct rb_node *n = rb_next(&w->tree.entry); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree.entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) +{ + struct rb_node *n = rb_prev(&w->tree.entry); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree.entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_last(struct rt_mutex *lock) +{ + struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree.entry); +} + +static inline void +__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos) +{ + /* RT unconditionally adds the waiter first and then removes it on error */ +} + +static inline struct task_struct * +__ww_mutex_owner(struct rt_mutex *lock) +{ + return rt_mutex_owner(&lock->rtmutex); +} + +static inline bool +__ww_mutex_has_waiters(struct rt_mutex *lock) +{ + return rt_mutex_has_waiters(&lock->rtmutex); +} + +static inline void lock_wait_lock(struct rt_mutex *lock) +{ + raw_spin_lock(&lock->rtmutex.wait_lock); +} + +static inline void unlock_wait_lock(struct rt_mutex *lock) +{ + raw_spin_unlock(&lock->rtmutex.wait_lock); +} + +static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) +{ + lockdep_assert_held(&lock->rtmutex.wait_lock); +} + +#endif /* WW_RT */ + +/* + * Wait-Die: + * The newer transactions are killed when: + * It (the new transaction) makes a request for a lock being held + * by an older transaction. + * + * Wound-Wait: + * The newer transactions are wounded when: + * An older transaction makes a request for a lock being held by + * the newer transaction. + */ + +/* + * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired + * it. + */ +static __always_inline void +ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) +{ +#ifdef DEBUG_WW_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; + ww->ctx = ww_ctx; +} + +/* + * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task + * or, when of equal priority, a younger transaction than @b. + * + * Depending on the algorithm, @a will either need to wait for @b, or die. + */ +static inline bool +__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) +{ +/* + * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI, + * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and + * isn't affected by this. + */ +#ifdef WW_RT + /* kernel prio; less is more */ + int a_prio = a->task->prio; + int b_prio = b->task->prio; + + if (rt_prio(a_prio) || rt_prio(b_prio)) { + + if (a_prio > b_prio) + return true; + + if (a_prio < b_prio) + return false; + + /* equal static prio */ + + if (dl_prio(a_prio)) { + if (dl_time_before(b->task->dl.deadline, + a->task->dl.deadline)) + return true; + + if (dl_time_before(a->task->dl.deadline, + b->task->dl.deadline)) + return false; + } + + /* equal prio */ + } +#endif + + /* FIFO order tie break -- bigger is younger */ + return (signed long)(a->stamp - b->stamp) > 0; +} + +/* + * Wait-Die; wake a lesser waiter context (when locks held) such that it can + * die. + * + * Among waiters with context, only the first one can have other locks acquired + * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and + * __ww_mutex_check_kill() wake any but the earliest context. + */ +static bool +__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + if (!ww_ctx->is_wait_die) + return false; + + if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) { +#ifndef WW_RT + debug_mutex_wake_waiter(lock, waiter); +#endif + wake_up_process(waiter->task); + } + + return true; +} + +/* + * Wound-Wait; wound a lesser @hold_ctx if it holds the lock. + * + * Wound the lock holder if there are waiters with more important transactions + * than the lock holders. Even if multiple waiters may wound the lock holder, + * it's sufficient that only one does. + */ +static bool __ww_mutex_wound(struct MUTEX *lock, + struct ww_acquire_ctx *ww_ctx, + struct ww_acquire_ctx *hold_ctx) +{ + struct task_struct *owner = __ww_mutex_owner(lock); + + lockdep_assert_wait_lock_held(lock); + + /* + * Possible through __ww_mutex_add_waiter() when we race with + * ww_mutex_set_context_fastpath(). In that case we'll get here again + * through __ww_mutex_check_waiters(). + */ + if (!hold_ctx) + return false; + + /* + * Can have !owner because of __mutex_unlock_slowpath(), but if owner, + * it cannot go away because we'll have FLAG_WAITERS set and hold + * wait_lock. + */ + if (!owner) + return false; + + if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) { + hold_ctx->wounded = 1; + + /* + * wake_up_process() paired with set_current_state() + * inserts sufficient barriers to make sure @owner either sees + * it's wounded in __ww_mutex_check_kill() or has a + * wakeup pending to re-read the wounded state. + */ + if (owner != current) + wake_up_process(owner); + + return true; + } + + return false; +} + +/* + * We just acquired @lock under @ww_ctx, if there are more important contexts + * waiting behind us on the wait-list, check if they need to die, or wound us. + * + * See __ww_mutex_add_waiter() for the list-order construction; basically the + * list is ordered by stamp, smallest (oldest) first. + * + * This relies on never mixing wait-die/wound-wait on the same wait-list; + * which is currently ensured by that being a ww_class property. + * + * The current task must not be on the wait list. + */ +static void +__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct MUTEX_WAITER *cur; + + lockdep_assert_wait_lock_held(lock); + + for (cur = __ww_waiter_first(lock); cur; + cur = __ww_waiter_next(lock, cur)) { + + if (!cur->ww_ctx) + continue; + + if (__ww_mutex_die(lock, cur, ww_ctx) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) + break; + } +} + +/* + * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx + * and wake up any waiters so they can recheck. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + ww_mutex_lock_acquired(lock, ctx); + + /* + * The lock->ctx update should be visible on all cores before + * the WAITERS check is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* See comments above and below. */ + + /* + * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS + * MB MB + * [R] MUTEX_FLAG_WAITERS [R] ww->ctx + * + * The memory barrier above pairs with the memory barrier in + * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx + * and/or !empty list. + */ + if (likely(!__ww_mutex_has_waiters(&lock->base))) + return; + + /* + * Uh oh, we raced in fastpath, check if any of the waiters need to + * die or wound us. + */ + lock_wait_lock(&lock->base); + __ww_mutex_check_waiters(&lock->base, ctx); + unlock_wait_lock(&lock->base); +} + +static __always_inline int +__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) +{ + if (ww_ctx->acquired > 0) { +#ifdef DEBUG_WW_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + +/* + * Check the wound condition for the current lock acquire. + * + * Wound-Wait: If we're wounded, kill ourself. + * + * Wait-Die: If we're trying to acquire a lock already held by an older + * context, kill ourselves. + * + * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to + * look at waiters before us in the wait-list. + */ +static inline int +__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, + struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); + struct MUTEX_WAITER *cur; + + if (ctx->acquired == 0) + return 0; + + if (!ctx->is_wait_die) { + if (ctx->wounded) + return __ww_mutex_kill(lock, ctx); + + return 0; + } + + if (hold_ctx && __ww_ctx_less(ctx, hold_ctx)) + return __ww_mutex_kill(lock, ctx); + + /* + * If there is a waiter in front of us that has a context, then its + * stamp is earlier than ours and we must kill ourself. + */ + for (cur = __ww_waiter_prev(lock, waiter); cur; + cur = __ww_waiter_prev(lock, cur)) { + + if (!cur->ww_ctx) + continue; + + return __ww_mutex_kill(lock, ctx); + } + + return 0; +} + +/* + * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest + * first. Such that older contexts are preferred to acquire the lock over + * younger contexts. + * + * Waiters without context are interspersed in FIFO order. + * + * Furthermore, for Wait-Die kill ourself immediately when possible (there are + * older contexts already waiting) to avoid unnecessary waiting and for + * Wound-Wait ensure we wound the owning context when it is younger. + */ +static inline int +__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, + struct MUTEX *lock, + struct ww_acquire_ctx *ww_ctx) +{ + struct MUTEX_WAITER *cur, *pos = NULL; + bool is_wait_die; + + if (!ww_ctx) { + __ww_waiter_add(lock, waiter, NULL); + return 0; + } + + is_wait_die = ww_ctx->is_wait_die; + + /* + * Add the waiter before the first waiter with a higher stamp. + * Waiters without a context are skipped to avoid starving + * them. Wait-Die waiters may die here. Wound-Wait waiters + * never die here, but they are sorted in stamp order and + * may wound the lock holder. + */ + for (cur = __ww_waiter_last(lock); cur; + cur = __ww_waiter_prev(lock, cur)) { + + if (!cur->ww_ctx) + continue; + + if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) { + /* + * Wait-Die: if we find an older context waiting, there + * is no point in queueing behind it, as we'd have to + * die the moment it would acquire the lock. + */ + if (is_wait_die) { + int ret = __ww_mutex_kill(lock, ww_ctx); + + if (ret) + return ret; + } + + break; + } + + pos = cur; + + /* Wait-Die: ensure younger waiters die. */ + __ww_mutex_die(lock, cur, ww_ctx); + } + + __ww_waiter_add(lock, waiter, pos); + + /* + * Wound-Wait: if we're blocking on a mutex owned by a younger context, + * wound that such that we might proceed. + */ + if (!is_wait_die) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + + /* + * See ww_mutex_set_context_fastpath(). Orders setting + * MUTEX_FLAG_WAITERS vs the ww->ctx load, + * such that either we or the fastpath will wound @ww->ctx. + */ + smp_mb(); + __ww_mutex_wound(lock, ww_ctx, ww->ctx); + } + + return 0; +} + +static inline void __ww_mutex_unlock(struct ww_mutex *lock) +{ + if (lock->ctx) { +#ifdef DEBUG_WW_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } +} diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c new file mode 100644 index 000000000000..c7196de838ed --- /dev/null +++ b/kernel/locking/ww_rt_mutex.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * rtmutex API + */ +#include <linux/spinlock.h> +#include <linux/export.h> + +#define RT_MUTEX_BUILD_MUTEX +#define WW_RT +#include "rtmutex.c" + +int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct rt_mutex *rtm = &lock->base; + + if (!ww_ctx) + return rt_mutex_trylock(rtm); + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + + if (__rt_mutex_trylock(&rtm->rtmutex)) { + ww_mutex_set_context_fastpath(lock, ww_ctx); + mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); + return 1; + } + + return 0; +} +EXPORT_SYMBOL(ww_mutex_trylock); + +static int __sched +__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, + unsigned int state, unsigned long ip) +{ + struct lockdep_map __maybe_unused *nest_lock = NULL; + struct rt_mutex *rtm = &lock->base; + int ret; + + might_sleep(); + + if (ww_ctx) { + if (unlikely(ww_ctx == READ_ONCE(lock->ctx))) + return -EALREADY; + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + nest_lock = &ww_ctx->dep_map; +#endif + } + mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip); + + if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) { + if (ww_ctx) + ww_mutex_set_context_fastpath(lock, ww_ctx); + return 0; + } + + ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state); + + if (ret) + mutex_release(&rtm->dep_map, ip); + return ret; +} + +int __sched +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_); +} +EXPORT_SYMBOL(ww_mutex_lock); + +int __sched +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_); +} +EXPORT_SYMBOL(ww_mutex_lock_interruptible); + +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + struct rt_mutex *rtm = &lock->base; + + __ww_mutex_unlock(lock); + + mutex_release(&rtm->dep_map, _RET_IP_); + __rt_mutex_unlock(&rtm->rtmutex); +} +EXPORT_SYMBOL(ww_mutex_unlock); |