diff options
Diffstat (limited to 'kernel/locking')
40 files changed, 10871 insertions, 5352 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 760158d9d98d..a114949eeed5 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,16 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 # Any varying coverage in these files is non-deterministic # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. +KASAN_SANITIZE_lockdep.o := n +KCSAN_SANITIZE_lockdep.o := n + ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) endif +obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) @@ -20,12 +25,11 @@ obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o -obj-$(CONFIG_RT_MUTEXES) += rtmutex.o -obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o +obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c new file mode 100644 index 000000000000..810b50344d35 --- /dev/null +++ b/kernel/locking/irqflag-debug.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/bug.h> +#include <linux/export.h> +#include <linux/irqflags.h> + +noinstr void warn_bogus_irq_restore(void) +{ + instrumentation_begin(); + WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n"); + instrumentation_end(); +} +EXPORT_SYMBOL(warn_bogus_irq_restore); diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..e68d82099558 --- /dev/null +++ b/kernel/locking/lock_events.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <waiman.long@hpe.com> + */ + +/* + * Collect locking event counts + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/fs.h> + +#include "lock_events.h" + +#undef LOCK_EVENT +#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, + +#define LOCK_EVENTS_DIR "lock_event_counts" + +/* + * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different + * types of locks will be reported under the <debugfs>/lock_event_counts/ + * directory. See lock_events_list.h for the list of available locking + * events. + * + * Writing to the special ".reset_counts" file will reset all the above + * locking event counts. This is a very slow operation and so should not + * be done frequently. + * + * These event counts are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counts usable even in a production + * environment. + */ +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + + [LOCKEVENT_reset_cnts] = ".reset_counts", +}; + +/* + * Per-cpu counts + */ +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The lockevent_read() function can be overridden. + */ +ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, id, len; + u64 sum = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + id = (long)file_inode(file)->i_private; + + if (id >= lockevent_num) + return -EBADF; + + for_each_possible_cpu(cpu) + sum += per_cpu(lockevents[id], cpu); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When idx = reset_cnts, reset all the counts. + */ +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(lockevents, cpu); + + for (i = 0 ; i < lockevent_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_lockevent = { + .read = lockevent_read, + .write = lockevent_write, + .llseek = default_llseek, +}; + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#include <asm/paravirt.h> + +static bool __init skip_lockevent(const char *name) +{ + static int pv_on __initdata = -1; + + if (pv_on < 0) + pv_on = !pv_is_native_spin_unlock(); + /* + * Skip PV qspinlock events on bare metal. + */ + if (!pv_on && !memcmp(name, "pv_", 3)) + return true; + return false; +} +#else +static inline bool skip_lockevent(const char *name) +{ + return false; +} +#endif + +/* + * Initialize debugfs for the locking event counts. + */ +static int __init init_lockevent_counts(void) +{ + struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); + int i; + + if (IS_ERR(d_counts)) + goto out; + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. + */ + for (i = 0; i < lockevent_num; i++) { + if (skip_lockevent(lockevent_names[i])) + continue; + if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent))) + goto fail_undo; + } + + if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + d_counts, (void *)(long)LOCKEVENT_reset_cnts, + &fops_lockevent))) + goto fail_undo; + + return 0; +fail_undo: + debugfs_remove_recursive(d_counts); +out: + pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); + return -ENOMEM; +} +fs_initcall(init_lockevent_counts); diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h new file mode 100644 index 000000000000..d2345e9c0190 --- /dev/null +++ b/kernel/locking/lock_events.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef __LOCKING_LOCK_EVENTS_H +#define __LOCKING_LOCK_EVENTS_H + +enum lock_events { + +#include "lock_events_list.h" + + lockevent_num, /* Total number of lock event counts */ + LOCKEVENT_reset_cnts = lockevent_num, +}; + +#ifdef CONFIG_LOCK_EVENT_COUNTS +/* + * Per-cpu counters + */ +DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * Increment the statistical counters. use raw_cpu_inc() because of lower + * overhead and we don't care if we loose the occasional update. + */ +static inline void __lockevent_inc(enum lock_events event, bool cond) +{ + if (cond) + raw_cpu_inc(lockevents[event]); +} + +#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) +#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c) + +static inline void __lockevent_add(enum lock_events event, int inc) +{ + raw_cpu_add(lockevents[event], inc); +} + +#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) + +#else /* CONFIG_LOCK_EVENT_COUNTS */ + +#define lockevent_inc(ev) +#define lockevent_add(ev, c) do { (void)(c); } while (0) +#define lockevent_cond_inc(ev, c) do { (void)(c); } while (0) + +#endif /* CONFIG_LOCK_EVENT_COUNTS */ + +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos); + +#endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h new file mode 100644 index 000000000000..4e36258cc34f --- /dev/null +++ b/kernel/locking/lock_events_list.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef LOCK_EVENT +#define LOCK_EVENT(name) LOCKEVENT_ ## name, +#endif + +#ifdef CONFIG_QUEUED_SPINLOCKS +#ifdef CONFIG_PARAVIRT_SPINLOCKS +/* + * Locking events for PV qspinlock. + */ +LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */ +LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */ +LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */ +LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */ +LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */ +LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */ +LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */ +LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */ +LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */ +LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */ +LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Locking events for qspinlock + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1. + */ +LOCK_EVENT(lock_pending) /* # of locking ops via pending code */ +LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */ +LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */ +LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */ +LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */ +LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +/* + * Locking events for Resilient Queued Spin Lock + */ +LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */ + +/* + * Locking events for rwsem + */ +LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ +LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ +LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ +LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ +LOCK_EVENT(rwsem_opt_lock) /* # of opt-acquired write locks */ +LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */ +LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ +LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ +LOCK_EVENT(rwsem_rlock_steal) /* # of read locks by lock stealing */ +LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ +LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ +LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ +LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ +LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ +LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ + +/* + * Locking events for rtlock_slowlock() + */ +LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */ +LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */ +LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */ + +/* + * Locking events for rt_mutex_slowlock() + */ +LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */ +LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */ +LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */ +LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */ +LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */ +LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */ + +/* + * Locking events for lockdep + */ +LOCK_EVENT(lockdep_acquire) +LOCK_EVENT(lockdep_lock) +LOCK_EVENT(lockdep_nocheck) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7d2499bec5fe..2d4c5bab5af8 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/lockdep.c * @@ -45,33 +46,87 @@ #include <linux/hash.h> #include <linux/ftrace.h> #include <linux/stringify.h> +#include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/gfp.h> -#include <linux/kmemcheck.h> #include <linux/random.h> #include <linux/jhash.h> +#include <linux/nmi.h> +#include <linux/rcupdate.h> +#include <linux/kprobes.h> +#include <linux/lockdep.h> +#include <linux/context_tracking.h> +#include <linux/console.h> +#include <linux/kasan.h> #include <asm/sections.h> #include "lockdep_internals.h" +#include "lock_events.h" -#define CREATE_TRACE_POINTS #include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING -int prove_locking = 1; +static int prove_locking = 1; module_param(prove_locking, int, 0644); #else #define prove_locking 0 #endif #ifdef CONFIG_LOCK_STAT -int lock_stat = 1; +static int lock_stat = 1; module_param(lock_stat, int, 0644); #else #define lock_stat 0 #endif +#ifdef CONFIG_SYSCTL +static const struct ctl_table kern_lockdep_table[] = { +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_LOCK_STAT */ +}; + +static __init int kernel_lockdep_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_lockdep_table); + return 0; +} +late_initcall(kernel_lockdep_sysctls_init); +#endif /* CONFIG_SYSCTL */ + +DEFINE_PER_CPU(unsigned int, lockdep_recursion); +EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); + +static __always_inline bool lockdep_enabled(void) +{ + if (!debug_locks) + return false; + + if (this_cpu_read(lockdep_recursion)) + return false; + + if (current->lockdep_recursion) + return false; + + return true; +} + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -80,11 +135,44 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t __lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *__owner; + +static inline void lockdep_lock(void) +{ + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + __this_cpu_inc(lockdep_recursion); + arch_spin_lock(&__lock); + __owner = current; +} + +static inline void lockdep_unlock(void) +{ + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) + return; + + __owner = NULL; + arch_spin_unlock(&__lock); + __this_cpu_dec(lockdep_recursion); +} + +#ifdef CONFIG_PROVE_LOCKING +static inline bool lockdep_assert_locked(void) +{ + return DEBUG_LOCKS_WARN_ON(__owner != current); +} +#endif + +static struct task_struct *lockdep_selftest_task_struct; + static int graph_lock(void) { - arch_spin_lock(&lockdep_lock); + lockdep_lock(); + lockevent_inc(lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -92,27 +180,15 @@ static int graph_lock(void) * dropped already) */ if (!debug_locks) { - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return 0; } - /* prevent any recursions within lockdep from causing deadlocks */ - current->lockdep_recursion++; return 1; } -static inline int graph_unlock(void) +static inline void graph_unlock(void) { - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { - /* - * The lockdep graph lock isn't locked while we expect it to - * be, we're confused now, bye! - */ - return DEBUG_LOCKS_WARN_ON(1); - } - - current->lockdep_recursion--; - arch_spin_unlock(&lockdep_lock); - return 0; + lockdep_unlock(); } /* @@ -123,33 +199,51 @@ static inline int debug_locks_off_graph_unlock(void) { int ret = debug_locks_off(); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return ret; } unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); /* * All data structures here are protected by the global debug_lock. * - * Mutex key structs only get allocated, once during bootup, and never - * get freed - this significantly simplifies the debugging code. + * nr_lock_classes is the number of elements of lock_classes[] that is + * in use. */ +#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) +#define KEYHASH_SIZE (1UL << KEYHASH_BITS) +static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; -static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +unsigned long nr_zapped_classes; +unsigned long nr_dynamic_keys; +unsigned long max_lock_class_idx; +struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { - if (!hlock->class_idx) { + unsigned int class_idx = hlock->class_idx; + + /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */ + barrier(); + + if (!test_bit(class_idx, lock_classes_in_use)) { /* * Someone passed in garbage, we give up. */ DEBUG_LOCKS_WARN_ON(1); return NULL; } - return lock_classes + hlock->class_idx - 1; + + /* + * At this point, if the passed hlock->class_idx is still garbage, + * we just have to live with it + */ + return lock_classes + class_idx; } #ifdef CONFIG_LOCK_STAT @@ -203,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) dst->nr += src->nr; } -struct lock_class_stats lock_stats(struct lock_class *class) +void lock_stats(struct lock_class *class, struct lock_class_stats *stats) { - struct lock_class_stats stats; int cpu, i; - memset(&stats, 0, sizeof(struct lock_class_stats)); + memset(stats, 0, sizeof(struct lock_class_stats)); for_each_possible_cpu(cpu) { struct lock_class_stats *pcs = &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; - for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) - stats.contention_point[i] += pcs->contention_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++) + stats->contention_point[i] += pcs->contention_point[i]; - for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) - stats.contending_point[i] += pcs->contending_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++) + stats->contending_point[i] += pcs->contending_point[i]; - lock_time_add(&pcs->read_waittime, &stats.read_waittime); - lock_time_add(&pcs->write_waittime, &stats.write_waittime); + lock_time_add(&pcs->read_waittime, &stats->read_waittime); + lock_time_add(&pcs->write_waittime, &stats->write_waittime); - lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); - lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + lock_time_add(&pcs->read_holdtime, &stats->read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats->write_holdtime); - for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) - stats.bounces[i] += pcs->bounces[i]; + for (i = 0; i < ARRAY_SIZE(stats->bounces); i++) + stats->bounces[i] += pcs->bounces[i]; } - - return stats; } void clear_lock_stats(struct lock_class *class) @@ -248,12 +339,7 @@ void clear_lock_stats(struct lock_class *class) static struct lock_class_stats *get_lock_stats(struct lock_class *class) { - return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; -} - -static void put_lock_stats(struct lock_class_stats *stats) -{ - put_cpu_var(cpu_lock_stats); + return &this_cpu_ptr(cpu_lock_stats)[class - lock_classes]; } static void lock_release_holdtime(struct held_lock *hlock) @@ -271,7 +357,6 @@ static void lock_release_holdtime(struct held_lock *hlock) lock_time_inc(&stats->read_holdtime, holdtime); else lock_time_inc(&stats->write_holdtime, holdtime); - put_lock_stats(stats); } #else static inline void lock_release_holdtime(struct held_lock *hlock) @@ -280,11 +365,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock) #endif /* - * We keep a global list of all lock classes. The list only grows, - * never shrinks. The list is only accessed with the lockdep - * spinlock lock held. + * We keep a global list of all lock classes. The list is only accessed with + * the lockdep spinlock lock held. free_lock_classes is a list with free + * elements. These elements are linked together by the lock_entry member in + * struct lock_class. */ -LIST_HEAD(all_lock_classes); +static LIST_HEAD(all_lock_classes); +static LIST_HEAD(free_lock_classes); + +/** + * struct pending_free - information about data structures about to be freed + * @zapped: Head of a list with struct lock_class elements. + * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements + * are about to be freed. + */ +struct pending_free { + struct list_head zapped; + DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS); +}; + +/** + * struct delayed_free - data structures used for delayed freeing + * + * A data structure for delayed freeing of data structures that may be + * accessed by RCU readers at the time these were freed. + * + * @rcu_head: Used to schedule an RCU callback for freeing data structures. + * @index: Index of @pf to which freed data structures are added. + * @scheduled: Whether or not an RCU callback has been scheduled. + * @pf: Array with information about data structures about to be freed. + */ +static struct delayed_free { + struct rcu_head rcu_head; + int index; + int scheduled; + struct pending_free pf[2]; +} delayed_free; /* * The lockdep classes are in a hash-table as well, for fast lookup: @@ -308,6 +424,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE]; static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* + * the id of held_lock + */ +static inline u16 hlock_id(struct held_lock *hlock) +{ + BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16); + + return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS)); +} + +static inline __maybe_unused unsigned int chain_hlock_class_idx(u16 hlock_id) +{ + return hlock_id & (MAX_LOCKDEP_KEYS - 1); +} + +/* * The hash key of the lock dependency chains is a hash itself too: * it's a hash of all locks taken up to that lock, including that lock. * It's a 64-bit hash, because it's important for the keys to be @@ -322,17 +453,28 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) return k0 | (u64)k1 << 32; } -void lockdep_off(void) +void lockdep_init_task(struct task_struct *task) { - current->lockdep_recursion++; + task->lockdep_depth = 0; /* no locks held yet */ + task->curr_chain_key = INITIAL_CHAIN_KEY; + task->lockdep_recursion = 0; } -EXPORT_SYMBOL(lockdep_off); -void lockdep_on(void) +static __always_inline void lockdep_recursion_inc(void) { - current->lockdep_recursion--; + __this_cpu_inc(lockdep_recursion); +} + +static __always_inline void lockdep_recursion_finish(void) +{ + if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion))) + __this_cpu_write(lockdep_recursion, 0); +} + +void lockdep_set_selftest_task(struct task_struct *task) +{ + lockdep_selftest_task_struct = task; } -EXPORT_SYMBOL(lockdep_on); /* * Debugging switches: @@ -344,14 +486,12 @@ EXPORT_SYMBOL(lockdep_on); #if VERBOSE # define HARDIRQ_VERBOSE 1 # define SOFTIRQ_VERBOSE 1 -# define RECLAIM_VERBOSE 1 #else # define HARDIRQ_VERBOSE 0 # define SOFTIRQ_VERBOSE 0 -# define RECLAIM_VERBOSE 0 #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE /* * Quick filtering for interesting events: */ @@ -379,13 +519,6 @@ static int verbose(struct lock_class *class) return 0; } -/* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. - */ -unsigned long nr_stack_trace_entries; -static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; - static void print_lockdep_off(const char *bug_msg) { printk(KERN_DEBUG "%s\n", bug_msg); @@ -395,44 +528,108 @@ static void print_lockdep_off(const char *bug_msg) #endif } -static int save_trace(struct stack_trace *trace) -{ - trace->nr_entries = 0; - trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->entries = stack_trace + nr_stack_trace_entries; +unsigned long nr_stack_trace_entries; - trace->skip = 3; +#ifdef CONFIG_PROVE_LOCKING +/** + * struct lock_trace - single stack backtrace + * @hash_entry: Entry in a stack_trace_hash[] list. + * @hash: jhash() of @entries. + * @nr_entries: Number of entries in @entries. + * @entries: Actual stack backtrace. + */ +struct lock_trace { + struct hlist_node hash_entry; + u32 hash; + u32 nr_entries; + unsigned long entries[] __aligned(sizeof(unsigned long)); +}; +#define LOCK_TRACE_SIZE_IN_LONGS \ + (sizeof(struct lock_trace) / sizeof(unsigned long)) +/* + * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock. + */ +static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE]; - save_stack_trace(trace); +static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2) +{ + return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries && + memcmp(t1->entries, t2->entries, + t1->nr_entries * sizeof(t1->entries[0])) == 0; +} - /* - * Some daft arches put -1 at the end to indicate its a full trace. - * - * <rant> this is buggy anyway, since it takes a whole extra entry so a - * complete trace that maxes out the entries provided will be reported - * as incomplete, friggin useless </rant> - */ - if (trace->nr_entries != 0 && - trace->entries[trace->nr_entries-1] == ULONG_MAX) - trace->nr_entries--; +static struct lock_trace *save_trace(void) +{ + struct lock_trace *trace, *t2; + struct hlist_head *hash_head; + u32 hash; + int max_entries; - trace->max_entries = trace->nr_entries; + BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); + BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); - nr_stack_trace_entries += trace->nr_entries; + trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - + LOCK_TRACE_SIZE_IN_LONGS; - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (max_entries <= 0) { if (!debug_locks_off_graph_unlock()) - return 0; + return NULL; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); - return 0; + return NULL; } + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); + + hash = jhash(trace->entries, trace->nr_entries * + sizeof(trace->entries[0]), 0); + trace->hash = hash; + hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1)); + hlist_for_each_entry(t2, hash_head, hash_entry) { + if (traces_identical(trace, t2)) + return t2; + } + nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries; + hlist_add_head(&trace->hash_entry, hash_head); - return 1; + return trace; +} + +/* Return the number of stack traces in the stack_trace[] array. */ +u64 lockdep_stack_trace_count(void) +{ + struct lock_trace *trace; + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) { + hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) { + c++; + } + } + + return c; } +/* Return the number of stack hash chains that have at least one stack trace. */ +u64 lockdep_stack_hash_count(void) +{ + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) + if (!hlist_empty(&stack_trace_hash[i])) + c++; + + return c; +} +#endif + unsigned int nr_hardirq_chains; unsigned int nr_softirq_chains; unsigned int nr_process_chains; @@ -445,6 +642,7 @@ unsigned int max_lockdep_depth; DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif +#ifdef CONFIG_PROVE_LOCKING /* * Locking printouts: */ @@ -461,9 +659,13 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USED_READ] = "INITIAL READ USE", + /* abused as string storage for verify_lock_unused() */ + [LOCK_USAGE_STATES] = "IN-NMI", }; +#endif -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +const char *__get_key_name(const struct lockdep_subclass_key *key, char *str) { return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } @@ -475,15 +677,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit) static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) { + /* + * The usage character defaults to '.' (i.e., irqs disabled and not in + * irq context), which is the safest usage category. + */ char c = '.'; - if (class->usage_mask & lock_flag(bit + 2)) + /* + * The order of the following usage checks matters, which will + * result in the outcome character as follows: + * + * - '+': irq is enabled and not in irq context + * - '-': in irq context and irq is disabled + * - '?': in irq context and irq is enabled + */ + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) { c = '+'; - if (class->usage_mask & lock_flag(bit)) { - c = '-'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit)) c = '?'; - } + } else if (class->usage_mask & lock_flag(bit)) + c = '-'; return c; } @@ -501,7 +714,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static void __print_lock_name(struct lock_class *class) +static void __print_lock_name(struct held_lock *hlock, struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; @@ -516,18 +729,22 @@ static void __print_lock_name(struct lock_class *class) printk(KERN_CONT "#%d", class->name_version); if (class->subclass) printk(KERN_CONT "/%d", class->subclass); + if (hlock && class->print_fn) + class->print_fn(hlock->instance); } } -static void print_lock_name(struct lock_class *class) +static void print_lock_name(struct held_lock *hlock, struct lock_class *class) { char usage[LOCK_USAGE_CHARS]; get_usage_chars(class, usage); printk(KERN_CONT " ("); - __print_lock_name(class); - printk(KERN_CONT "){%s}", usage); + __print_lock_name(hlock, class); + printk(KERN_CONT "){%s}-{%d:%d}", usage, + class->wait_type_outer ?: class->wait_type_inner, + class->wait_type_inner); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -547,36 +764,43 @@ static void print_lock(struct held_lock *hlock) /* * We can be called locklessly through debug_show_all_locks() so be * extra careful, the hlock might have been released and cleared. + * + * If this indeed happens, lets pretend it does not hurt to continue + * to print the lock unless the hlock class_idx does not point to a + * registered class. The rationale here is: since we don't attempt + * to distinguish whether we are in this situation, if it just + * happened we can't count on class_idx to tell either. */ - unsigned int class_idx = hlock->class_idx; + struct lock_class *lock = hlock_class(hlock); - /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ - barrier(); - - if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { + if (!lock) { printk(KERN_CONT "<RELEASED>\n"); return; } - print_lock_name(lock_classes + class_idx - 1); - printk(KERN_CONT ", at: [<%p>] %pS\n", - (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); + printk(KERN_CONT "%px", hlock->instance); + print_lock_name(hlock, lock); + printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } -static void lockdep_print_held_locks(struct task_struct *curr) +static void lockdep_print_held_locks(struct task_struct *p) { - int i, depth = curr->lockdep_depth; + int i, depth = READ_ONCE(p->lockdep_depth); - if (!depth) { - printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); + if (!depth) + printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); + else + printk("%d lock%s held by %s/%d:\n", depth, + str_plural(depth), p->comm, task_pid_nr(p)); + /* + * It's not reliable to print a task's held locks if it's not sleeping + * and it's not the current task. + */ + if (p != current && task_is_running(p)) return; - } - printk("%d lock%s held by %s/%d:\n", - depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); - for (i = 0; i < depth; i++) { printk(" #%d: ", i); - print_lock(curr->held_locks + i); + print_lock(p->held_locks + i); } } @@ -600,19 +824,26 @@ static int very_verbose(struct lock_class *class) * Is this the address of a static object: */ #ifdef __KERNEL__ -static int static_obj(void *obj) +static int static_obj(const void *obj) { - unsigned long start = (unsigned long) &_stext, - end = (unsigned long) &_end, - addr = (unsigned long) obj; + unsigned long addr = (unsigned long) obj; + + if (is_kernel_core_data(addr)) + return 1; /* - * static variable? + * keys are allowed in the __ro_after_init section. */ - if ((addr >= start) && (addr < end)) + if (is_kernel_rodata(addr)) return 1; - if (arch_is_kernel_data(addr)) + /* + * in initdata section and used during bootup only? + * NOTE: On some platforms the initdata section is + * outside of the _stext ... _end range. + */ + if (system_state < SYSTEM_FREEING_INITMEM && + init_section_contains((void *)addr, 1)) return 1; /* @@ -630,7 +861,8 @@ static int static_obj(void *obj) /* * To make lock name printouts unique, we calculate a unique - * class->name_version generation counter: + * class->name_version generation counter. The caller must hold the graph + * lock. */ static int count_matching_names(struct lock_class *new_class) { @@ -640,7 +872,7 @@ static int count_matching_names(struct lock_class *new_class) if (!new_class->name) return 0; - list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { + list_for_each_entry(class, &all_lock_classes, lock_entry) { if (new_class->key - new_class->subclass == class->key) return class->name_version; if (class->name && !strcmp(class->name, new_class->name)) @@ -650,48 +882,34 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) +/* used from NMI context -- must be lockless */ +static noinstr struct lock_class * +look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; - bool is_static = false; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + instrumentation_begin(); debug_locks_off(); + nbcon_cpu_emergency_enter(); printk(KERN_ERR "BUG: looking up invalid subclass: %u\n", subclass); printk(KERN_ERR "turning off the locking correctness validator.\n"); dump_stack(); + nbcon_cpu_emergency_exit(); + instrumentation_end(); return NULL; } /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself. If the lock is in the per cpu area, - * the canonical address of the lock (per cpu offset removed) is - * used. + * If it is not initialised then it has never been locked, + * so it won't be present in the hash table. */ - if (unlikely(!lock->key)) { - unsigned long can_addr, addr = (unsigned long)lock; - - if (__is_kernel_percpu_address(addr, &can_addr)) - lock->key = (void *)can_addr; - else if (__is_module_percpu_address(addr, &can_addr)) - lock->key = (void *)can_addr; - else if (static_obj(lock)) - lock->key = (void *)lock; - else - return ERR_PTR(-EINVAL); - is_static = true; - } + if (unlikely(!lock->key)) + return NULL; /* * NOTE: the class-key must be unique. For dynamic locks, a static @@ -712,18 +930,350 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - hlist_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name); + WARN_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__, + "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n", + lock->name, lock->key, class->name); return class; } } - return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); + return NULL; +} + +/* + * Static locks do not have their class-keys yet - for them the key is + * the lock object itself. If the lock is in the per cpu area, the + * canonical address of the lock (per cpu offset removed) is used. + */ +static bool assign_lock_key(struct lockdep_map *lock) +{ + unsigned long can_addr, addr = (unsigned long)lock; + +#ifdef __KERNEL__ + /* + * lockdep_free_key_range() assumes that struct lock_class_key + * objects do not overlap. Since we use the address of lock + * objects as class key for static objects, check whether the + * size of lock_class_key objects does not exceed the size of + * the smallest lock object. + */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t)); +#endif + + if (__is_kernel_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (__is_module_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (static_obj(lock)) + lock->key = (void *)lock; + else { + /* Debug-check: all keys must be persistent! */ + debug_locks_off(); + nbcon_cpu_emergency_enter(); + pr_err("INFO: trying to register non-static key.\n"); + pr_err("The code is fine but needs lockdep annotation, or maybe\n"); + pr_err("you didn't initialize this object before use?\n"); + pr_err("turning off the locking correctness validator.\n"); + dump_stack(); + nbcon_cpu_emergency_exit(); + return false; + } + + return true; +} + +#ifdef CONFIG_DEBUG_LOCKDEP + +/* Check whether element @e occurs in list @h */ +static bool in_list(struct list_head *e, struct list_head *h) +{ + struct list_head *f; + + list_for_each(f, h) { + if (e == f) + return true; + } + + return false; +} + +/* + * Check whether entry @e occurs in any of the locks_after or locks_before + * lists. + */ +static bool in_any_class_list(struct list_head *e) +{ + struct lock_class *class; + int i; + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (in_list(e, &class->locks_after) || + in_list(e, &class->locks_before)) + return true; + } + return false; +} + +static bool class_lock_list_valid(struct lock_class *c, struct list_head *h) +{ + struct lock_list *e; + + list_for_each_entry(e, h, entry) { + if (e->links_to != c) { + printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s", + c->name ? : "(?)", + (unsigned long)(e - list_entries), + e->links_to && e->links_to->name ? + e->links_to->name : "(?)", + e->class && e->class->name ? e->class->name : + "(?)"); + return false; + } + } + return true; +} + +#ifdef CONFIG_PROVE_LOCKING +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +#endif + +static bool check_lock_chain_key(struct lock_chain *chain) +{ +#ifdef CONFIG_PROVE_LOCKING + u64 chain_key = INITIAL_CHAIN_KEY; + int i; + + for (i = chain->base; i < chain->base + chain->depth; i++) + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); + /* + * The 'unsigned long long' casts avoid that a compiler warning + * is reported when building tools/lib/lockdep. + */ + if (chain->chain_key != chain_key) { + printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n", + (unsigned long long)(chain - lock_chains), + (unsigned long long)chain->chain_key, + (unsigned long long)chain_key); + return false; + } +#endif + return true; +} + +static bool in_any_zapped_class_list(struct lock_class *class) +{ + struct pending_free *pf; + int i; + + for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) { + if (in_list(&class->lock_entry, &pf->zapped)) + return true; + } + + return false; +} + +static bool __check_data_structures(void) +{ + struct lock_class *class; + struct lock_chain *chain; + struct hlist_head *head; + struct lock_list *e; + int i; + + /* Check whether all classes occur in a lock list. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!in_list(&class->lock_entry, &all_lock_classes) && + !in_list(&class->lock_entry, &free_lock_classes) && + !in_any_zapped_class_list(class)) { + printk(KERN_INFO "class %px/%s is not in any class list\n", + class, class->name ? : "(?)"); + return false; + } + } + + /* Check whether all classes have valid lock lists. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!class_lock_list_valid(class, &class->locks_before)) + return false; + if (!class_lock_list_valid(class, &class->locks_after)) + return false; + } + + /* Check the chain_key of all lock chains. */ + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + if (!check_lock_chain_key(chain)) + return false; + } + } + + /* + * Check whether all list entries that are in use occur in a class + * lock list. + */ + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (!in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class->name ? : "(?)", + e->links_to->name ? : "(?)"); + return false; + } + } + + /* + * Check whether all list entries that are not in use do not occur in + * a class lock list. + */ + for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class && e->class->name ? e->class->name : + "(?)", + e->links_to && e->links_to->name ? + e->links_to->name : "(?)"); + return false; + } + } + + return true; +} + +int check_consistency = 0; +module_param(check_consistency, int, 0644); + +static void check_data_structures(void) +{ + static bool once = false; + + if (check_consistency && !once) { + if (!__check_data_structures()) { + once = true; + WARN_ON(once); + } + } +} + +#else /* CONFIG_DEBUG_LOCKDEP */ + +static inline void check_data_structures(void) { } + +#endif /* CONFIG_DEBUG_LOCKDEP */ + +static void init_chain_block_buckets(void); + +/* + * Initialize the lock_classes[] array elements, the free_lock_classes list + * and also the delayed_free structure. + */ +static void init_data_structures_once(void) +{ + static bool __read_mostly ds_initialized, rcu_head_initialized; + int i; + + if (likely(rcu_head_initialized)) + return; + + if (system_state >= SYSTEM_SCHEDULING) { + init_rcu_head(&delayed_free.rcu_head); + rcu_head_initialized = true; + } + + if (ds_initialized) + return; + + ds_initialized = true; + + INIT_LIST_HEAD(&delayed_free.pf[0].zapped); + INIT_LIST_HEAD(&delayed_free.pf[1].zapped); + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes); + INIT_LIST_HEAD(&lock_classes[i].locks_after); + INIT_LIST_HEAD(&lock_classes[i].locks_before); + } + init_chain_block_buckets(); +} + +static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) +{ + unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS); + + return lock_keys_hash + hash; +} + +/* Register a dynamically allocated key. */ +void lockdep_register_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + unsigned long flags; + + if (WARN_ON_ONCE(static_obj(key))) + return; + hash_head = keyhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) + goto restore_irqs; + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (WARN_ON_ONCE(k == key)) + goto out_unlock; + } + hlist_add_head_rcu(&key->hash_entry, hash_head); + nr_dynamic_keys++; +out_unlock: + graph_unlock(); +restore_irqs: + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_register_key); + +/* Check whether a key has been registered as a dynamic key. */ +static bool is_dynamic_key(const struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + bool found = false; + + if (WARN_ON_ONCE(static_obj(key))) + return false; + + /* + * If lock debugging is disabled lock_keys_hash[] may contain + * pointers to memory that has already been freed. Avoid triggering + * a use-after-free in that case by returning early. + */ + if (!debug_locks) + return true; + + hash_head = keyhashentry(key); + + rcu_read_lock(); + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + found = true; + break; + } + } + rcu_read_unlock(); + + return found; } /* @@ -737,22 +1287,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; + int idx; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); - if (likely(!IS_ERR_OR_NULL(class))) + if (likely(class)) goto out_set_class_cache; - /* - * Debug-check: all keys must be persistent! - */ - if (IS_ERR(class)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); + if (!lock->key) { + if (!assign_lock_key(lock)) + return NULL; + } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) { return NULL; } @@ -771,46 +1317,58 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) goto out_unlock_set; } - /* - * Allocate a new key from the static array, and add it to - * the hash: - */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + init_data_structures_once(); + + /* Allocate a new lock class and add it to the hash. */ + class = list_first_entry_or_null(&free_lock_classes, typeof(*class), + lock_entry); + if (!class) { if (!debug_locks_off_graph_unlock()) { return NULL; } + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } - class = lock_classes + nr_lock_classes++; + nr_lock_classes++; + __set_bit(class - lock_classes, lock_classes_in_use); debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; class->subclass = subclass; - INIT_LIST_HEAD(&class->lock_entry); - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + WARN_ON_ONCE(!list_empty(&class->locks_after)); class->name_version = count_matching_names(class); + class->wait_type_inner = lock->wait_type_inner; + class->wait_type_outer = lock->wait_type_outer; + class->lock_type = lock->lock_type; /* * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ hlist_add_head_rcu(&class->hash_entry, hash_head); /* - * Add it to the global list of classes: + * Remove the class from the free list and add it to the global list + * of classes. */ - list_add_tail_rcu(&class->lock_entry, &all_lock_classes); + list_move_tail(&class->lock_entry, &all_lock_classes); + idx = class - lock_classes; + if (idx > max_lock_class_idx) + max_lock_class_idx = idx; if (verbose(class)) { graph_unlock(); - printk("\nnew class %p: %s", class->key, class->name); + nbcon_cpu_emergency_enter(); + printk("\nnew class %px: %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit(); if (!graph_lock()) { return NULL; @@ -842,23 +1400,31 @@ out_set_class_cache: */ static struct lock_list *alloc_list_entry(void) { - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + int idx = find_first_zero_bit(list_entries_in_use, + ARRAY_SIZE(list_entries)); + + if (idx >= ARRAY_SIZE(list_entries)) { if (!debug_locks_off_graph_unlock()) return NULL; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } - return list_entries + nr_list_entries++; + nr_list_entries++; + __set_bit(idx, list_entries_in_use); + return list_entries + idx; } /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *this, struct list_head *head, - unsigned long ip, int distance, - struct stack_trace *trace) +static int add_lock_to_list(struct lock_class *this, + struct lock_class *links_to, struct list_head *head, + u16 distance, u8 dep, + const struct lock_trace *trace) { struct lock_list *entry; /* @@ -870,8 +1436,10 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head, return 0; entry->class = this; + entry->links_to = links_to; + entry->dep = dep; entry->distance = distance; - entry->trace = *trace; + entry->trace = trace; /* * Both allocation and removal are done under the graph lock; but * iteration is under RCU-sched; see look_up_lock_class() and @@ -885,17 +1453,21 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head, /* * For good efficiency of modular, we use power of 2 */ -#define MAX_CIRCULAR_QUEUE_SIZE 4096UL +#define MAX_CIRCULAR_QUEUE_SIZE (1UL << CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS) #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) /* - * The circular_queue and helpers is used to implement the - * breadth-first search(BFS)algorithem, by which we can build - * the shortest path from the next lock to be acquired to the - * previous held lock if there is a circular between them. + * The circular_queue and helpers are used to implement graph + * breadth-first search (BFS) algorithm, by which we can determine + * whether there is a path from a lock to another. In deadlock checks, + * a path from the next lock to be acquired to a previous held lock + * indicates that adding the <prev> -> <next> lock dependency will + * produce a circle in the graph. Breadth-first search instead of + * depth-first search is used in order to find the shortest (circular) + * path. */ struct circular_queue { - unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE]; unsigned int front, rear; }; @@ -921,7 +1493,7 @@ static inline int __cq_full(struct circular_queue *cq) return ((cq->rear + 1) & CQ_MASK) == cq->front; } -static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem) { if (__cq_full(cq)) return -1; @@ -931,14 +1503,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) return 0; } -static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +/* + * Dequeue an element from the circular_queue, return a lock_list if + * the queue is not empty, or NULL if otherwise. + */ +static inline struct lock_list * __cq_dequeue(struct circular_queue *cq) { + struct lock_list * lock; + if (__cq_empty(cq)) - return -1; + return NULL; - *elem = cq->element[cq->front]; + lock = cq->element[cq->front]; cq->front = (cq->front + 1) & CQ_MASK; - return 0; + + return lock; } static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) @@ -946,23 +1525,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) return (cq->rear - cq->front) & CQ_MASK; } -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) +static inline void mark_lock_accessed(struct lock_list *lock) { - unsigned long nr; + lock->class->dep_gen_id = lockdep_dependency_gen_id; +} - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ +static inline void visit_lock_entry(struct lock_list *lock, + struct lock_list *parent) +{ lock->parent = parent; - lock->class->dep_gen_id = lockdep_dependency_gen_id; } static inline unsigned long lock_accessed(struct lock_list *lock) { - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -983,114 +1558,329 @@ static inline int get_lock_depth(struct lock_list *child) return depth; } -static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int forward) +/* + * Return the forward or backward dependency list. + * + * @lock: the lock_list to get its class's dependency list + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + */ +static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) +{ + void *lock_class = lock->class; + + return lock_class + offset; +} +/* + * Return values of a bfs search: + * + * BFS_E* indicates an error + * BFS_R* indicates a result (match or not) + * + * BFS_EINVALIDNODE: Find a invalid node in the graph. + * + * BFS_EQUEUEFULL: The queue is full while doing the bfs. + * + * BFS_RMATCH: Find the matched node in the graph, and put that node into + * *@target_entry. + * + * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry + * _unchanged_. + */ +enum bfs_result { + BFS_EINVALIDNODE = -2, + BFS_EQUEUEFULL = -1, + BFS_RMATCH = 0, + BFS_RNOMATCH = 1, +}; + +/* + * bfs_result < 0 means error + */ +static inline bool bfs_error(enum bfs_result res) +{ + return res < 0; +} + +/* + * DEP_*_BIT in lock_list::dep + * + * For dependency @prev -> @next: + * + * SR: @prev is shared reader (->read != 0) and @next is recursive reader + * (->read == 2) + * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader + * SN: @prev is shared reader and @next is non-recursive locker (->read != 2) + * EN: @prev is exclusive locker and @next is non-recursive locker + * + * Note that we define the value of DEP_*_BITs so that: + * bit0 is prev->read == 0 + * bit1 is next->read != 2 + */ +#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */ +#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */ +#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */ +#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */ + +#define DEP_SR_MASK (1U << (DEP_SR_BIT)) +#define DEP_ER_MASK (1U << (DEP_ER_BIT)) +#define DEP_SN_MASK (1U << (DEP_SN_BIT)) +#define DEP_EN_MASK (1U << (DEP_EN_BIT)) + +static inline unsigned int +__calc_dep_bit(struct held_lock *prev, struct held_lock *next) +{ + return (prev->read == 0) + ((next->read != 2) << 1); +} + +static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bit(prev, next); +} + +/* + * calculate the dep_bit for backwards edges. We care about whether @prev is + * shared and whether @next is recursive. + */ +static inline unsigned int +__calc_dep_bitb(struct held_lock *prev, struct held_lock *next) +{ + return (next->read != 2) + ((prev->read == 0) << 1); +} + +static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bitb(prev, next); +} + +/* + * Initialize a lock_list entry @lock belonging to @class as the root for a BFS + * search. + */ +static inline void __bfs_init_root(struct lock_list *lock, + struct lock_class *class) { + lock->class = class; + lock->parent = NULL; + lock->only_xr = 0; +} + +/* + * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the + * root for a BFS search. + * + * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure + * that <prev> -> @hlock and @hlock -> <whatever __bfs() found> is not -(*R)-> + * and -(S*)->. + */ +static inline void bfs_init_root(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read == 2); +} + +/* + * Similar to bfs_init_root() but initialize the root for backwards BFS. + * + * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure + * that <next> -> @hlock and @hlock -> <whatever backwards BFS found> is not + * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->). + */ +static inline void bfs_init_rootb(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read != 0); +} + +static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset) +{ + if (!lock || !lock->parent) + return NULL; + + return list_next_or_null_rcu(get_dep_list(lock->parent, offset), + &lock->entry, struct lock_list, entry); +} + +/* + * Breadth-First Search to find a strong path in the dependency graph. + * + * @source_entry: the source of the path we are searching for. + * @data: data used for the second parameter of @match function + * @match: match function for the search + * @target_entry: pointer to the target of a matched path + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + * + * We may have multiple edges (considering different kinds of dependencies, + * e.g. ER and SN) between two nodes in the dependency graph. But + * only the strong dependency path in the graph is relevant to deadlocks. A + * strong dependency path is a dependency path that doesn't have two adjacent + * dependencies as -(*R)-> -(S*)->, please see: + * + * Documentation/locking/lockdep-design.rst + * + * for more explanation of the definition of strong dependency paths + * + * In __bfs(), we only traverse in the strong dependency path: + * + * In lock_list::only_xr, we record whether the previous dependency only + * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we + * filter out any -(S*)-> in the current dependency and after that, the + * ->only_xr is set according to whether we only have -(*R)-> left. + */ +static enum bfs_result __bfs(struct lock_list *source_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int offset) +{ + struct circular_queue *cq = &lock_cq; + struct lock_list *lock = NULL; struct lock_list *entry; struct list_head *head; - struct circular_queue *cq = &lock_cq; - int ret = 1; + unsigned int cq_depth; + bool first; - if (match(source_entry, data)) { - *target_entry = source_entry; - ret = 0; - goto exit; - } + lockdep_assert_locked(); - if (forward) - head = &source_entry->class->locks_after; - else - head = &source_entry->class->locks_before; + __cq_init(cq); + __cq_enqueue(cq, source_entry); - if (list_empty(head)) - goto exit; + while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) { + if (!lock->class) + return BFS_EINVALIDNODE; - __cq_init(cq); - __cq_enqueue(cq, (unsigned long)source_entry); + /* + * Step 1: check whether we already finish on this one. + * + * If we have visited all the dependencies from this @lock to + * others (iow, if we have visited all lock_list entries in + * @lock->class->locks_{after,before}) we skip, otherwise go + * and visit all the dependencies in the list and mark this + * list accessed. + */ + if (lock_accessed(lock)) + continue; + else + mark_lock_accessed(lock); + + /* + * Step 2: check whether prev dependency and this form a strong + * dependency path. + */ + if (lock->parent) { /* Parent exists, check prev dependency */ + u8 dep = lock->dep; + bool prev_only_xr = lock->parent->only_xr; - while (!__cq_empty(cq)) { - struct lock_list *lock; + /* + * Mask out all -(S*)-> if we only have *R in previous + * step, because -(*R)-> -(S*)-> don't make up a strong + * dependency. + */ + if (prev_only_xr) + dep &= ~(DEP_SR_MASK | DEP_SN_MASK); - __cq_dequeue(cq, (unsigned long *)&lock); + /* If nothing left, we skip */ + if (!dep) + continue; - if (!lock->class) { - ret = -2; - goto exit; + /* If there are only -(*R)-> left, set that for the next step */ + lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); } - if (forward) - head = &lock->class->locks_after; - else - head = &lock->class->locks_before; + /* + * Step 3: we haven't visited this and there is a strong + * dependency path to this, so check with @match. + * If @skip is provide and returns true, we skip this + * lock (and any path this lock is in). + */ + if (skip && skip(lock, data)) + continue; - DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + if (match(lock, data)) { + *target_entry = lock; + return BFS_RMATCH; + } + /* + * Step 4: if not match, expand the path by adding the + * forward or backwards dependencies in the search + * + */ + first = true; + head = get_dep_list(lock, offset); list_for_each_entry_rcu(entry, head, entry) { - if (!lock_accessed(entry)) { - unsigned int cq_depth; - mark_lock_accessed(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = 0; - goto exit; - } + visit_lock_entry(entry, lock); - if (__cq_enqueue(cq, (unsigned long)entry)) { - ret = -1; - goto exit; - } - cq_depth = __cq_get_elem_count(cq); - if (max_bfs_queue_depth < cq_depth) - max_bfs_queue_depth = cq_depth; - } + /* + * Note we only enqueue the first of the list into the + * queue, because we can always find a sibling + * dependency from one (see __bfs_next()), as a result + * the space of queue is saved. + */ + if (!first) + continue; + + first = false; + + if (__cq_enqueue(cq, entry)) + return BFS_EQUEUEFULL; + + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; } } -exit: - return ret; + + return BFS_RNOMATCH; } -static inline int __bfs_forwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_forwards(struct lock_list *src_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 1); + return __bfs(src_entry, data, match, skip, target_entry, + offsetof(struct lock_class, locks_after)); } -static inline int __bfs_backwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_backwards(struct lock_list *src_entry, + void *data, + bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 0); + return __bfs(src_entry, data, match, skip, target_entry, + offsetof(struct lock_class, locks_before)); } -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. - */ +static void print_lock_trace(const struct lock_trace *trace, + unsigned int spaces) +{ + stack_trace_print(trace->entries, trace->nr_entries, spaces); +} /* * Print a dependency chain entry (this is only done when a deadlock * has been detected): */ -static noinline int +static noinline void print_circular_bug_entry(struct lock_list *target, int depth) { if (debug_locks_silent) - return 0; + return; printk("\n-> #%u", depth); - print_lock_name(target->class); + print_lock_name(NULL, target->class); printk(KERN_CONT ":\n"); - print_stack_trace(&target->trace, 6); - - return 0; + print_lock_trace(target->trace, 6); } static void @@ -1101,6 +1891,8 @@ print_circular_lock_scenario(struct held_lock *src, struct lock_class *source = hlock_class(src); struct lock_class *target = hlock_class(tgt); struct lock_class *parent = prt->class; + int src_read = src->read; + int tgt_read = tgt->read; /* * A direct locking problem where unsafe_class lock is taken @@ -1117,28 +1909,36 @@ print_circular_lock_scenario(struct held_lock *src, */ if (parent != source) { printk("Chain exists of:\n "); - __print_lock_name(source); + __print_lock_name(src, source); printk(KERN_CONT " --> "); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT " --> "); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT "\n\n"); } printk(" Possible unsafe locking scenario:\n\n"); printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); + if (tgt_read != 0) + printk(" rlock("); + else + printk(" lock("); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); + if (src_read != 0) + printk(" rlock("); + else if (src->sync) + printk(" sync("); + else + printk(" lock("); + __print_lock_name(src, source); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1147,7 +1947,7 @@ print_circular_lock_scenario(struct held_lock *src, * When a circular dependency is detected, print the * header first: */ -static noinline int +static noinline void print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct held_lock *check_src, struct held_lock *check_tgt) @@ -1155,7 +1955,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct task_struct *curr = current; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("======================================================\n"); @@ -1165,22 +1965,44 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); + pr_warn("\nbut task is already holding lock:\n"); + print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); - - return 0; } -static inline int class_equal(struct lock_list *entry, void *data) +/* + * We are about to add B -> A into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong + * dependency cycle, that means: + * + * Either + * + * a) B -> A is -(E*)-> + * + * or + * + * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B) + * + * as then we don't have -(*R)-> -(S*)-> in the cycle. + */ +static inline bool hlock_conflict(struct lock_list *entry, void *data) { - return entry->class == data; + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 0 || /* B -> A is -(E*)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ } -static noinline int print_circular_bug(struct lock_list *this, +static noinline void print_circular_bug(struct lock_list *this, struct lock_list *target, struct held_lock *check_src, struct held_lock *check_tgt) @@ -1191,13 +2013,16 @@ static noinline int print_circular_bug(struct lock_list *this, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; - if (!save_trace(&this->trace)) - return 0; + this->trace = save_trace(); + if (!this->trace) + return; depth = get_lock_depth(target); + nbcon_cpu_emergency_enter(); + print_circular_bug_header(target, depth, check_src, check_tgt); parent = get_lock_parent(target); @@ -1217,34 +2042,35 @@ static noinline int print_circular_bug(struct lock_list *this, printk("\nstack backtrace:\n"); dump_stack(); - return 0; + nbcon_cpu_emergency_exit(); } -static noinline int print_bfs_bug(int ret) +static noinline void print_bfs_bug(int ret) { if (!debug_locks_off_graph_unlock()) - return 0; + return; /* * Breadth-first-search failed, graph got corrupted? */ - WARN(1, "lockdep bfs error:%d\n", ret); + if (ret == BFS_EQUEUEFULL) + pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n"); - return 0; + WARN(1, "lockdep bfs error:%d\n", ret); } -static int noop_count(struct lock_list *entry, void *data) +static bool noop_count(struct lock_list *entry, void *data) { (*(unsigned long *)data)++; - return 0; + return false; } static unsigned long __lockdep_count_forward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; - __bfs_forwards(this, (void *)&count, noop_count, &target_entry); + __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry); return count; } @@ -1253,14 +2079,13 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); - local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + raw_local_irq_save(flags); + lockdep_lock(); ret = __lockdep_count_forward_deps(&this); - arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); + lockdep_unlock(); + raw_local_irq_restore(flags); return ret; } @@ -1268,9 +2093,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) static unsigned long __lockdep_count_backward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; - __bfs_backwards(this, (void *)&count, noop_count, &target_entry); + __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry); return count; } @@ -1280,68 +2105,208 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); - local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + raw_local_irq_save(flags); + lockdep_lock(); ret = __lockdep_count_backward_deps(&this); - arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); + lockdep_unlock(); + raw_local_irq_restore(flags); return ret; } /* - * Prove that the dependency graph starting at <entry> can not - * lead to <target>. Print an error and return 0 if it does. + * Check that the dependency graph starting at <src> can lead to + * <target> or not. */ -static noinline int -check_noncircular(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) +static noinline enum bfs_result +check_path(struct held_lock *target, struct lock_list *src_entry, + bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { - int result; + enum bfs_result ret; + + ret = __bfs_forwards(src_entry, target, match, skip, target_entry); + + if (unlikely(bfs_error(ret))) + print_bfs_bug(ret); + + return ret; +} + +static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *); + +/* + * Prove that the dependency graph starting at <src> can not + * lead to <target>. If it can, there is a circle when adding + * <target> -> <src> dependency. + * + * Print an error and return BFS_RMATCH if it does. + */ +static noinline enum bfs_result +check_noncircular(struct held_lock *src, struct held_lock *target, + struct lock_trace **const trace) +{ + enum bfs_result ret; + struct lock_list *target_entry; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); debug_atomic_inc(nr_cyclic_checks); - result = __bfs_forwards(root, target, class_equal, target_entry); + ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry); - return result; + if (unlikely(ret == BFS_RMATCH)) { + if (!*trace) { + /* + * If save_trace fails here, the printing might + * trigger a WARN but because of the !nr_entries it + * should not do bad things. + */ + *trace = save_trace(); + } + + if (src->class_idx == target->class_idx) + print_deadlock_bug(current, src, target); + else + print_circular_bug(&src_entry, target_entry, src, target); + } + + return ret; } -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_TRACE_IRQFLAGS + /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. + * + * A irq safe->unsafe deadlock happens with the following conditions: + * + * 1) We have a strong dependency path A -> ... -> B + * + * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore + * irq can create a new dependency B -> A (consider the case that a holder + * of B gets interrupted by an irq whose handler will try to acquire A). + * + * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a + * strong circle: + * + * For the usage bits of B: + * a) if A -> B is -(*N)->, then B -> A could be any type, so any + * ENABLED_IRQ usage suffices. + * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only + * ENABLED_IRQ_*_READ usage suffices. + * + * For the usage bits of A: + * c) if A -> B is -(E*)->, then B -> A could be any type, so any + * USED_IN_IRQ usage suffices. + * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only + * USED_IN_IRQ_*_READ usage suffices. */ -static inline int usage_match(struct lock_list *entry, void *bit) +/* + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of A should be accumulated to detect + * safe->unsafe bugs. + * + * Note that usage_accumulate() is used in backwards search, so ->only_xr + * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true). + * + * As above, if only_xr is false, which means A -> B has -(E*)-> dependency + * path, any usage of A should be considered. Otherwise, we should only + * consider _READ usage. + */ +static inline bool usage_accumulate(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); + if (!entry->only_xr) + *(unsigned long *)mask |= entry->class->usage_mask; + else /* Mask out _READ usage bits */ + *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ); + + return false; } +/* + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of B conflicts with the usage bits of A, + * i.e. which usage bit of B may introduce safe->unsafe deadlocks. + * + * As above, if only_xr is false, which means A -> B has -(*N)-> dependency + * path, any usage of B should be considered. Otherwise, we should only + * consider _READ usage. + */ +static inline bool usage_match(struct lock_list *entry, void *mask) +{ + if (!entry->only_xr) + return !!(entry->class->usage_mask & *(unsigned long *)mask); + else /* Mask out _READ usage bits */ + return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask); +} +static inline bool usage_skip(struct lock_list *entry, void *mask) +{ + if (entry->class->lock_type == LD_LOCK_NORMAL) + return false; + + /* + * Skip local_lock() for irq inversion detection. + * + * For !RT, local_lock() is not a real lock, so it won't carry any + * dependency. + * + * For RT, an irq inversion happens when we have lock A and B, and on + * some CPU we can have: + * + * lock(A); + * <interrupted> + * lock(B); + * + * where lock(B) cannot sleep, and we have a dependency B -> ... -> A. + * + * Now we prove local_lock() cannot exist in that dependency. First we + * have the observation for any lock chain L1 -> ... -> Ln, for any + * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise + * wait context check will complain. And since B is not a sleep lock, + * therefore B.inner_wait_type >= 2, and since the inner_wait_type of + * local_lock() is 3, which is greater than 2, therefore there is no + * way the local_lock() exists in the dependency B -> ... -> A. + * + * As a result, we will skip local_lock(), when we search for irq + * inversion bugs. + */ + if (entry->class->lock_type == LD_LOCK_PERCPU && + DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) + return false; + + /* + * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually + * a lock and only used to override the wait_type. + */ + + return true; +} /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. * - * Return 0 if such a node exists in the subgraph, and put that node + * Return BFS_MATCH if such a node exists in the subgraph, and put that node * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, +static enum bfs_result +find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry); return result; } @@ -1349,22 +2314,16 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, /* * Find a node in the backwards-direction dependency sub-graph starting * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, +static enum bfs_result +find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry); return result; } @@ -1374,31 +2333,82 @@ static void print_lock_class_header(struct lock_class *class, int depth) int bit; printk("%*s->", depth, ""); - print_lock_name(class); - printk(KERN_CONT " ops: %lu", class->ops); + print_lock_name(NULL, class); +#ifdef CONFIG_DEBUG_LOCKDEP + printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); +#endif printk(KERN_CONT " {\n"); - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + for (bit = 0; bit < LOCK_TRACE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_stack_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces[bit], len); } } printk("%*s }\n", depth, ""); - printk("%*s ... key at: [<%p>] %pS\n", + printk("%*s ... key at: [<%px>] %pS\n", depth, "", class->key, class->key); } /* - * printk the shortest lock dependencies from @start to @end in reverse order: + * Dependency path printing: + * + * After BFS we get a lock dependency path (linked via ->parent of lock_list), + * printing out each lock in the dependency path will help on understanding how + * the deadlock could happen. Here are some details about dependency path + * printing: + * + * 1) A lock_list can be either forwards or backwards for a lock dependency, + * for a lock dependency A -> B, there are two lock_lists: + * + * a) lock_list in the ->locks_after list of A, whose ->class is B and + * ->links_to is A. In this case, we can say the lock_list is + * "A -> B" (forwards case). + * + * b) lock_list in the ->locks_before list of B, whose ->class is A + * and ->links_to is B. In this case, we can say the lock_list is + * "B <- A" (bacwards case). + * + * The ->trace of both a) and b) point to the call trace where B was + * acquired with A held. + * + * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't + * represent a certain lock dependency, it only provides an initial entry + * for BFS. For example, BFS may introduce a "helper" lock_list whose + * ->class is A, as a result BFS will search all dependencies starting with + * A, e.g. A -> B or A -> C. + * + * The notation of a forwards helper lock_list is like "-> A", which means + * we should search the forwards dependencies starting with "A", e.g A -> B + * or A -> C. + * + * The notation of a bacwards helper lock_list is like "<- B", which means + * we should search the backwards dependencies ending with "B", e.g. + * B <- A or B <- C. + */ + +/* + * printk the shortest lock dependencies from @root to @leaf in reverse order. + * + * We have a lock dependency path as follow: + * + * @root @leaf + * | | + * V V + * ->parent ->parent + * | lock_list | <--------- | lock_list | ... | lock_list | <--------- | lock_list | + * | -> L1 | | L1 -> L2 | ... |Ln-2 -> Ln-1| | Ln-1 -> Ln| + * + * , so it's natural that we start from @leaf and print every ->class and + * ->trace until we reach the @root. */ static void __used print_shortest_lock_dependencies(struct lock_list *leaf, - struct lock_list *root) + struct lock_list *root) { struct lock_list *entry = leaf; int depth; @@ -1409,7 +2419,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); + print_lock_trace(entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1420,8 +2430,61 @@ print_shortest_lock_dependencies(struct lock_list *leaf, entry = get_lock_parent(entry); depth--; } while (entry && (depth >= 0)); +} - return; +/* + * printk the shortest lock dependencies from @leaf to @root. + * + * We have a lock dependency path (from a backwards search) as follow: + * + * @leaf @root + * | | + * V V + * ->parent ->parent + * | lock_list | ---------> | lock_list | ... | lock_list | ---------> | lock_list | + * | L2 <- L1 | | L3 <- L2 | ... | Ln <- Ln-1 | | <- Ln | + * + * , so when we iterate from @leaf to @root, we actually print the lock + * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order. + * + * Another thing to notice here is that ->class of L2 <- L1 is L1, while the + * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call + * trace of L1 in the dependency path, which is alright, because most of the + * time we can figure out where L1 is held from the call trace of L2. + */ +static void __used +print_shortest_lock_dependencies_backwards(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + const struct lock_trace *trace = NULL; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + if (trace) { + printk("%*s ... acquired at:\n", depth, ""); + print_lock_trace(trace, 2); + printk("\n"); + } + + /* + * Record the pointer to the trace for the next lock_list + * entry, see the comments for the function. + */ + trace = entry->trace; + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad path found in chain graph\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); } static void @@ -1452,11 +2515,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry, */ if (middle_class != unsafe_class) { printk("Chain exists of:\n "); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT " --> "); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT " --> "); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT "\n\n"); } @@ -1464,23 +2527,23 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); printk(" lock("); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT ");\n"); printk(" local_irq_disable();\n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_bad_irq_dependency(struct task_struct *curr, struct lock_list *prev_root, struct lock_list *next_root, @@ -1493,7 +2556,9 @@ print_bad_irq_dependency(struct task_struct *curr, const char *irqclass) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; + + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("=====================================================\n"); @@ -1503,33 +2568,33 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, + lockdep_hardirqs_enabled(), curr->softirqs_enabled); print_lock(next); pr_warn("\nand this task is already holding:\n"); print_lock(prev); pr_warn("which would create a new lock dependency:\n"); - print_lock_name(hlock_class(prev)); + print_lock_name(prev, hlock_class(prev)); pr_cont(" ->"); - print_lock_name(hlock_class(next)); + print_lock_name(next, hlock_class(next)); pr_cont("\n"); pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); - print_lock_name(backwards_entry->class); + print_lock_name(NULL, backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_entry->class); + print_lock_name(NULL, forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces[bit2], 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -1538,53 +2603,19 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); - if (!save_trace(&prev_root->trace)) - return 0; - print_shortest_lock_dependencies(backwards_entry, prev_root); + print_shortest_lock_dependencies_backwards(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); - if (!save_trace(&next_root->trace)) - return 0; + next_root->trace = save_trace(); + if (!next_root->trace) + goto out; print_shortest_lock_dependencies(forwards_entry, next_root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; -} - -static int -check_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit_backwards, - enum lock_usage_bit bit_forwards, const char *irqclass) -{ - int ret; - struct lock_list this, that; - struct lock_list *uninitialized_var(target_entry); - struct lock_list *uninitialized_var(target_entry1); - - this.parent = NULL; - - this.class = hlock_class(prev); - ret = find_usage_backwards(&this, bit_backwards, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - that.parent = NULL; - that.class = hlock_class(next); - ret = find_usage_forwards(&that, bit_forwards, &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - bit_backwards, bit_forwards, irqclass); +out: + nbcon_cpu_emergency_exit(); } static const char *state_names[] = { @@ -1603,103 +2634,361 @@ static const char *state_rnames[] = { static inline const char *state_name(enum lock_usage_bit bit) { - return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; + if (bit & LOCK_USAGE_READ_MASK) + return state_rnames[bit >> LOCK_USAGE_DIR_MASK]; + else + return state_names[bit >> LOCK_USAGE_DIR_MASK]; } +/* + * The bit number is encoded like: + * + * bit0: 0 exclusive, 1 read lock + * bit1: 0 used in irq, 1 irq enabled + * bit2-n: state + */ static int exclusive_bit(int new_bit) { - /* - * USED_IN - * USED_IN_READ - * ENABLED - * ENABLED_READ - * - * bit 0 - write/read - * bit 1 - used_in/enabled - * bit 2+ state - */ - - int state = new_bit & ~3; - int dir = new_bit & 2; + int state = new_bit & LOCK_USAGE_STATE_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * keep state, bit flip the direction and strip read. */ - return state | (dir ^ 2); + return state | (dir ^ LOCK_USAGE_DIR_MASK); +} + +/* + * Observe that when given a bitmask where each bitnr is encoded as above, a + * right shift of the mask transforms the individual bitnrs as -1 and + * conversely, a left shift transforms into +1 for the individual bitnrs. + * + * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can + * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0) + * instead by subtracting the bit number by 2, or shifting the mask right by 2. + * + * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2. + * + * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is + * all bits set) and recompose with bitnr1 flipped. + */ +static unsigned long invert_dir_mask(unsigned long mask) +{ + unsigned long excl = 0; + + /* Invert dir */ + excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK; + excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK; + + return excl; +} + +/* + * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ + * usage may cause deadlock too, for example: + * + * P1 P2 + * <irq disabled> + * write_lock(l1); <irq enabled> + * read_lock(l2); + * write_lock(l2); + * <in irq> + * read_lock(l1); + * + * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2 + * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible + * deadlock. + * + * In fact, all of the following cases may cause deadlocks: + * + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ + * + * As a result, to calculate the "exclusive mask", first we invert the + * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with + * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all + * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*). + */ +static unsigned long exclusive_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; + + return excl; +} + +/* + * Retrieve the _possible_ original mask to which @mask is + * exclusive. Ie: this is the opposite of exclusive_mask(). + * Note that 2 possible original bits can match an exclusive + * bit: one has LOCK_USAGE_READ_MASK set, the other has it + * cleared. So both are returned for each exclusive bit. + */ +static unsigned long original_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; + + return excl; +} + +/* + * Find the first pair of bit match between an original + * usage mask and an exclusive usage mask. + */ +static int find_exclusive_match(unsigned long mask, + unsigned long excl_mask, + enum lock_usage_bit *bitp, + enum lock_usage_bit *excl_bitp) +{ + int bit, excl, excl_read; + + for_each_set_bit(bit, &mask, LOCK_USED) { + /* + * exclusive_bit() strips the read bit, however, + * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need + * to search excl | LOCK_USAGE_READ_MASK as well. + */ + excl = exclusive_bit(bit); + excl_read = excl | LOCK_USAGE_READ_MASK; + if (excl_mask & lock_flag(excl)) { + *bitp = bit; + *excl_bitp = excl; + return 0; + } else if (excl_mask & lock_flag(excl_read)) { + *bitp = bit; + *excl_bitp = excl_read; + return 0; + } + } + return -1; } +/* + * Prove that the new dependency does not connect a hardirq-safe(-read) + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit) + struct held_lock *next) { + unsigned long usage_mask = 0, forward_mask, backward_mask; + enum lock_usage_bit forward_bit = 0, backward_bit = 0; + struct lock_list *target_entry1; + struct lock_list *target_entry; + struct lock_list this, that; + enum bfs_result ret; + /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: + * Step 1: gather all hard/soft IRQs usages backward in an + * accumulated usage mask. */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) + bfs_init_rootb(&this, prev); + + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL); + if (bfs_error(ret)) { + print_bfs_bug(ret); return 0; + } - bit++; /* _READ */ + usage_mask &= LOCKF_USED_IN_IRQ_ALL; + if (!usage_mask) + return 1; /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: + * Step 2: find exclusive uses forward that match the previous + * backward accumulated mask. */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) + forward_mask = exclusive_mask(usage_mask); + + bfs_init_root(&that, next); + + ret = find_usage_forwards(&that, forward_mask, &target_entry1); + if (bfs_error(ret)) { + print_bfs_bug(ret); return 0; + } + if (ret == BFS_RNOMATCH) + return 1; + + /* + * Step 3: we found a bad match! Now retrieve a lock from the backward + * list whose usage mask matches the exclusive usage mask from the + * lock found on the forward list. + * + * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering + * the follow case: + * + * When trying to add A -> B to the graph, we find that there is a + * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M, + * that B -> ... -> M. However M is **softirq-safe**, if we use exact + * invert bits of M's usage_mask, we will find another lock N that is + * **softirq-unsafe** and N -> ... -> A, however N -> .. -> M will not + * cause a inversion deadlock. + */ + backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL); + + ret = find_usage_backwards(&this, backward_mask, &target_entry); + if (bfs_error(ret)) { + print_bfs_bug(ret); + return 0; + } + if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH)) + return 1; + + /* + * Step 4: narrow down to a pair of incompatible usage bits + * and report it. + */ + ret = find_exclusive_match(target_entry->class->usage_mask, + target_entry1->class->usage_mask, + &backward_bit, &forward_bit); + if (DEBUG_LOCKS_WARN_ON(ret == -1)) + return 1; + + print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + backward_bit, forward_bit, + state_name(backward_bit)); + + return 0; +} + +#else +static inline int check_irq_usage(struct task_struct *curr, + struct held_lock *prev, struct held_lock *next) +{ return 1; } -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) +static inline bool usage_skip(struct lock_list *entry, void *mask) { -#define LOCKDEP_STATE(__STATE) \ - if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ - return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE + return false; +} - return 1; +#endif /* CONFIG_TRACE_IRQFLAGS */ + +#ifdef CONFIG_LOCKDEP_SMALL +/* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ } -static void inc_chains(void) +/* + * Check that the dependency graph starting at <src> can lead to + * <target> or not. If it can, <src> -> <target> dependency is already + * in the graph. + * + * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if + * any error appears in the bfs search. + */ +static noinline enum bfs_result +check_redundant(struct held_lock *src, struct held_lock *target) { - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } + enum bfs_result ret; + struct lock_list *target_entry; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); + /* + * Special setup for check_redundant(). + * + * To report redundant, we need to find a strong dependency path that + * is equal to or stronger than <src> -> <target>. So if <src> is E, + * we need to let __bfs() only search for a path starting at a -(E*)->, + * we achieve this by setting the initial node's ->only_xr to true in + * that case. And if <prev> is S, we set initial ->only_xr to false + * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. + */ + src_entry.only_xr = src->read == 0; + + debug_atomic_inc(nr_redundant_checks); + + /* + * Note: we skip local_lock() for redundant check, because as the + * comment in usage_skip(), A -> local_lock() -> B and A -> B are not + * the same. + */ + ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry); + + if (ret == BFS_RMATCH) + debug_atomic_inc(nr_redundant); + + return ret; } #else -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) +static inline enum bfs_result +check_redundant(struct held_lock *src, struct held_lock *target) { - return 1; + return BFS_RNOMATCH; } -static inline void inc_chains(void) +#endif + +static void inc_chains(int irq_context) { - nr_process_chains++; + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains++; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains++; + else + nr_process_chains++; } -#endif +static void dec_chains(int irq_context) +{ + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains--; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains--; + else + nr_process_chains--; +} static void -print_deadlock_scenario(struct held_lock *nxt, - struct held_lock *prv) +print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) { struct lock_class *next = hlock_class(nxt); struct lock_class *prev = hlock_class(prv); @@ -1708,21 +2997,25 @@ print_deadlock_scenario(struct held_lock *nxt, printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(prev); + __print_lock_name(prv, prev); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(next); + __print_lock_name(nxt, next); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); printk(" May be due to missing lock nesting notation\n\n"); } -static int +static void print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { + struct lock_class *class = hlock_class(prev); + if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; + + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("============================================\n"); @@ -1735,6 +3028,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); + if (class->cmp_fn) { + pr_warn("and the lock comparison function returns %i:\n", + class->cmp_fn(prev->instance, next->instance)); + } + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); @@ -1742,7 +3040,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nstack backtrace:\n"); dump_stack(); - return 0; + nbcon_cpu_emergency_exit(); } /* @@ -1751,12 +3049,14 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * (Note that this has to be done separately, because the graph cannot * detect such classes of deadlocks.) * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same + * lock class is held but nest_lock is also held, i.e. we rely on the + * nest_lock to avoid the deadlock. */ static int -check_deadlock(struct task_struct *curr, struct held_lock *next, - struct lockdep_map *next_instance, int read) +check_deadlock(struct task_struct *curr, struct held_lock *next) { + struct lock_class *class; struct held_lock *prev; struct held_lock *nest = NULL; int i; @@ -1774,8 +3074,14 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, * Allow read-after-read recursion of the same * lock class (i.e. read_lock(lock)+read_lock(lock)): */ - if ((read == 2) && prev->read) - return 2; + if ((next->read == 2) && prev->read) + continue; + + class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + continue; /* * We're holding the nest_lock, which serializes this lock's @@ -1784,14 +3090,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - return print_deadlock_bug(curr, prev, next); + print_deadlock_bug(curr, prev, next); + return 0; } return 1; } /* * There was a chain-cache miss, and we are about to add a new dependency - * to a previous lock. We recursively validate the following rules: + * to a previous lock. We validate the following rules: * * - would the adding of the <prev> -> <next> dependency create a * circular dependency in the graph? [== circular deadlock] @@ -1813,51 +3120,55 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int *stack_saved) + struct held_lock *next, u16 distance, + struct lock_trace **const trace) { struct lock_list *entry; - int ret; - struct lock_list this; - struct lock_list *uninitialized_var(target_entry); - /* - * Static variable, serialized by the graph_lock(). - * - * We use this static variable to save the stack trace in case - * we call into this function multiple times due to encountering - * trylocks in the held lock stack. - */ - static struct stack_trace trace; + enum bfs_result ret; + + if (!hlock_class(prev)->key || !hlock_class(next)->key) { + /* + * The warning statements below may trigger a use-after-free + * of the class name. It is better to trigger a use-after free + * and to have the class name most of the time instead of not + * having the class name available. + */ + WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(prev), + hlock_class(prev)->name); + WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(next), + hlock_class(next)->name); + return 2; + } + + if (prev->class_idx == next->class_idx) { + struct lock_class *class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + return 2; + } /* * Prove that the new <prev> -> <next> dependency would not * create a circular dependency in the graph. (We do this by - * forward-recursing into the graph starting at <next>, and - * checking whether we can reach <prev>.) + * a breadth-first search into the graph starting at <next>, + * and check whether we can reach <prev>.) * - * We are using global variables to control the recursion, to - * keep the stackframe size of the recursive functions low: + * The search is limited by the size of the circular queue (i.e., + * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes + * in the graph whose neighbours are to be checked. */ - this.class = hlock_class(next); - this.parent = NULL; - ret = check_noncircular(&this, hlock_class(prev), &target_entry); - if (unlikely(!ret)) - return print_circular_bug(&this, target_entry, next, prev); - else if (unlikely(ret < 0)) - return print_bfs_bug(ret); - - if (!check_prev_add_irq(curr, prev, next)) + ret = check_noncircular(next, prev, trace); + if (unlikely(bfs_error(ret) || ret == BFS_RMATCH)) + return 0; + + if (!check_irq_usage(curr, prev, next)) return 0; - /* - * For recursive read-locks we do all the dependency checks, - * but we dont store read-triggered dependencies (only - * write-triggered dependencies). This ensures that only the - * write-side dependencies matter, and that if for example a - * write-lock never takes any other locks, then the reads are - * equivalent to a NOP. - */ - if (next->read == 2 || prev->read == 2) - return 1; /* * Is the <prev> -> <next> dependency already present? * @@ -1870,49 +3181,71 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 2; + entry->dep |= calc_dep(prev, next); + + /* + * Also, update the reverse dependency in @next's + * ->locks_before list. + * + * Here we reuse @entry as the cursor, which is fine + * because we won't go to the next iteration of the + * outer loop: + * + * For normal cases, we return in the inner loop. + * + * If we fail to return, we have inconsistency, i.e. + * <prev>::locks_after contains <next> while + * <next>::locks_before doesn't contain <prev>. In + * that case, we return after the inner and indicate + * something is wrong. + */ + list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) { + if (entry->class == hlock_class(prev)) { + if (distance == 1) + entry->distance = 1; + entry->dep |= calc_depb(prev, next); + return 1; + } + } + + /* <prev> is not found in <next>::locks_before */ + return 0; } } - if (!*stack_saved) { - if (!save_trace(&trace)) + /* + * Is the <prev> -> <next> link redundant? + */ + ret = check_redundant(prev, next); + if (bfs_error(ret)) + return 0; + else if (ret == BFS_RMATCH) + return 2; + + if (!*trace) { + *trace = save_trace(); + if (!*trace) return 0; - *stack_saved = 1; } /* * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(next), - &hlock_class(prev)->locks_after, - next->acquire_ip, distance, &trace); + ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + &hlock_class(prev)->locks_after, distance, + calc_dep(prev, next), *trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(prev), - &hlock_class(next)->locks_before, - next->acquire_ip, distance, &trace); + ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + &hlock_class(next)->locks_before, distance, + calc_depb(prev, next), *trace); if (!ret) return 0; - /* - * Debugging printouts: - */ - if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - /* We drop graph lock, so another thread can overwrite trace. */ - *stack_saved = 0; - graph_unlock(); - printk("\n new dependency: "); - print_lock_name(hlock_class(prev)); - printk(KERN_CONT " => "); - print_lock_name(hlock_class(next)); - printk(KERN_CONT "\n"); - dump_stack(); - return graph_lock(); - } - return 1; + return 2; } /* @@ -1924,8 +3257,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { + struct lock_trace *trace = NULL; int depth = curr->lockdep_depth; - int stack_saved = 0; struct held_lock *hlock; /* @@ -1944,16 +3277,14 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) goto out_bug; for (;;) { - int distance = curr->lockdep_depth - depth + 1; + u16 distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2 && hlock->check) { - if (!check_prev_add(curr, hlock, next, - distance, &stack_saved)) + + if (hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace); + if (!ret) return 0; + /* * Stop after the first non-trylock entry, * as non-trylock entries have added their @@ -1963,6 +3294,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (!hlock->trylock) break; } + depth--; /* * End of lock-stack? @@ -1991,14 +3323,245 @@ out_bug: return 0; } -unsigned long nr_lock_chains; struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; -int nr_chain_hlocks; +static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS); static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +unsigned long nr_zapped_lock_chains; +unsigned int nr_free_chain_hlocks; /* Free chain_hlocks in buckets */ +unsigned int nr_lost_chain_hlocks; /* Lost chain_hlocks */ +unsigned int nr_large_chain_blocks; /* size > MAX_CHAIN_BUCKETS */ + +/* + * The first 2 chain_hlocks entries in the chain block in the bucket + * list contains the following meta data: + * + * entry[0]: + * Bit 15 - always set to 1 (it is not a class index) + * Bits 0-14 - upper 15 bits of the next block index + * entry[1] - lower 16 bits of next block index + * + * A next block index of all 1 bits means it is the end of the list. + * + * On the unsized bucket (bucket-0), the 3rd and 4th entries contain + * the chain block size: + * + * entry[2] - upper 16 bits of the chain block size + * entry[3] - lower 16 bits of the chain block size + */ +#define MAX_CHAIN_BUCKETS 16 +#define CHAIN_BLK_FLAG (1U << 15) +#define CHAIN_BLK_LIST_END 0xFFFFU + +static int chain_block_buckets[MAX_CHAIN_BUCKETS]; + +static inline int size_to_bucket(int size) +{ + if (size > MAX_CHAIN_BUCKETS) + return 0; + + return size - 1; +} + +/* + * Iterate all the chain blocks in a bucket. + */ +#define for_each_chain_block(bucket, prev, curr) \ + for ((prev) = -1, (curr) = chain_block_buckets[bucket]; \ + (curr) >= 0; \ + (prev) = (curr), (curr) = chain_block_next(curr)) + +/* + * next block or -1 + */ +static inline int chain_block_next(int offset) +{ + int next = chain_hlocks[offset]; + + WARN_ON_ONCE(!(next & CHAIN_BLK_FLAG)); + + if (next == CHAIN_BLK_LIST_END) + return -1; + + next &= ~CHAIN_BLK_FLAG; + next <<= 16; + next |= chain_hlocks[offset + 1]; + + return next; +} + +/* + * bucket-0 only + */ +static inline int chain_block_size(int offset) +{ + return (chain_hlocks[offset + 2] << 16) | chain_hlocks[offset + 3]; +} + +static inline void init_chain_block(int offset, int next, int bucket, int size) +{ + chain_hlocks[offset] = (next >> 16) | CHAIN_BLK_FLAG; + chain_hlocks[offset + 1] = (u16)next; + + if (size && !bucket) { + chain_hlocks[offset + 2] = size >> 16; + chain_hlocks[offset + 3] = (u16)size; + } +} + +static inline void add_chain_block(int offset, int size) +{ + int bucket = size_to_bucket(size); + int next = chain_block_buckets[bucket]; + int prev, curr; + + if (unlikely(size < 2)) { + /* + * We can't store single entries on the freelist. Leak them. + * + * One possible way out would be to uniquely mark them, other + * than with CHAIN_BLK_FLAG, such that we can recover them when + * the block before it is re-added. + */ + if (size) + nr_lost_chain_hlocks++; + return; + } + + nr_free_chain_hlocks += size; + if (!bucket) { + nr_large_chain_blocks++; + + /* + * Variable sized, sort large to small. + */ + for_each_chain_block(0, prev, curr) { + if (size >= chain_block_size(curr)) + break; + } + init_chain_block(offset, curr, 0, size); + if (prev < 0) + chain_block_buckets[0] = offset; + else + init_chain_block(prev, offset, 0, 0); + return; + } + /* + * Fixed size, add to head. + */ + init_chain_block(offset, next, bucket, size); + chain_block_buckets[bucket] = offset; +} + +/* + * Only the first block in the list can be deleted. + * + * For the variable size bucket[0], the first block (the largest one) is + * returned, broken up and put back into the pool. So if a chain block of + * length > MAX_CHAIN_BUCKETS is ever used and zapped, it will just be + * queued up after the primordial chain block and never be used until the + * hlock entries in the primordial chain block is almost used up. That + * causes fragmentation and reduce allocation efficiency. That can be + * monitored by looking at the "large chain blocks" number in lockdep_stats. + */ +static inline void del_chain_block(int bucket, int size, int next) +{ + nr_free_chain_hlocks -= size; + chain_block_buckets[bucket] = next; + + if (!bucket) + nr_large_chain_blocks--; +} + +static void init_chain_block_buckets(void) +{ + int i; + + for (i = 0; i < MAX_CHAIN_BUCKETS; i++) + chain_block_buckets[i] = -1; + + add_chain_block(0, ARRAY_SIZE(chain_hlocks)); +} + +/* + * Return offset of a chain block of the right size or -1 if not found. + * + * Fairly simple worst-fit allocator with the addition of a number of size + * specific free lists. + */ +static int alloc_chain_hlocks(int req) +{ + int bucket, curr, size; + + /* + * We rely on the MSB to act as an escape bit to denote freelist + * pointers. Make sure this bit isn't set in 'normal' class_idx usage. + */ + BUILD_BUG_ON((MAX_LOCKDEP_KEYS-1) & CHAIN_BLK_FLAG); + + init_data_structures_once(); + + if (nr_free_chain_hlocks < req) + return -1; + + /* + * We require a minimum of 2 (u16) entries to encode a freelist + * 'pointer'. + */ + req = max(req, 2); + bucket = size_to_bucket(req); + curr = chain_block_buckets[bucket]; + + if (bucket) { + if (curr >= 0) { + del_chain_block(bucket, req, chain_block_next(curr)); + return curr; + } + /* Try bucket 0 */ + curr = chain_block_buckets[0]; + } + + /* + * The variable sized freelist is sorted by size; the first entry is + * the largest. Use it if it fits. + */ + if (curr >= 0) { + size = chain_block_size(curr); + if (likely(size >= req)) { + del_chain_block(0, size, chain_block_next(curr)); + if (size > req) + add_chain_block(curr + req, size - req); + return curr; + } + } + + /* + * Last resort, split a block in a larger sized bucket. + */ + for (size = MAX_CHAIN_BUCKETS; size > req; size--) { + bucket = size_to_bucket(size); + curr = chain_block_buckets[bucket]; + if (curr < 0) + continue; + + del_chain_block(bucket, size, chain_block_next(curr)); + add_chain_block(curr + req, size - req); + return curr; + } + + return -1; +} + +static inline void free_chain_hlocks(int base, int size) +{ + add_chain_block(base, max(size, 2)); +} struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { - return lock_classes + chain_hlocks[chain->base + i]; + u16 chain_hlock = chain_hlocks[chain->base + i]; + unsigned int class_idx = chain_hlock_class_idx(chain_hlock); + + return lock_classes + class_idx; } /* @@ -2024,12 +3587,12 @@ static inline int get_first_held_lock(struct task_struct *curr, /* * Returns the next chain_key iteration */ -static u64 print_chain_key_iteration(int class_idx, u64 chain_key) +static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key) { - u64 new_chain_key = iterate_chain_key(chain_key, class_idx); + u64 new_chain_key = iterate_chain_key(chain_key, hlock_id); - printk(" class_idx:%d -> chain_key:%016Lx", - class_idx, + printk(" hlock_id:%d -> chain_key:%016Lx", + (unsigned int)hlock_id, (unsigned long long)new_chain_key); return new_chain_key; } @@ -2038,34 +3601,35 @@ static void print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) { struct held_lock *hlock; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int depth = curr->lockdep_depth; - int i; + int i = get_first_held_lock(curr, hlock_next); - printk("depth: %u\n", depth + 1); - for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { + printk("depth: %u (irq_context %u)\n", depth - i + 1, + hlock_next->irq_context); + for (; i < depth; i++) { hlock = curr->held_locks + i; - chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); + chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key); print_lock(hlock); } - print_chain_key_iteration(hlock_next->class_idx, chain_key); + print_chain_key_iteration(hlock_id(hlock_next), chain_key); print_lock(hlock_next); } static void print_chain_keys_chain(struct lock_chain *chain) { int i; - u64 chain_key = 0; - int class_id; + u64 chain_key = INITIAL_CHAIN_KEY; + u16 hlock_id; printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { - class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id + 1, chain_key); + hlock_id = chain_hlocks[chain->base + i]; + chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + class_id); + print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id)); printk("\n"); } } @@ -2074,6 +3638,8 @@ static void print_collision(struct task_struct *curr, struct held_lock *hlock_next, struct lock_chain *chain) { + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("============================\n"); pr_warn("WARNING: chain_key collision\n"); @@ -2090,6 +3656,8 @@ static void print_collision(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } #endif @@ -2114,7 +3682,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx - 1; + id = hlock_id(&curr->held_locks[i]); if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -2126,73 +3694,66 @@ static int check_no_collision(struct task_struct *curr, } /* - * Look up a dependency chain. If the key is not present yet then - * add it and return 1 - in this case the new dependency chain is - * validated. If the key is already hashed, return 0. - * (On return with 1 graph_lock is held.) + * Given an index that is >= -1, return the index of the next lock chain. + * Return -2 if there is no next lock chain. */ -static inline int lookup_chain_cache(struct task_struct *curr, - struct held_lock *hlock, - u64 chain_key) +long lockdep_next_lockchain(long i) +{ + i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1); + return i < ARRAY_SIZE(lock_chains) ? i : -2; +} + +unsigned long lock_chain_count(void) +{ + return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains)); +} + +/* Must be called with the graph lock held. */ +static struct lock_chain *alloc_lock_chain(void) +{ + int idx = find_first_zero_bit(lock_chains_in_use, + ARRAY_SIZE(lock_chains)); + + if (unlikely(idx >= ARRAY_SIZE(lock_chains))) + return NULL; + __set_bit(idx, lock_chains_in_use); + return lock_chains + idx; +} + +/* + * Adds a dependency chain into chain hashtable. And must be called with + * graph_lock held. + * + * Return 0 if fail, and graph_lock is released. + * Return 1 if succeed, with graph_lock held. + */ +static inline int add_chain_cache(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) { - struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; int i, j; /* - * We might need to take the graph lock, ensure we've got IRQs + * The caller must hold the graph lock, ensure we've got IRQs * disabled to make this an IRQ-safe lock.. for recursion reasons * lockdep won't complain about its own locking errors. */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (lockdep_assert_locked()) return 0; - /* - * We can walk it lock-free, because entries only get added - * to the hash: - */ - hlist_for_each_entry_rcu(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { -cache_hit: - debug_atomic_inc(chain_lookup_hits); - if (!check_no_collision(curr, hlock, chain)) - return 0; - if (very_verbose(class)) - printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, - class->key, class->name); - return 0; - } - } - if (very_verbose(class)) - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, class->key, class->name); - /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - if (!graph_lock()) - return 0; - /* - * We have to walk the chain again locked - to avoid duplicates: - */ - hlist_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { - graph_unlock(); - goto cache_hit; - } - } - if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + chain = alloc_lock_chain(); + if (!chain) { if (!debug_locks_off_graph_unlock()) return 0; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } - chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; i = get_first_held_lock(curr, hlock); @@ -2202,41 +3763,104 @@ cache_hit: BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks)); BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes)); - if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = nr_chain_hlocks; - for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx - 1; - chain_hlocks[chain->base + j] = lock_id; - } - chain_hlocks[chain->base + j] = class - lock_classes; - } - - if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS) - nr_chain_hlocks += chain->depth; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * Important for check_no_collision(). - */ - if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { + j = alloc_chain_hlocks(chain->depth); + if (j < 0) { if (!debug_locks_off_graph_unlock()) return 0; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } -#endif + chain->base = j; + for (j = 0; j < chain->depth - 1; j++, i++) { + int lock_id = hlock_id(curr->held_locks + i); + + chain_hlocks[chain->base + j] = lock_id; + } + chain_hlocks[chain->base + j] = hlock_id(hlock); hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); - inc_chains(); + inc_chains(chain->irq_context); + + return 1; +} + +/* + * Look up a dependency chain. Must be called with either the graph lock or + * the RCU read lock held. + */ +static inline struct lock_chain *lookup_chain_cache(u64 chain_key) +{ + struct hlist_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + + hlist_for_each_entry_rcu(chain, hash_head, entry) { + if (READ_ONCE(chain->chain_key) == chain_key) { + debug_atomic_inc(chain_lookup_hits); + return chain; + } + } + return NULL; +} + +/* + * If the key is not present yet in dependency chain cache then + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) + */ +static inline int lookup_chain_cache_add(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) +{ + struct lock_class *class = hlock_class(hlock); + struct lock_chain *chain = lookup_chain_cache(chain_key); + + if (chain) { +cache_hit: + if (!check_no_collision(curr, hlock, chain)) + return 0; + + if (very_verbose(class)) { + printk("\nhash chain already cached, key: " + "%016Lx tail class: [%px] %s\n", + (unsigned long long)chain_key, + class->key, class->name); + } + + return 0; + } + + if (very_verbose(class)) { + printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n", + (unsigned long long)chain_key, class->key, class->name); + } + + if (!graph_lock()) + return 0; + + /* + * We have to walk the chain again locked - to avoid duplicates: + */ + chain = lookup_chain_cache(chain_key); + if (chain) { + graph_unlock(); + goto cache_hit; + } + + if (!add_chain_cache(curr, hlock, chain_key)) + return 0; return 1; } -static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, - struct held_lock *hlock, int chain_head, u64 chain_key) +static int validate_chain(struct task_struct *curr, + struct held_lock *hlock, + int chain_head, u64 chain_key) { /* * Trylock needs to maintain the stack of held locks, but it @@ -2245,11 +3869,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * * We look up the chain_key and do the O(N^2) check and update of * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires + * (If lookup_chain_cache_add() return with 1 it acquires * graph_lock for us) */ if (!hlock->trylock && hlock->check && - lookup_chain_cache(curr, hlock, chain_key)) { + lookup_chain_cache_add(curr, hlock, chain_key)) { /* * Check whether last held lock: * @@ -2257,45 +3881,53 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * - is softirq-safe, if this lock is hardirq-unsafe * * And check whether the new lock's dependency graph - * could lead back to the previous lock. + * could lead back to the previous lock: + * + * - within the current held-lock stack + * - across our accumulated lock dependency records * - * any of these scenarios could lead to a deadlock. If - * All validations + * any of these scenarios could lead to a deadlock. */ - int ret = check_deadlock(curr, hlock, lock, hlock->read); + /* + * The simple case: does the current hold the same lock + * already? + */ + int ret = check_deadlock(curr, hlock); if (!ret) return 0; /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: + * of the chain, and if the new lock introduces no more + * lock dependency (because we already hold a lock with the + * same lock class) nor deadlock (because the nest_lock + * serializes nesting locks), see the comments for + * check_deadlock(). */ - if (!chain_head && ret != 2) + if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) return 0; + } + graph_unlock(); - } else - /* after lookup_chain_cache(): */ + } else { + /* after lookup_chain_cache_add(): */ if (unlikely(!debug_locks)) return 0; + } return 1; } #else static inline int validate_chain(struct task_struct *curr, - struct lockdep_map *lock, struct held_lock *hlock, - int chain_head, u64 chain_key) + struct held_lock *hlock, + int chain_head, u64 chain_key) { return 1; } -#endif + +static void init_chain_block_buckets(void) { } +#endif /* CONFIG_PROVE_LOCKING */ /* * We are building curr_chain_key incrementally, so double-check @@ -2306,7 +3938,7 @@ static void check_chain_key(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; unsigned int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; @@ -2322,16 +3954,18 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } + /* - * Whoops ran out of static storage again? + * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is + * it registered lock class index? */ - if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use))) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) - chain_key = 0; - chain_key = iterate_chain_key(chain_key, hlock->class_idx); + chain_key = INITIAL_CHAIN_KEY; + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -2348,8 +3982,11 @@ static void check_chain_key(struct task_struct *curr) #endif } -static void -print_usage_bug_scenario(struct held_lock *lock) +#ifdef CONFIG_PROVE_LOCKING +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + +static void print_usage_bug_scenario(struct held_lock *lock) { struct lock_class *class = hlock_class(lock); @@ -2357,21 +3994,23 @@ print_usage_bug_scenario(struct held_lock *lock) printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + if (!debug_locks_off() || debug_locks_silent) + return; + + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("================================\n"); @@ -2384,14 +4023,14 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, - trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - trace_hardirqs_enabled(curr), - trace_softirqs_enabled(curr)); + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, + lockdep_hardirqs_enabled(), + lockdep_softirqs_enabled(curr)); print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); @@ -2402,7 +4041,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("\nstack backtrace:\n"); dump_stack(); - return 0; + nbcon_cpu_emergency_exit(); } /* @@ -2412,20 +4051,19 @@ static inline int valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { - if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); + if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) { + graph_unlock(); + print_usage_bug(curr, this, bad_bit, new_bit); + return 0; + } return 1; } -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: */ -static int +static void print_irq_inversion_bug(struct task_struct *curr, struct lock_list *root, struct lock_list *other, struct held_lock *this, int forwards, @@ -2436,7 +4074,9 @@ print_irq_inversion_bug(struct task_struct *curr, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; + + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("========================================================\n"); @@ -2450,7 +4090,7 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other->class); + print_lock_name(NULL, other->class); pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); pr_warn("\nother info that might help us debug this:\n"); @@ -2476,14 +4116,15 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); - if (!save_trace(&root->trace)) - return 0; + root->trace = save_trace(); + if (!root->trace) + goto out; print_shortest_lock_dependencies(other, root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; +out: + nbcon_cpu_emergency_exit(); } /* @@ -2492,22 +4133,33 @@ print_irq_inversion_bug(struct task_struct *curr, */ static int check_usage_forwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { - int ret; + enum bfs_result ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); + + bfs_init_root(&root, this); + ret = find_usage_forwards(&root, usage_mask, &target_entry); + if (bfs_error(ret)) { + print_bfs_bug(ret); + return 0; + } + if (ret == BFS_RNOMATCH) + return 1; - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_forwards(&root, bit, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(read_bit)); + } - return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + return 0; } /* @@ -2516,39 +4168,56 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, */ static int check_usage_backwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { - int ret; + enum bfs_result ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); + + bfs_init_rootb(&root, this); + ret = find_usage_backwards(&root, usage_mask, &target_entry); + if (bfs_error(ret)) { + print_bfs_bug(ret); + return 0; + } + if (ret == BFS_RNOMATCH) + return 1; - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_backwards(&root, bit, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(read_bit)); + } - return print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); + return 0; } void print_irqtrace_events(struct task_struct *curr) { - printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): [<%p>] %pS\n", - curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, - (void *)curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): [<%p>] %pS\n", - curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, - (void *)curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): [<%p>] %pS\n", - curr->softirq_enable_event, (void *)curr->softirq_enable_ip, - (void *)curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): [<%p>] %pS\n", - curr->softirq_disable_event, (void *)curr->softirq_disable_ip, - (void *)curr->softirq_disable_ip); + const struct irqtrace_events *trace = &curr->irqtrace; + + nbcon_cpu_emergency_enter(); + + printk("irq event stamp: %u\n", trace->irq_events); + printk("hardirqs last enabled at (%u): [<%px>] %pS\n", + trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, + (void *)trace->hardirq_enable_ip); + printk("hardirqs last disabled at (%u): [<%px>] %pS\n", + trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip, + (void *)trace->hardirq_disable_ip); + printk("softirqs last enabled at (%u): [<%px>] %pS\n", + trace->softirq_enable_event, (void *)trace->softirq_enable_ip, + (void *)trace->softirq_enable_ip); + printk("softirqs last disabled at (%u): [<%px>] %pS\n", + trace->softirq_disable_event, (void *)trace->softirq_disable_ip, + (void *)trace->softirq_disable_ip); + + nbcon_cpu_emergency_exit(); } static int HARDIRQ_verbose(struct lock_class *class) @@ -2567,16 +4236,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -static int RECLAIM_FS_verbose(struct lock_class *class) -{ -#if RECLAIM_VERBOSE - return class_filter(class); -#endif - return 0; -} - -#define STRICT_READ_CHECKS 1 - static int (*state_verbose_f[])(struct lock_class *class) = { #define LOCKDEP_STATE(__STATE) \ __STATE##_verbose, @@ -2587,7 +4246,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = { static inline int state_verbose(enum lock_usage_bit bit, struct lock_class *class) { - return state_verbose_f[bit >> 2](class); + return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class); } typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, @@ -2598,18 +4257,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { int excl_bit = exclusive_bit(new_bit); - int read = new_bit & 1; - int dir = new_bit & 2; - - /* - * mark USED_IN has to look forwards -- to ensure no dependency - * has ENABLED state, which would allow recursion deadlocks. - * - * mark ENABLED has to look backwards -- to ensure no dependee - * has USED_IN state, which, again, would allow recursion deadlocks. - */ - check_usage_f usage = dir ? - check_usage_backwards : check_usage_forwards; + int read = new_bit & LOCK_USAGE_READ_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * Validate that this particular lock does not have conflicting @@ -2619,23 +4268,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 0; /* - * Validate that the lock dependencies don't have conflicting usage - * states. + * Check for read in write conflicts */ - if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~1))) + if (!read && !valid_state(curr, this, new_bit, + excl_bit + LOCK_USAGE_READ_MASK)) return 0; + /* - * Check for read in write conflicts + * Validate that the lock dependencies don't have conflicting usage + * states. */ - if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + 1)) + if (dir) { + /* + * mark ENABLED has to look backwards -- to ensure no dependee + * has USED_IN state, which, again, would allow recursion deadlocks. + */ + if (!check_usage_backwards(curr, this, excl_bit)) return 0; - - if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + 1, - state_name(new_bit + 1))) + } else { + /* + * mark USED_IN has to look forwards -- to ensure no dependency + * has ENABLED state, which would allow recursion deadlocks. + */ + if (!check_usage_forwards(curr, this, excl_bit)) return 0; } @@ -2645,35 +4301,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 1; } -enum mark_type { -#define LOCKDEP_STATE(__STATE) __STATE, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - /* * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, enum mark_type mark) +mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) { - enum lock_usage_bit usage_bit; struct held_lock *hlock; int i; for (i = 0; i < curr->lockdep_depth; i++) { + enum lock_usage_bit hlock_bit = base_bit; hlock = curr->held_locks + i; - usage_bit = 2 + (mark << 2); /* ENABLED */ if (hlock->read) - usage_bit += 1; /* READ */ + hlock_bit += LOCK_USAGE_READ_MASK; - BUG_ON(usage_bit >= LOCK_USAGE_STATES); + BUG_ON(hlock_bit >= LOCK_USAGE_STATES); if (!hlock->check) continue; - if (!mark_lock(curr, hlock, usage_bit)) + if (!mark_lock(curr, hlock, hlock_bit)) return 0; } @@ -2683,18 +4332,15 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) /* * Hardirqs will be enabled: */ -static void __trace_hardirqs_on_caller(unsigned long ip) +static void __trace_hardirqs_on_caller(void) { struct task_struct *curr = current; - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, HARDIRQ)) + if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ)) return; /* * If we have softirqs enabled, then set the usage @@ -2702,22 +4348,32 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, SOFTIRQ)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_on_events); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); } -__visible void trace_hardirqs_on_caller(unsigned long ip) +/** + * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts + * + * Invoked before a possible transition to RCU idle from exit to user or + * guest mode. This ensures that all RCU operations are done before RCU + * stops watching. After the RCU transition lockdep_hardirqs_on() has to be + * invoked to set the final state. + */ +void lockdep_hardirqs_on_prepare(void) { - time_hardirqs_on(CALLER_ADDR0, ip); + if (unlikely(!debug_locks)) + return; - if (unlikely(!debug_locks || current->lockdep_recursion)) + /* + * NMIs do not (and cannot) track lock dependencies, nothing to do. + */ + if (unlikely(in_nmi())) + return; + + if (unlikely(this_cpu_read(lockdep_recursion))) return; - if (unlikely(current->hardirqs_enabled)) { + if (unlikely(lockdep_hardirqs_enabled())) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -2738,38 +4394,105 @@ __visible void trace_hardirqs_on_caller(unsigned long ip) /* * See the fine text that goes along with this variable definition. */ - if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled)) return; /* * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. */ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())) return; - current->lockdep_recursion = 1; - __trace_hardirqs_on_caller(ip); - current->lockdep_recursion = 0; + current->hardirq_chain_key = current->curr_chain_key; + + lockdep_recursion_inc(); + __trace_hardirqs_on_caller(); + lockdep_recursion_finish(); } -EXPORT_SYMBOL(trace_hardirqs_on_caller); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); -void trace_hardirqs_on(void) +void noinstr lockdep_hardirqs_on(unsigned long ip) { - trace_hardirqs_on_caller(CALLER_ADDR0); + struct irqtrace_events *trace = ¤t->irqtrace; + + if (unlikely(!debug_locks)) + return; + + /* + * NMIs can happen in the middle of local_irq_{en,dis}able() where the + * tracking state and hardware state are out of sync. + * + * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from, + * and not rely on hardware state like normal interrupts. + */ + if (unlikely(in_nmi())) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + + /* + * Skip: + * - recursion check, because NMI can hit lockdep; + * - hardware state check, because above; + * - chain_key check, see lockdep_hardirqs_on_prepare(). + */ + goto skip_checks; + } + + if (unlikely(this_cpu_read(lockdep_recursion))) + return; + + if (lockdep_hardirqs_enabled()) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + /* + * Ensure the lock stack remained unchanged between + * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on(). + */ + DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != + current->curr_chain_key); + +skip_checks: + /* we'll do an OFF -> ON transition: */ + __this_cpu_write(hardirqs_enabled, 1); + trace->hardirq_enable_ip = ip; + trace->hardirq_enable_event = ++trace->irq_events; + debug_atomic_inc(hardirqs_on_events); } -EXPORT_SYMBOL(trace_hardirqs_on); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); /* * Hardirqs were disabled: */ -__visible void trace_hardirqs_off_caller(unsigned long ip) +void noinstr lockdep_hardirqs_off(unsigned long ip) { - struct task_struct *curr = current; - - time_hardirqs_off(CALLER_ADDR0, ip); + if (unlikely(!debug_locks)) + return; - if (unlikely(!debug_locks || current->lockdep_recursion)) + /* + * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep; + * they will restore the software state. This ensures the software + * state is consistent inside NMIs as well. + */ + if (in_nmi()) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + } else if (__this_cpu_read(lockdep_recursion)) return; /* @@ -2779,33 +4502,30 @@ __visible void trace_hardirqs_off_caller(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled()) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; + __this_cpu_write(hardirqs_enabled, 0); + trace->hardirq_disable_ip = ip; + trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); - } else + } else { debug_atomic_inc(redundant_hardirqs_off); + } } -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -void trace_hardirqs_off(void) -{ - trace_hardirqs_off_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_off); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); /* * Softirqs will be enabled: */ -void trace_softirqs_on(unsigned long ip) +void lockdep_softirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = ¤t->irqtrace; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -2815,37 +4535,35 @@ void trace_softirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { debug_atomic_inc(redundant_softirqs_on); return; } - current->lockdep_recursion = 1; + lockdep_recursion_inc(); /* * We'll do an OFF -> ON transition: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; + current->softirqs_enabled = 1; + trace->softirq_enable_ip = ip; + trace->softirq_enable_event = ++trace->irq_events; debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the * usage bit for all held locks, if hardirqs are * enabled too: */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, SOFTIRQ); - current->lockdep_recursion = 0; + if (lockdep_hardirqs_enabled()) + mark_held_locks(current, LOCK_ENABLED_SOFTIRQ); + lockdep_recursion_finish(); } /* * Softirqs were disabled: */ -void trace_softirqs_off(unsigned long ip) +void lockdep_softirqs_off(unsigned long ip) { - struct task_struct *curr = current; - - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -2854,13 +4572,15 @@ void trace_softirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; + current->softirqs_enabled = 0; + trace->softirq_disable_ip = ip; + trace->softirq_disable_event = ++trace->irq_events; debug_atomic_inc(softirqs_off_events); /* * Whoops, we wanted softirqs off, so why aren't they? @@ -2870,66 +4590,43 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) +/** + * lockdep_cleanup_dead_cpu - Ensure CPU lockdep state is cleanly stopped + * + * @cpu: index of offlined CPU + * @idle: task pointer for offlined CPU's idle thread + * + * Invoked after the CPU is dead. Ensures that the tracing infrastructure + * is left in a suitable state for the CPU to be subsequently brought + * online again. + */ +void lockdep_cleanup_dead_cpu(unsigned int cpu, struct task_struct *idle) { - struct task_struct *curr = current; - if (unlikely(!debug_locks)) return; - gfp_mask = current_gfp_context(gfp_mask); - - /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) - return; - - /* this guy won't enter reclaim */ - if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) - return; - - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) - return; - - /* - * Oi! Can't be having __GFP_FS allocations with IRQs disabled. - */ - if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) - return; - - /* Disable lockdep if explicitly requested */ - if (gfp_mask & __GFP_NOLOCKDEP) - return; - - mark_held_locks(curr, RECLAIM_FS); + if (unlikely(per_cpu(hardirqs_enabled, cpu))) { + pr_warn("CPU %u left hardirqs enabled!", cpu); + if (idle) + print_irqtrace_events(idle); + /* Clean it up for when the CPU comes online again. */ + per_cpu(hardirqs_enabled, cpu) = 0; + } } -static void check_flags(unsigned long flags); - -void lockdep_trace_alloc(gfp_t gfp_mask) +static int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) { - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lockdep_trace_alloc(gfp_mask, flags); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} + if (!check) + goto lock_used; -static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) -{ /* * If non-trylock use in a hardirq or softirq context, then * mark the lock as used in these contexts: */ if (!hlock->trylock) { if (hlock->read) { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -2938,7 +4635,7 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -2946,7 +4643,13 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) return 0; } } - if (!hlock->hardirqs_off) { + + /* + * For lock_sync(), don't mark the ENABLED usage, since lock_sync() + * creates no critical section and no extra dependency can be introduced + * by interrupts + */ + if (!hlock->hardirqs_off && !hlock->sync) { if (hlock->read) { if (!mark_lock(curr, hlock, LOCK_ENABLED_HARDIRQ_READ)) @@ -2966,28 +4669,18 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } - /* - * We reuse the irq context infrastructure more broadly as a general - * context checking code. This tests GFP_FS recursion (a lock taken - * during reclaim for a GFP_FS allocation is held over a GFP_FS - * allocation). - */ - if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { - if (hlock->read) { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) - return 0; - } - } +lock_used: + /* mark it as used: */ + if (!mark_lock(curr, hlock, LOCK_USED)) + return 0; return 1; } static inline unsigned int task_irq_context(struct task_struct *task) { - return 2 * !!task->hardirq_context + !!task->softirq_context; + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() + + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } static int separate_irq_context(struct task_struct *curr, @@ -3013,46 +4706,23 @@ static int separate_irq_context(struct task_struct *curr, return 0; } -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - -static inline -int mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ - return 1; -} - -static inline int mark_irqflags(struct task_struct *curr, - struct held_lock *hlock) -{ - return 1; -} - -static inline unsigned int task_irq_context(struct task_struct *task) -{ - return 0; -} - -static inline int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - return 0; -} - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -} - -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - /* * Mark a lock with a usage bit, and validate the state transition: */ static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - unsigned int new_mask = 1 << new_bit, ret = 1; + unsigned int new_mask, ret = 1; + + if (new_bit >= LOCK_USAGE_STATES) { + DEBUG_LOCKS_WARN_ON(1); + return 0; + } + + if (new_bit == LOCK_USED && this->read) + new_bit = LOCK_USED_READ; + + new_mask = 1 << new_bit; /* * If already set then do not dirty the cacheline, @@ -3066,63 +4736,210 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Make sure we didn't race: */ - if (unlikely(hlock_class(this)->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } + if (unlikely(hlock_class(this)->usage_mask & new_mask)) + goto unlock; + + if (!hlock_class(this)->usage_mask) + debug_atomic_dec(nr_unused_locks); hlock_class(this)->usage_mask |= new_mask; - if (!save_trace(hlock_class(this)->usage_traces + new_bit)) - return 0; + if (new_bit < LOCK_TRACE_STATES) { + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) + return 0; + } - switch (new_bit) { -#define LOCKDEP_STATE(__STATE) \ - case LOCK_USED_IN_##__STATE: \ - case LOCK_USED_IN_##__STATE##_READ: \ - case LOCK_ENABLED_##__STATE: \ - case LOCK_ENABLED_##__STATE##_READ: -#include "lockdep_states.h" -#undef LOCKDEP_STATE + if (new_bit < LOCK_USED) { ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; - break; - case LOCK_USED: - debug_atomic_dec(nr_unused_locks); - break; - default: - if (!debug_locks_off_graph_unlock()) - return 0; - WARN_ON(1); - return 0; } +unlock: graph_unlock(); /* * We must printk outside of the graph_lock: */ if (ret == 2) { + nbcon_cpu_emergency_enter(); printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); print_lock(this); print_irqtrace_events(curr); dump_stack(); + nbcon_cpu_emergency_exit(); } return ret; } +static inline short task_wait_context(struct task_struct *curr) +{ + /* + * Set appropriate wait type for the context; for IRQs we have to take + * into account force_irqthread as that is implied by PREEMPT_RT. + */ + if (lockdep_hardirq_context()) { + /* + * Check if force_irqthreads will run us threaded. + */ + if (curr->hardirq_threaded || curr->irq_config) + return LD_WAIT_CONFIG; + + return LD_WAIT_SPIN; + } else if (curr->softirq_context) { + /* + * Softirqs are always threaded. + */ + return LD_WAIT_CONFIG; + } + + return LD_WAIT_MAX; +} + +static int +print_lock_invalid_wait_context(struct task_struct *curr, + struct held_lock *hlock) +{ + short curr_inner; + + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + nbcon_cpu_emergency_enter(); + + pr_warn("\n"); + pr_warn("=============================\n"); + pr_warn("[ BUG: Invalid wait context ]\n"); + print_kernel_ident(); + pr_warn("-----------------------------\n"); + + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + print_lock(hlock); + + pr_warn("other info that might help us debug this:\n"); + + curr_inner = task_wait_context(curr); + pr_warn("context-{%d:%d}\n", curr_inner, curr_inner); + + lockdep_print_held_locks(curr); + + pr_warn("stack backtrace:\n"); + dump_stack(); + + nbcon_cpu_emergency_exit(); + + return 0; +} + +/* + * Verify the wait_type context. + * + * This check validates we take locks in the right wait-type order; that is it + * ensures that we do not take mutexes inside spinlocks and do not attempt to + * acquire spinlocks inside raw_spinlocks and the sort. + * + * The entire thing is slightly more complex because of RCU, RCU is a lock that + * can be taken from (pretty much) any context but also has constraints. + * However when taken in a stricter environment the RCU lock does not loosen + * the constraints. + * + * Therefore we must look for the strictest environment in the lock stack and + * compare that to the lock we're trying to acquire. + */ +static int check_wait_context(struct task_struct *curr, struct held_lock *next) +{ + u8 next_inner = hlock_class(next)->wait_type_inner; + u8 next_outer = hlock_class(next)->wait_type_outer; + u8 curr_inner; + int depth; + + if (!next_inner || next->trylock) + return 0; + + if (!next_outer) + next_outer = next_inner; + + /* + * Find start of current irq_context.. + */ + for (depth = curr->lockdep_depth - 1; depth >= 0; depth--) { + struct held_lock *prev = curr->held_locks + depth; + if (prev->irq_context != next->irq_context) + break; + } + depth++; + + curr_inner = task_wait_context(curr); + + for (; depth < curr->lockdep_depth; depth++) { + struct held_lock *prev = curr->held_locks + depth; + struct lock_class *class = hlock_class(prev); + u8 prev_inner = class->wait_type_inner; + + if (prev_inner) { + /* + * We can have a bigger inner than a previous one + * when outer is smaller than inner, as with RCU. + * + * Also due to trylocks. + */ + curr_inner = min(curr_inner, prev_inner); + + /* + * Allow override for annotations -- this is typically + * only valid/needed for code that only exists when + * CONFIG_PREEMPT_RT=n. + */ + if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE)) + curr_inner = prev_inner; + } + } + + if (next_outer > curr_inner) + return print_lock_invalid_wait_context(curr, next); + + return 0; +} + +#else /* CONFIG_PROVE_LOCKING */ + +static inline int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) +{ + return 1; +} + +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 0; +} + +static inline int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + return 0; +} + +static inline int check_wait_context(struct task_struct *curr, + struct held_lock *next) +{ + return 0; +} + +#endif /* CONFIG_PROVE_LOCKING */ + /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) +void lockdep_init_map_type(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass, + u8 inner, u8 outer, u8 lock_type) { int i; - kmemcheck_mark_initialized(lock, sizeof(*lock)); - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) lock->class_cache[i] = NULL; @@ -3140,19 +4957,22 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; + lock->wait_type_outer = outer; + lock->wait_type_inner = inner; + lock->lock_type = lock_type; + /* * No key, no joy, we need to hash something. */ if (DEBUG_LOCKS_WARN_ON(!key)) return; /* - * Sanity check, the lock-class key must be persistent: + * Sanity check, the lock-class key must either have been allocated + * statically or must have been registered as a dynamic key. */ - if (!static_obj(key)) { - printk("BUG: key %p not in .data!\n", key); - /* - * What it says above ^^^^^, I suggest you read it. - */ + if (!static_obj(key) && !is_dynamic_key(key)) { + if (debug_locks) + printk(KERN_ERR "BUG: key %px has not been registered!\n", key); DEBUG_LOCKS_WARN_ON(1); return; } @@ -3164,30 +4984,61 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (subclass) { unsigned long flags; - if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + lockdep_recursion_inc(); register_lock_class(lock, subclass, 1); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } } -EXPORT_SYMBOL_GPL(lockdep_init_map); +EXPORT_SYMBOL_GPL(lockdep_init_map_type); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); -static int +struct lock_class_key __lockdep_no_track__; +EXPORT_SYMBOL_GPL(__lockdep_no_track__); + +#ifdef CONFIG_PROVE_LOCKING +void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn, + lock_print_fn print_fn) +{ + struct lock_class *class = lock->class_cache[0]; + unsigned long flags; + + raw_local_irq_save(flags); + lockdep_recursion_inc(); + + if (!class) + class = register_lock_class(lock, 0, 0); + + if (class) { + WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn); + WARN_ON(class->print_fn && class->print_fn != print_fn); + + class->cmp_fn = cmp_fn; + class->print_fn = print_fn; + } + + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn); +#endif + +static void print_lock_nested_lock_not_held(struct task_struct *curr, - struct held_lock *hlock, - unsigned long ip) + struct held_lock *hlock) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; + + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("==================================\n"); @@ -3210,19 +5061,23 @@ print_lock_nested_lock_not_held(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); - return 0; + nbcon_cpu_emergency_exit(); } -static int __lock_is_held(struct lockdep_map *lock, int read); +static int __lock_is_held(const struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. * We maintain the dependency maps and validate the locking attempt: + * + * The callers must make sure that IRQs are disabled before calling it, + * otherwise we could get an interrupt which would want to take locks, + * which would end up in lockdep again. */ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip, - int references, int pin_count) + int references, int pin_count, int sync) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -3235,16 +5090,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(!debug_locks)) return 0; - /* - * Lockdep should run with IRQs disabled, otherwise we could - * get an interrupt which would want to take locks, which would - * end up in lockdep and have you got a head-ache already? - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (unlikely(lock->key == &__lockdep_no_track__)) return 0; - if (!prove_locking || lock->key == &__lockdep_no_validate__) + lockevent_inc(lockdep_acquire); + + if (!prove_locking || lock->key == &__lockdep_no_validate__) { check = 0; + lockevent_inc(lockdep_nocheck); + } + + if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES)) + return 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -3256,13 +5113,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!class) return 0; } - atomic_inc((atomic_t *)&class->ops); + + debug_class_ops_inc(class); + if (very_verbose(class)) { - printk("\nacquire class [%p] %s", class->key, class->name); + nbcon_cpu_emergency_enter(); + printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit(); } /* @@ -3277,24 +5138,25 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; - class_idx = class - lock_classes + 1; + class_idx = class - lock_classes; - if (depth) { + if (depth && !sync) { + /* we're holding locks and the new held lock is not a sync */ hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) { - /* - * Check: unsigned int references:12, overflow. - */ - if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) - return 0; + if (!references) + references++; + if (!hlock->references) hlock->references++; - } else { - hlock->references = 2; - } - return 1; + hlock->references += references; + + /* Overflow */ + if (DEBUG_LOCKS_WARN_ON(hlock->references < references)) + return 0; + + return 2; } } @@ -3313,6 +5175,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->trylock = trylock; hlock->read = read; hlock->check = check; + hlock->sync = !!sync; hlock->hardirqs_off = !!hardirqs_off; hlock->references = references; #ifdef CONFIG_LOCK_STAT @@ -3321,11 +5184,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif hlock->pin_count = pin_count; - if (check && !mark_irqflags(curr, hlock)) + if (check_wait_context(curr, hlock)) return 0; - /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED)) + /* Initialize the lock usage bit */ + if (!mark_usage(curr, hlock, check)) return 0; /* @@ -3339,9 +5202,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * the hash, not class->key. */ /* - * Whoops, we did it again.. ran straight out of our static allocation. + * Whoops, we did it again.. class_idx is invalid. */ - if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use))) return 0; chain_key = curr->curr_chain_key; @@ -3349,24 +5212,35 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, /* * How can we have a chain hash when we ain't got no keys?! */ - if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) + if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY)) return 0; chain_head = 1; } hlock->prev_chain_key = chain_key; if (separate_irq_context(curr, hlock)) { - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); - if (nest_lock && !__lock_is_held(nest_lock, -1)) - return print_lock_nested_lock_not_held(curr, hlock, ip); + if (nest_lock && !__lock_is_held(nest_lock, -1)) { + print_lock_nested_lock_not_held(curr, hlock); + return 0; + } - if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) + if (!debug_locks_silent) { + WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); + WARN_ON_ONCE(!hlock_class(hlock)->key); + } + + if (!validate_chain(curr, hlock, chain_head, chain_key)) return 0; + /* For lock_sync(), we are done here since no actual critical section */ + if (hlock->sync) + return 1; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3376,6 +5250,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); printk(KERN_DEBUG "depth: %i max: %lu!\n", curr->lockdep_depth, MAX_LOCK_DEPTH); @@ -3383,6 +5258,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, lockdep_print_held_locks(current); debug_show_all_locks(); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } @@ -3393,14 +5269,16 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 1; } -static int -print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_unlock_imbalance_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; + + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("=====================================\n"); @@ -3411,7 +5289,7 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no more locks to release!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -3419,16 +5297,17 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - return 0; + nbcon_cpu_emergency_exit(); } -static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) +static noinstr int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache[0]; + const struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3439,7 +5318,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) * Clearly if the lock hasn't been acquired _ever_, we're not * holding it either, so report failure. */ - if (IS_ERR_OR_NULL(class)) + if (!class) return 0; /* @@ -3450,7 +5329,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) return 0; - if (hlock->class_idx == class - lock_classes + 1) + if (hlock->class_idx == class - lock_classes) return 1; } @@ -3494,19 +5373,33 @@ out: } static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, - int idx) + int idx, unsigned int *merged) { struct held_lock *hlock; + int first_idx = idx; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { - if (!__lock_acquire(hlock->instance, + switch (__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) + hlock->references, hlock->pin_count, 0)) { + case 0: return 1; + case 1: + break; + case 2: + *merged += (idx == first_idx); + break; + default: + WARN_ON(1); + return 0; + } } return 0; } @@ -3517,11 +5410,14 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; struct lock_class *class; - unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3531,24 +5427,29 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } - lockdep_init_map(lock, name, key, 0); + lockdep_init_map_type(lock, name, key, 0, + lock->wait_type_inner, + lock->wait_type_outer, + lock->lock_type); class = register_lock_class(lock, subclass, 0); - hlock->class_idx = class - lock_classes + 1; + hlock->class_idx = class - lock_classes; curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - if (reacquire_held_locks(curr, depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) return 0; /* * I took it apart and put it back together again, except now I have * these 'spare' parts.. where shall I put them. */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged)) return 0; return 1; } @@ -3556,10 +5457,13 @@ __lock_set_class(struct lockdep_map *lock, const char *name, static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; - unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3569,8 +5473,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) return 0; hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -3579,7 +5485,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) hlock->read = 1; hlock->acquire_ip = ip; - if (reacquire_held_locks(curr, depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) + return 0; + + /* Merging can't happen with unchanged classes.. */ + if (DEBUG_LOCKS_WARN_ON(merged)) return 0; /* @@ -3588,22 +5498,21 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) return 0; + return 1; } /* - * Remove the lock to the list of currently held locks - this gets + * Remove the lock from the list of currently held locks - this gets * called on mutex_unlock()/spin_unlock*() (or on a failed * mutex_lock_interruptible()). - * - * @nested is an hysterical artifact, needs a tree wide cleanup. */ static int -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +__lock_release(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 1; struct held_lock *hlock; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -3614,16 +5523,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * So we're all set to release this lock.. wait what lock? We don't * own any locks, you've been drinking again? */ - if (DEBUG_LOCKS_WARN_ON(depth <= 0)) - return print_unlock_imbalance_bug(curr, lock, ip); + if (depth <= 0) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } /* * Check whether the lock exists in the current stack * of held locks: */ hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } if (hlock->instance == lock) lock_release_holdtime(hlock); @@ -3651,20 +5564,33 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - if (reacquire_held_locks(curr, depth, i + 1)) + /* + * The most likely case is when the unlock is on the innermost + * lock. In this case, we are done! + */ + if (i == depth-1) + return 1; + + if (reacquire_held_locks(curr, depth, i + 1, &merged)) return 0; /* * We had N bottles of beer on the wall, we drank one, but now * there's not N-1 bottles of beer left on the wall... + * Pouring two of the bottles together is acceptable. */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) - return 0; + DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged); - return 1; + /* + * Since reacquire_held_locks() would have called check_chain_key() + * indirectly via __lock_acquire(), we don't need to do it again + * on return. + */ + return 0; } -static int __lock_is_held(struct lockdep_map *lock, int read) +static __always_inline +int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3673,14 +5599,14 @@ static int __lock_is_held(struct lockdep_map *lock, int read) struct held_lock *hlock = curr->held_locks + i; if (match_held_lock(hlock, lock)) { - if (read == -1 || hlock->read == read) - return 1; + if (read == -1 || !!hlock->read == read) + return LOCK_STATE_HELD; - return 0; + return LOCK_STATE_NOT_HELD; } } - return 0; + return LOCK_STATE_NOT_HELD; } static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) @@ -3701,7 +5627,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) * be guessable and still allows some pin nesting in * our u32 pin_count. */ - cookie.val = 1 + (prandom_u32() >> 16); + cookie.val = 1 + (sched_clock() & 0xffff); hlock->pin_count += cookie.val; return cookie; } @@ -3761,23 +5687,26 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie /* * Check whether we follow the irq-flags state precisely: */ -static void check_flags(unsigned long flags) +static noinstr void check_flags(unsigned long flags) { -#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ - defined(CONFIG_TRACE_IRQFLAGS) +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) if (!debug_locks) return; + /* Get the warning out.. */ + instrumentation_begin(); + if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-on.\n"); } } +#ifndef CONFIG_PREEMPT_RT /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only @@ -3792,9 +5721,12 @@ static void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); + + instrumentation_end(); #endif } @@ -3804,15 +5736,15 @@ void lock_set_class(struct lockdep_map *lock, const char *name, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + lockdep_recursion_inc(); check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_set_class); @@ -3821,19 +5753,71 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + lockdep_recursion_inc(); check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_downgrade); +/* NMI context !!! */ +static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock, int subclass) +{ +#ifdef CONFIG_PROVE_LOCKING + struct lock_class *class = look_up_lock_class(lock, subclass); + unsigned long mask = LOCKF_USED; + + /* if it doesn't have a class (yet), it certainly hasn't been used yet */ + if (!class) + return; + + /* + * READ locks only conflict with USED, such that if we only ever use + * READ locks, there is no deadlock possible -- RCU. + */ + if (!hlock->read) + mask |= LOCKF_USED_READ; + + if (!(class->usage_mask & mask)) + return; + + hlock->class_idx = class - lock_classes; + + print_usage_bug(current, hlock, LOCK_USED, LOCK_USAGE_STATES); +#endif +} + +static bool lockdep_nmi(void) +{ + if (raw_cpu_read(lockdep_recursion)) + return false; + + if (!in_nmi()) + return false; + + return true; +} + +/* + * read_lock() is recursive if: + * 1. We force lockdep think this way in selftests or + * 2. The implementation is not queued read/write lock or + * 3. The locker is at an in_interrupt() context. + */ +bool read_lock_is_recursive(void) +{ + return force_read_lock_recursive || + !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) || + in_interrupt(); +} +EXPORT_SYMBOL_GPL(read_lock_is_recursive); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -3844,74 +5828,138 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + + if (!debug_locks) return; + /* + * As KASAN instrumentation is disabled and lock_acquire() is usually + * the first lockdep call when a task tries to acquire a lock, add + * kasan_check_byte() here to check for use-after-free and other + * memory errors. + */ + kasan_check_byte(lock); + + if (unlikely(!lockdep_enabled())) { + /* XXX allow trylock from NMI ?!? */ + if (lockdep_nmi() && !trylock) { + struct held_lock hlock; + + hlock.acquire_ip = ip; + hlock.instance = lock; + hlock.nest_lock = nest_lock; + hlock.irq_context = 2; // XXX + hlock.trylock = trylock; + hlock.read = read; + hlock.check = check; + hlock.hardirqs_off = true; + hlock.references = 0; + + verify_lock_unused(lock, &hlock, subclass); + } + return; + } + raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + lockdep_recursion_inc(); __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0, 0); - current->lockdep_recursion = 0; + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0); + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquire); -void lock_release(struct lockdep_map *lock, int nested, - unsigned long ip) +void lock_release(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + trace_lock_release(lock, ip); + + if (unlikely(!lockdep_enabled() || + lock->key == &__lockdep_no_track__)) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; - trace_lock_release(lock, ip); - if (__lock_release(lock, nested, ip)) + + lockdep_recursion_inc(); + if (__lock_release(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(struct lockdep_map *lock, int read) +/* + * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API. + * + * No actual critical section is created by the APIs annotated with this: these + * APIs are used to wait for one or multiple critical sections (on other CPUs + * or threads), and it means that calling these APIs inside these critical + * sections is potential deadlock. + */ +void lock_sync(struct lockdep_map *lock, unsigned subclass, int read, + int check, struct lockdep_map *nest_lock, unsigned long ip) { unsigned long flags; - int ret = 0; - if (unlikely(current->lockdep_recursion)) - return 1; /* avoid false negative lockdep_assert_held() */ + if (unlikely(!lockdep_enabled())) + return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + lockdep_recursion_inc(); + __lock_acquire(lock, subclass, 0, read, check, + irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1); + check_chain_key(current); + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_sync); + +noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) +{ + unsigned long flags; + int ret = LOCK_STATE_NOT_HELD; + + /* + * Avoid false negative lockdep_assert_held() and + * lockdep_assert_not_held(). + */ + if (unlikely(!lockdep_enabled())) + return LOCK_STATE_UNKNOWN; + + raw_local_irq_save(flags); + check_flags(flags); + + lockdep_recursion_inc(); ret = __lock_is_held(lock, read); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(lock_is_held_type); +NOKPROBE_SYMBOL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return cookie; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + lockdep_recursion_inc(); cookie = __lock_pin_lock(lock); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return cookie; @@ -3922,15 +5970,15 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + lockdep_recursion_inc(); __lock_repin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_repin_lock); @@ -3939,40 +5987,30 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + lockdep_recursion_inc(); __lock_unpin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_unpin_lock); -void lockdep_set_current_reclaim_state(gfp_t gfp_mask) -{ - current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); -} -EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); - -void lockdep_clear_current_reclaim_state(void) -{ - current->lockdep_reclaim_gfp = 0; -} -EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state); - #ifdef CONFIG_LOCK_STAT -static int -print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_lock_contention_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; + + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("=================================\n"); @@ -3983,7 +6021,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no locks held!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -3991,7 +6029,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - return 0; + nbcon_cpu_emergency_exit(); } static void @@ -4011,6 +6049,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, ip); @@ -4033,7 +6074,6 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) stats->contending_point[contending_point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; - put_lock_stats(stats); } static void @@ -4054,6 +6094,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, _RET_IP_); @@ -4070,8 +6113,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip); - stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -4081,7 +6122,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) } if (lock->cpu != cpu) stats->bounces[bounce_acquired + !!hlock->read]++; - put_lock_stats(stats); lock->cpu = cpu; lock->ip = ip; @@ -4091,18 +6131,16 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat)) - return; + trace_lock_contended(lock, ip); - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; - trace_lock_contended(lock, ip); + lockdep_recursion_inc(); __lock_contended(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_contended); @@ -4111,17 +6149,16 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat)) - return; + trace_lock_acquired(lock, ip); - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + lockdep_recursion_inc(); __lock_acquired(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquired); @@ -4138,9 +6175,7 @@ void lockdep_reset(void) int i; raw_local_irq_save(flags); - current->curr_chain_key = 0; - current->lockdep_depth = 0; - current->lockdep_recursion = 0; + lockdep_init_task(current); memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); nr_hardirq_chains = 0; nr_softirq_chains = 0; @@ -4151,26 +6186,111 @@ void lockdep_reset(void) raw_local_irq_restore(flags); } -static void zap_class(struct lock_class *class) +/* Remove a class from a lock chain. Must be called with the graph lock held. */ +static void remove_class_from_lock_chain(struct pending_free *pf, + struct lock_chain *chain, + struct lock_class *class) { +#ifdef CONFIG_PROVE_LOCKING int i; + for (i = chain->base; i < chain->base + chain->depth; i++) { + if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes) + continue; + /* + * Each lock class occurs at most once in a lock chain so once + * we found a match we can break out of this loop. + */ + goto free_lock_chain; + } + /* Since the chain has not been modified, return. */ + return; + +free_lock_chain: + free_chain_hlocks(chain->base, chain->depth); + /* Overwrite the chain key for concurrent RCU readers. */ + WRITE_ONCE(chain->chain_key, INITIAL_CHAIN_KEY); + dec_chains(chain->irq_context); + /* - * Remove all dependencies this lock is - * involved in: + * Note: calling hlist_del_rcu() from inside a + * hlist_for_each_entry_rcu() loop is safe. */ - for (i = 0; i < nr_list_entries; i++) { - if (list_entries[i].class == class) - list_del_rcu(&list_entries[i].entry); + hlist_del_rcu(&chain->entry); + __set_bit(chain - lock_chains, pf->lock_chains_being_freed); + nr_zapped_lock_chains++; +#endif +} + +/* Must be called with the graph lock held. */ +static void remove_class_from_lock_chains(struct pending_free *pf, + struct lock_class *class) +{ + struct lock_chain *chain; + struct hlist_head *head; + int i; + + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + remove_class_from_lock_chain(pf, chain, class); + } } +} + +/* + * Remove all references to a lock class. The caller must hold the graph lock. + */ +static void zap_class(struct pending_free *pf, struct lock_class *class) +{ + struct lock_list *entry; + int i; + + WARN_ON_ONCE(!class->key); + /* - * Unhash the class and remove it from the all_lock_classes list: + * Remove all dependencies this lock is + * involved in: */ - hlist_del_rcu(&class->hash_entry); - list_del_rcu(&class->lock_entry); + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + entry = list_entries + i; + if (entry->class != class && entry->links_to != class) + continue; + __clear_bit(i, list_entries_in_use); + nr_list_entries--; + list_del_rcu(&entry->entry); + } + if (list_empty(&class->locks_after) && + list_empty(&class->locks_before)) { + list_move_tail(&class->lock_entry, &pf->zapped); + hlist_del_rcu(&class->hash_entry); + WRITE_ONCE(class->key, NULL); + WRITE_ONCE(class->name, NULL); + /* Class allocated but not used, -1 in nr_unused_locks */ + if (class->usage_mask == 0) + debug_atomic_dec(nr_unused_locks); + nr_lock_classes--; + __clear_bit(class - lock_classes, lock_classes_in_use); + if (class - lock_classes == max_lock_class_idx) + max_lock_class_idx--; + } else { + WARN_ONCE(true, "%s() failed for class %s\n", __func__, + class->name); + } - RCU_INIT_POINTER(class->key, NULL); - RCU_INIT_POINTER(class->name, NULL); + remove_class_from_lock_chains(pf, class); + nr_zapped_classes++; +} + +static void reinit_class(struct lock_class *class) +{ + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + memset_startat(class, 0, key); + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); } static inline int within(const void *addr, void *start, unsigned long size) @@ -4178,66 +6298,206 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +static bool inside_selftest(void) +{ + return current == lockdep_selftest_task_struct; +} + +/* The caller must hold the graph lock. */ +static struct pending_free *get_pending_free(void) +{ + return delayed_free.pf + delayed_free.index; +} + +static void free_zapped_rcu(struct rcu_head *cb); + /* - * Used in module.c to remove lock classes from memory that is going to be - * freed; and possibly re-used by other modules. - * - * We will have had one sync_sched() before getting here, so we're guaranteed - * nobody will look up these exact classes -- they're properly dead but still - * allocated. - */ -void lockdep_free_key_range(void *start, unsigned long size) +* See if we need to queue an RCU callback, must called with +* the lockdep lock held, returns false if either we don't have +* any pending free or the callback is already scheduled. +* Otherwise, a call_rcu() must follow this function call. +*/ +static bool prepare_call_rcu_zapped(struct pending_free *pf) +{ + WARN_ON_ONCE(inside_selftest()); + + if (list_empty(&pf->zapped)) + return false; + + if (delayed_free.scheduled) + return false; + + delayed_free.scheduled = true; + + WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); + delayed_free.index ^= 1; + + return true; +} + +/* The caller must hold the graph lock. May be called from RCU context. */ +static void __free_zapped_classes(struct pending_free *pf) { struct lock_class *class; - struct hlist_head *head; + + check_data_structures(); + + list_for_each_entry(class, &pf->zapped, lock_entry) + reinit_class(class); + + list_splice_init(&pf->zapped, &free_lock_classes); + +#ifdef CONFIG_PROVE_LOCKING + bitmap_andnot(lock_chains_in_use, lock_chains_in_use, + pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains)); + bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains)); +#endif +} + +static void free_zapped_rcu(struct rcu_head *ch) +{ + struct pending_free *pf; unsigned long flags; - int i; - int locked; + bool need_callback; + + if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) + return; raw_local_irq_save(flags); - locked = graph_lock(); + lockdep_lock(); + + /* closed head */ + pf = delayed_free.pf + (delayed_free.index ^ 1); + __free_zapped_classes(pf); + delayed_free.scheduled = false; + need_callback = + prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index); + lockdep_unlock(); + raw_local_irq_restore(flags); /* - * Unhash all classes that were created by this module: - */ + * If there's pending free and its callback has not been scheduled, + * queue an RCU callback. + */ + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + +} + +/* + * Remove all lock classes from the class hash table and from the + * all_lock_classes list whose key or name is in the address range [start, + * start + size). Move these lock classes to the zapped_classes list. Must + * be called with the graph lock held. + */ +static void __lockdep_free_key_range(struct pending_free *pf, void *start, + unsigned long size) +{ + struct lock_class *class; + struct hlist_head *head; + int i; + + /* Unhash all classes that were created by a module. */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; hlist_for_each_entry_rcu(class, head, hash_entry) { - if (within(class->key, start, size)) - zap_class(class); - else if (within(class->name, start, size)) - zap_class(class); + if (!within(class->key, start, size) && + !within(class->name, start, size)) + continue; + zap_class(pf, class); } } +} - if (locked) - graph_unlock(); - raw_local_irq_restore(flags); +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one synchronize_rcu() before getting here, so we're + * guaranteed nobody will look up these exact classes -- they're properly dead + * but still allocated. + */ +static void lockdep_free_key_range_reg(void *start, unsigned long size) +{ + struct pending_free *pf; + unsigned long flags; + bool need_callback; + init_data_structures_once(); + + raw_local_irq_save(flags); + lockdep_lock(); + pf = get_pending_free(); + __lockdep_free_key_range(pf, start, size); + need_callback = prepare_call_rcu_zapped(pf); + lockdep_unlock(); + raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. - * - * sync_sched() is sufficient because the read-side is IRQ disable. */ - synchronize_sched(); + synchronize_rcu(); +} - /* - * XXX at this point we could return the resources to the pool; - * instead we leak them. We would need to change to bitmap allocators - * instead of the linear allocators we have now. - */ +/* + * Free all lockdep keys in the range [start, start+size). Does not sleep. + * Ignores debug_locks. Must only be used by the lockdep selftests. + */ +static void lockdep_free_key_range_imm(void *start, unsigned long size) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + init_data_structures_once(); + + raw_local_irq_save(flags); + lockdep_lock(); + __lockdep_free_key_range(pf, start, size); + __free_zapped_classes(pf); + lockdep_unlock(); + raw_local_irq_restore(flags); } -void lockdep_reset_lock(struct lockdep_map *lock) +void lockdep_free_key_range(void *start, unsigned long size) +{ + init_data_structures_once(); + + if (inside_selftest()) + lockdep_free_key_range_imm(start, size); + else + lockdep_free_key_range_reg(start, size); +} + +/* + * Check whether any element of the @lock->class_cache[] array refers to a + * registered lock class. The caller must hold either the graph lock or the + * RCU read lock. + */ +static bool lock_class_cache_is_registered(struct lockdep_map *lock) { struct lock_class *class; struct hlist_head *head; - unsigned long flags; int i, j; - int locked; - raw_local_irq_save(flags); + for (i = 0; i < CLASSHASH_SIZE; i++) { + head = classhash_table + i; + hlist_for_each_entry_rcu(class, head, hash_entry) { + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + if (lock->class_cache[j] == class) + return true; + } + } + return false; +} + +/* The caller must hold the graph lock. Does not sleep. */ +static void __lockdep_reset_lock(struct pending_free *pf, + struct lockdep_map *lock) +{ + struct lock_class *class; + int j; /* * Remove all classes this lock might have: @@ -4247,66 +6507,165 @@ void lockdep_reset_lock(struct lockdep_map *lock) * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); - if (!IS_ERR_OR_NULL(class)) - zap_class(class); + if (class) + zap_class(pf, class); } /* * Debug check: in the end all mapped classes should * be gone. */ + if (WARN_ON_ONCE(lock_class_cache_is_registered(lock))) + debug_locks_off(); +} + +/* + * Remove all information lockdep has about a lock if debug_locks == 1. Free + * released data structures from RCU context. + */ +static void lockdep_reset_lock_reg(struct lockdep_map *lock) +{ + struct pending_free *pf; + unsigned long flags; + int locked; + bool need_callback = false; + + raw_local_irq_save(flags); locked = graph_lock(); - for (i = 0; i < CLASSHASH_SIZE; i++) { - head = classhash_table + i; - hlist_for_each_entry_rcu(class, head, hash_entry) { - int match = 0; + if (!locked) + goto out_irq; - for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) - match |= class == lock->class_cache[j]; - - if (unlikely(match)) { - if (debug_locks_off_graph_unlock()) { - /* - * We all just reset everything, how did it match? - */ - WARN_ON(1); - } - goto out_restore; - } - } - } - if (locked) - graph_unlock(); + pf = get_pending_free(); + __lockdep_reset_lock(pf, lock); + need_callback = prepare_call_rcu_zapped(pf); + + graph_unlock(); +out_irq: + raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); +} -out_restore: +/* + * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the + * lockdep selftests. + */ +static void lockdep_reset_lock_imm(struct lockdep_map *lock) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + raw_local_irq_save(flags); + lockdep_lock(); + __lockdep_reset_lock(pf, lock); + __free_zapped_classes(pf); + lockdep_unlock(); raw_local_irq_restore(flags); } -void __init lockdep_info(void) +void lockdep_reset_lock(struct lockdep_map *lock) { - printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); + init_data_structures_once(); + + if (inside_selftest()) + lockdep_reset_lock_imm(lock); + else + lockdep_reset_lock_reg(lock); +} - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); +/* + * Unregister a dynamically allocated key. + * + * Unlike lockdep_register_key(), a search is always done to find a matching + * key irrespective of debug_locks to avoid potential invalid access to freed + * memory in lock_class entry. + */ +void lockdep_unregister_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head = keyhashentry(key); + struct lock_class_key *k; + struct pending_free *pf; + unsigned long flags; + bool found = false; + bool need_callback = false; + + might_sleep(); + + if (WARN_ON_ONCE(static_obj(key))) + return; + + raw_local_irq_save(flags); + lockdep_lock(); + + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + hlist_del_rcu(&k->hash_entry); + found = true; + break; + } + } + WARN_ON_ONCE(!found && debug_locks); + if (found) { + pf = get_pending_free(); + __lockdep_free_key_range(pf, key, 1); + need_callback = prepare_call_rcu_zapped(pf); + nr_dynamic_keys--; + } + lockdep_unlock(); + raw_local_irq_restore(flags); - printk(" memory used by lock dependency info: %lu kB\n", - (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + - sizeof(struct list_head) * CLASSHASH_SIZE + - sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + - sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + + /* + * Wait until is_dynamic_key() has finished accessing k->hash_entry. + * + * Some operations like __qdisc_destroy() will call this in a debug + * kernel, and the network traffic is disabled while waiting, hence + * the delay of the wait matters in debugging cases. Currently use a + * synchronize_rcu_expedited() to speed up the wait at the cost of + * system IPIs. TODO: Replace RCU with hazptr for this. + */ + synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(lockdep_unregister_key); + +void __init lockdep_init(void) +{ + pr_info("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); + + pr_info("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); + pr_info("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); + pr_info("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); + pr_info("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); + pr_info("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); + pr_info("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); + pr_info("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + + pr_info(" memory used by lock dependency info: %zu kB\n", + (sizeof(lock_classes) + + sizeof(lock_classes_in_use) + + sizeof(classhash_table) + + sizeof(list_entries) + + sizeof(list_entries_in_use) + + sizeof(chainhash_table) + + sizeof(delayed_free) #ifdef CONFIG_PROVE_LOCKING - + sizeof(struct circular_queue) + + sizeof(lock_cq) + + sizeof(lock_chains) + + sizeof(lock_chains_in_use) + + sizeof(chain_hlocks) #endif ) / 1024 ); - printk(" per task-struct memory footprint: %lu bytes\n", - sizeof(struct held_lock) * MAX_LOCK_DEPTH); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + pr_info(" memory used for stack traces: %zu kB\n", + (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024 + ); +#endif + + pr_info(" per task-struct memory footprint: %zu bytes\n", + sizeof(((struct task_struct *)NULL)->held_locks)); } static void @@ -4318,18 +6677,22 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=========================\n"); pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); pr_warn("-------------------------\n"); - pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static inline int not_in_range(const void* mem_from, unsigned long mem_len, @@ -4354,7 +6717,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) if (unlikely(!debug_locks)) return; - local_irq_save(flags); + raw_local_irq_save(flags); for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; @@ -4365,7 +6728,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); break; } - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); @@ -4376,6 +6739,8 @@ static void print_held_locks_bug(void) if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("====================================\n"); pr_warn("WARNING: %s/%d still has locks held!\n", @@ -4385,6 +6750,8 @@ static void print_held_locks_bug(void) lockdep_print_held_locks(current); pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } void debug_check_no_locks_held(void) @@ -4398,8 +6765,6 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { struct task_struct *g, *p; - int count = 10; - int unlock = 1; if (unlikely(!debug_locks)) { pr_warn("INFO: lockdep is turned off.\n"); @@ -4407,49 +6772,18 @@ void debug_show_all_locks(void) } pr_warn("\nShowing all locks held in the system:\n"); - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - pr_warn("hm, tasklist_lock locked, retrying... "); - if (count) { - count--; - pr_cont(" #%d", 10-count); - mdelay(200); - goto retry; - } - pr_cont(" ignoring it.\n"); - unlock = 0; - } else { - if (count != 10) - pr_cont(" locked it.\n"); - } - - do_each_thread(g, p) { - /* - * It's not reliable to print a task's held locks - * if it's not sleeping (or if it's not the current - * task): - */ - if (p->state == TASK_RUNNING && p != current) + rcu_read_lock(); + for_each_process_thread(g, p) { + if (!p->lockdep_depth) continue; - if (p->lockdep_depth) - lockdep_print_held_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; - } while_each_thread(g, p); + lockdep_print_held_locks(p); + touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); + } + rcu_read_unlock(); pr_warn("\n"); pr_warn("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); } EXPORT_SYMBOL_GPL(debug_show_all_locks); #endif @@ -4475,6 +6809,7 @@ asmlinkage __visible void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("================================================\n"); pr_warn("WARNING: lock held when returning to user space!\n"); @@ -4483,14 +6818,24 @@ asmlinkage __visible void lockdep_sys_exit(void) pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); + nbcon_cpu_emergency_exit(); } + + /* + * The lock history for each syscall should be independent. So wipe the + * slate clean on return to userspace. + */ + lockdep_invariant_state(false); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; + int dl = READ_ONCE(debug_locks); + bool rcu = warn_rcu_enter(); /* Note: the following can be executed concurrently, so be careful. */ + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("=============================\n"); pr_warn("WARNING: suspicious RCU usage\n"); @@ -4498,17 +6843,16 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) pr_warn("-----------------------------\n"); pr_warn("%s:%d %s!\n", file, line, s); pr_warn("\nother info that might help us debug this:\n\n"); - pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() - ? "RCU used illegally from idle CPU!\n" - : "", - rcu_scheduler_active, debug_locks); + : "", + rcu_scheduler_active, dl, + dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n"); /* * If a CPU is in the RCU-free window in idle (ie: in the section - * between rcu_idle_enter() and rcu_idle_exit(), then RCU + * between ct_idle_enter() and ct_idle_exit(), then RCU * considers that CPU to be in an "extended quiescent state", * which means that RCU will be completely ignoring that CPU. * Therefore, rcu_read_lock() and friends have absolutely no @@ -4530,5 +6874,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + nbcon_cpu_emergency_exit(); + warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c08fbd2f5ba9..0e5e6ffe91a3 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * kernel/lockdep_internals.h * @@ -18,9 +19,17 @@ enum lock_usage_bit { #include "lockdep_states.h" #undef LOCKDEP_STATE LOCK_USED, - LOCK_USAGE_STATES + LOCK_USED_READ, + LOCK_USAGE_STATES, }; +/* states after LOCK_USED_READ are not traced and printed */ +static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES); + +#define LOCK_USAGE_READ_MASK 1 +#define LOCK_USAGE_DIR_MASK 2 +#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) + /* * Usage-state bitmasks: */ @@ -35,15 +44,40 @@ enum { #include "lockdep_states.h" #undef LOCKDEP_STATE __LOCKF(USED) + __LOCKF(USED_READ) }; -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) +enum { +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | + LOCKF_ENABLED_IRQ = +#include "lockdep_states.h" + 0, +#undef LOCKDEP_STATE -#define LOCKF_ENABLED_IRQ_READ \ - (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ - (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | + LOCKF_USED_IN_IRQ = +#include "lockdep_states.h" + 0, +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | + LOCKF_ENABLED_IRQ_READ = +#include "lockdep_states.h" + 0, +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | + LOCKF_USED_IN_IRQ_READ = +#include "lockdep_states.h" + 0, +#undef LOCKDEP_STATE +}; + +#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) +#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) + +#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ) +#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ) /* * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, @@ -65,51 +99,73 @@ enum { #define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_STACK_TRACE_ENTRIES 262144UL +#define STACK_TRACE_HASH_SIZE 8192 #else -#define MAX_LOCKDEP_ENTRIES 32768UL +#define MAX_LOCKDEP_ENTRIES (1UL << CONFIG_LOCKDEP_BITS) -#define MAX_LOCKDEP_CHAINS_BITS 16 +#define MAX_LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ -#define MAX_STACK_TRACE_ENTRIES 524288UL +#define MAX_STACK_TRACE_ENTRIES (1UL << CONFIG_LOCKDEP_STACK_TRACE_BITS) +#define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS) #endif +/* + * Bit definitions for lock_chain.irq_context + */ +#define LOCK_CHAIN_SOFTIRQ_CONTEXT (1 << 0) +#define LOCK_CHAIN_HARDIRQ_CONTEXT (1 << 1) + #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) +#define AVG_LOCKDEP_CHAIN_DEPTH 5 +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS * AVG_LOCKDEP_CHAIN_DEPTH) -extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) +#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1) extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); -extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +extern const char *__get_key_name(const struct lockdep_subclass_key *key, + char *str); struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; +extern unsigned long nr_zapped_classes; +extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_list_entries; -extern unsigned long nr_lock_chains; -extern int nr_chain_hlocks; +extern unsigned long nr_dynamic_keys; +long lockdep_next_lockchain(long i); +unsigned long lock_chain_count(void); extern unsigned long nr_stack_trace_entries; extern unsigned int nr_hardirq_chains; extern unsigned int nr_softirq_chains; extern unsigned int nr_process_chains; -extern unsigned int max_lockdep_depth; -extern unsigned int max_recursion_depth; +extern unsigned int nr_free_chain_hlocks; +extern unsigned int nr_lost_chain_hlocks; +extern unsigned int nr_large_chain_blocks; +extern unsigned int max_lockdep_depth; extern unsigned int max_bfs_queue_depth; +extern unsigned long max_lock_class_idx; + +extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +extern unsigned long lock_classes_in_use[]; #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#ifdef CONFIG_TRACE_IRQFLAGS +u64 lockdep_stack_trace_count(void); +u64 lockdep_stack_hash_count(void); +#endif #else static inline unsigned long lockdep_count_forward_deps(struct lock_class *class) @@ -132,23 +188,27 @@ lockdep_count_backward_deps(struct lock_class *class) * and we want to avoid too much cache bouncing. */ struct lockdep_stats { - int chain_lookup_hits; - int chain_lookup_misses; - int hardirqs_on_events; - int hardirqs_off_events; - int redundant_hardirqs_on; - int redundant_hardirqs_off; - int softirqs_on_events; - int softirqs_off_events; - int redundant_softirqs_on; - int redundant_softirqs_off; - int nr_unused_locks; - int nr_cyclic_checks; - int nr_cyclic_check_recursions; - int nr_find_usage_forwards_checks; - int nr_find_usage_forwards_recursions; - int nr_find_usage_backwards_checks; - int nr_find_usage_backwards_recursions; + unsigned long chain_lookup_hits; + unsigned int chain_lookup_misses; + unsigned long hardirqs_on_events; + unsigned long hardirqs_off_events; + unsigned long redundant_hardirqs_on; + unsigned long redundant_hardirqs_off; + unsigned long softirqs_on_events; + unsigned long softirqs_off_events; + unsigned long redundant_softirqs_on; + unsigned long redundant_softirqs_off; + int nr_unused_locks; + unsigned int nr_redundant_checks; + unsigned int nr_redundant; + unsigned int nr_cyclic_checks; + unsigned int nr_find_usage_forwards_checks; + unsigned int nr_find_usage_backwards_checks; + + /* + * Per lock class locking operation stat counts + */ + unsigned long lock_class_ops[MAX_LOCKDEP_KEYS]; }; DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); @@ -176,9 +236,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); } \ __total; \ }) + +static inline void debug_class_ops_inc(struct lock_class *class) +{ + int idx; + + idx = class - lock_classes; + __debug_atomic_inc(lock_class_ops[idx]); +} + +static inline unsigned long debug_class_ops_read(struct lock_class *class) +{ + int idx, cpu; + unsigned long ops = 0; + + idx = class - lock_classes; + for_each_possible_cpu(cpu) + ops += per_cpu(lockdep_stats.lock_class_ops[idx], cpu); + return ops; +} + #else # define __debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr) 0 +# define debug_class_ops_inc(ptr) do { } while (0) #endif diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6d1fcc786081..1916db9aa46b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * kernel/lockdep_proc.c * @@ -23,14 +24,33 @@ #include "lockdep_internals.h" +/* + * Since iteration of lock_classes is done without holding the lockdep lock, + * it is not safe to iterate all_lock_classes list directly as the iteration + * may branch off to free_lock_classes or the zapped list. Iteration is done + * directly on the lock_classes array by checking the lock_classes_in_use + * bitmap and max_lock_class_idx. + */ +#define iterate_lock_classes(idx, class) \ + for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \ + idx++, class++) + static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &all_lock_classes, pos); + struct lock_class *class = v; + + ++class; + *pos = class - lock_classes; + return (*pos > max_lock_class_idx) ? NULL : class; } static void *l_start(struct seq_file *m, loff_t *pos) { - return seq_list_start_head(&all_lock_classes, *pos); + unsigned long idx = *pos; + + if (idx > max_lock_class_idx) + return NULL; + return lock_classes + idx; } static void l_stop(struct seq_file *m, void *v) @@ -56,39 +76,43 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - struct lock_class *class = list_entry(v, struct lock_class, lock_entry); + struct lock_class *class = v; struct lock_list *entry; char usage[LOCK_USAGE_CHARS]; + int idx = class - lock_classes; - if (v == &all_lock_classes) { + if (v == lock_classes) seq_printf(m, "all lock classes:\n"); + + if (!test_bit(idx, lock_classes_in_use)) return 0; - } seq_printf(m, "%p", class->key); #ifdef CONFIG_DEBUG_LOCKDEP - seq_printf(m, " OPS:%8ld", class->ops); -#endif -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); - seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); + seq_printf(m, " OPS:%8ld", debug_class_ops_read(class)); #endif + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); + seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); - get_usage_chars(class, usage); - seq_printf(m, " %s", usage); + get_usage_chars(class, usage); + seq_printf(m, " %s", usage); + } seq_printf(m, ": "); print_name(m, class); seq_puts(m, "\n"); - list_for_each_entry(entry, &class->locks_after, entry) { - if (entry->distance == 1) { - seq_printf(m, " -> [%p] ", entry->class->key); - print_name(m, entry->class); - seq_puts(m, "\n"); + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + list_for_each_entry(entry, &class->locks_after, entry) { + if (entry->distance == 1) { + seq_printf(m, " -> [%p] ", entry->class->key); + print_name(m, entry->class); + seq_puts(m, "\n"); + } } + seq_puts(m, "\n"); } - seq_puts(m, "\n"); return 0; } @@ -100,33 +124,21 @@ static const struct seq_operations lockdep_ops = { .show = l_show, }; -static int lockdep_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_ops); -} - -static const struct file_operations proc_lockdep_operations = { - .open = lockdep_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_PROVE_LOCKING static void *lc_start(struct seq_file *m, loff_t *pos) { + if (*pos < 0) + return NULL; + if (*pos == 0) return SEQ_START_TOKEN; - if (*pos - 1 < nr_lock_chains) - return lock_chains + (*pos - 1); - - return NULL; + return lock_chains + (*pos - 1); } static void *lc_next(struct seq_file *m, void *v, loff_t *pos) { - (*pos)++; + *pos = lockdep_next_lockchain(*pos - 1) + 1; return lc_start(m, pos); } @@ -139,15 +151,22 @@ static int lc_show(struct seq_file *m, void *v) struct lock_chain *chain = v; struct lock_class *class; int i; + static const char * const irq_strs[] = { + [0] = "0", + [LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT] = "softirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT| + LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq|softirq", + }; if (v == SEQ_START_TOKEN) { - if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS) + if (!nr_free_chain_hlocks) seq_printf(m, "(buggered) "); seq_printf(m, "all lock chains:\n"); return 0; } - seq_printf(m, "irq_context: %d\n", chain->irq_context); + seq_printf(m, "irq_context: %s\n", irq_strs[chain->irq_context]); for (i = 0; i < chain->depth; i++) { class = lock_chain_get_class(chain, i); @@ -169,18 +188,6 @@ static const struct seq_operations lockdep_chains_ops = { .stop = lc_stop, .show = lc_show, }; - -static int lockdep_chains_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_chains_ops); -} - -static const struct file_operations proc_lockdep_chains_operations = { - .open = lockdep_chains_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; #endif /* CONFIG_PROVE_LOCKING */ static void lockdep_stats_debug_show(struct seq_file *m) @@ -201,6 +208,10 @@ static void lockdep_stats_debug_show(struct seq_file *m) debug_atomic_read(chain_lookup_hits)); seq_printf(m, " cyclic checks: %11llu\n", debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " redundant checks: %11llu\n", + debug_atomic_read(nr_redundant_checks)); + seq_printf(m, " redundant links: %11llu\n", + debug_atomic_read(nr_redundant)); seq_printf(m, " find-mask forwards checks: %11llu\n", debug_atomic_read(nr_find_usage_forwards_checks)); seq_printf(m, " find-mask backwards checks: %11llu\n", @@ -219,7 +230,6 @@ static void lockdep_stats_debug_show(struct seq_file *m) static int lockdep_stats_show(struct seq_file *m, void *v) { - struct lock_class *class; unsigned long nr_unused = 0, nr_uncategorized = 0, nr_irq_safe = 0, nr_irq_unsafe = 0, nr_softirq_safe = 0, nr_softirq_unsafe = 0, @@ -229,7 +239,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, sum_forward_deps = 0; - list_for_each_entry(class, &all_lock_classes, lock_entry) { +#ifdef CONFIG_PROVE_LOCKING + struct lock_class *class; + unsigned long idx; + + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; if (class->usage_mask == 0) nr_unused++; @@ -260,15 +276,18 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) nr_hardirq_read_unsafe++; -#ifdef CONFIG_PROVE_LOCKING sum_forward_deps += lockdep_count_forward_deps(class); -#endif } + #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif + +#endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); + seq_printf(m, " dynamic-keys: %11lu\n", + nr_dynamic_keys); seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", nr_list_entries, MAX_LOCKDEP_ENTRIES); seq_printf(m, " indirect dependencies: %11lu\n", @@ -287,9 +306,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", - nr_lock_chains, MAX_LOCKDEP_CHAINS); - seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", - nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); + lock_chain_count(), MAX_LOCKDEP_CHAINS); + seq_printf(m, " dependency chain hlocks used: %11lu [max: %lu]\n", + MAX_LOCKDEP_CHAIN_HLOCKS - + (nr_free_chain_hlocks + nr_lost_chain_hlocks), + MAX_LOCKDEP_CHAIN_HLOCKS); + seq_printf(m, " dependency chain hlocks lost: %11u\n", + nr_lost_chain_hlocks); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -302,6 +325,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_process_chains); seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + seq_printf(m, " number of stack traces: %11llu\n", + lockdep_stack_trace_count()); + seq_printf(m, " number of stack hash chains: %11llu\n", + lockdep_stack_hash_count()); +#endif seq_printf(m, " combined max dependencies: %11u\n", (nr_hardirq_chains + 1) * (nr_softirq_chains + 1) * @@ -343,25 +372,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " max bfs queue depth: %11u\n", max_bfs_queue_depth); #endif + seq_printf(m, " max lock class index: %11lu\n", + max_lock_class_idx); lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); + /* + * Zapped classes and lockdep data buffers reuse statistics. + */ + seq_puts(m, "\n"); + seq_printf(m, " zapped classes: %11lu\n", + nr_zapped_classes); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " zapped lock chains: %11lu\n", + nr_zapped_lock_chains); + seq_printf(m, " large chain blocks: %11u\n", + nr_large_chain_blocks); +#endif return 0; } -static int lockdep_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, lockdep_stats_show, NULL); -} - -static const struct file_operations proc_lockdep_stats_operations = { - .open = lockdep_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #ifdef CONFIG_LOCK_STAT struct lock_stat_data { @@ -395,7 +426,7 @@ static void seq_line(struct seq_file *m, char c, int offset, int length) for (i = 0; i < offset; i++) seq_puts(m, " "); for (i = 0; i < length; i++) - seq_printf(m, "%c", c); + seq_putc(m, c); seq_puts(m, "\n"); } @@ -411,7 +442,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) static void seq_time(struct seq_file *m, s64 time) { - char num[15]; + char num[22]; snprint_time(num, sizeof(num), time); seq_printf(m, " %14s", num); @@ -423,12 +454,12 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) seq_time(m, lt->min); seq_time(m, lt->max); seq_time(m, lt->total); - seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); + seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0); } static void seq_stats(struct seq_file *m, struct lock_stat_data *data) { - struct lockdep_subclass_key *ckey; + const struct lockdep_subclass_key *ckey; struct lock_class_stats *stats; struct lock_class *class; const char *cname; @@ -620,12 +651,16 @@ static int lock_stat_open(struct inode *inode, struct file *file) if (!res) { struct lock_stat_data *iter = data->stats; struct seq_file *m = file->private_data; + unsigned long idx; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; iter->class = class; - iter->stats = lock_stats(class); + lock_stats(class, &iter->stats); iter++; } + data->iter_end = iter; sort(data->stats, data->iter_end - data->stats, @@ -643,6 +678,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct lock_class *class; + unsigned long idx; char c; if (count) { @@ -652,8 +688,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, if (c != '0') return count; - list_for_each_entry(class, &all_lock_classes, lock_entry) + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; clear_lock_stats(class); + } } return count; } @@ -666,28 +705,24 @@ static int lock_stat_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static const struct file_operations proc_lock_stat_operations = { - .open = lock_stat_open, - .write = lock_stat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = lock_stat_release, +static const struct proc_ops lock_stat_proc_ops = { + .proc_open = lock_stat_open, + .proc_write = lock_stat_write, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = lock_stat_release, }; #endif /* CONFIG_LOCK_STAT */ static int __init lockdep_proc_init(void) { - proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops); #ifdef CONFIG_PROVE_LOCKING - proc_create("lockdep_chains", S_IRUSR, NULL, - &proc_lockdep_chains_operations); + proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops); #endif - proc_create("lockdep_stats", S_IRUSR, NULL, - &proc_lockdep_stats_operations); - + proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show); #ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, - &proc_lock_stat_operations); + proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &lock_stat_proc_ops); #endif return 0; diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..35ca09f2ed0b 100644 --- a/kernel/locking/lockdep_states.h +++ b/kernel/locking/lockdep_states.h @@ -6,4 +6,3 @@ */ LOCKDEP_STATE(HARDIRQ) LOCKDEP_STATE(SOFTIRQ) -LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f24582d4dad3..6567e5eeacc0 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1,32 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Module-based torture test facility for locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ + +#define pr_fmt(fmt) fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/sched/rt.h> #include <linux/spinlock.h> -#include <linux/rwlock.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/smp.h> @@ -38,48 +27,105 @@ #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/slab.h> -#include <linux/percpu-rwsem.h> #include <linux/torture.h> +#include <linux/reboot.h> +MODULE_DESCRIPTION("torture test facility for locking"); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); - -torture_param(int, nwriters_stress, -1, - "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, - "Number of read-locking stress-test threads"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); + +torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies)."); +torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable)."); +torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); +torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); -torture_param(int, shuffle_interval, 3, - "Number of jiffies between shuffles, 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, rt_boost, 2, + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); +torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); +torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(bool, verbose, true, - "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); +torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); +/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ +#define MAX_NESTED_LOCKS 8 -static char *torture_type = "spin_lock"; +static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); +static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs. +static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs. + +// Parse a cpumask kernel parameter. If there are more users later on, +// this might need to got to a more central location. +static int param_set_cpumask(const char *val, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + int ret; + char *s; + + if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) { + s = "Out of memory"; + ret = -ENOMEM; + goto out_err; + } + ret = cpulist_parse(val, *cm_bind); + if (!ret) + return ret; + s = "Bad CPU range"; +out_err: + pr_warn("%s: %s, all CPUs set\n", kp->name, s); + cpumask_setall(*cm_bind); + return ret; +} + +// Output a cpumask kernel parameter. +static int param_get_cpumask(char *buffer, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + + return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind)); +} + +static bool cpumask_nonempty(cpumask_var_t mask) +{ + return cpumask_available(mask) && !cpumask_empty(mask); +} + +static const struct kernel_param_ops lt_bind_ops = { + .set = param_set_cpumask, + .get = param_get_cpumask, +}; + +module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0444); +module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0444); + +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); + static struct task_struct *stats_task; static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; static bool lock_is_write_held; -static bool lock_is_read_held; +static atomic_t lock_is_read_held; +static unsigned long last_lock_release; struct lock_stress_stats { long n_lock_fail; long n_lock_acquired; }; -int torture_runnable = IS_ENABLED(MODULE); -module_param(torture_runnable, int, 0444); -MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); +struct call_rcu_chain { + struct rcu_head crc_rh; + bool crc_stop; +}; +struct call_rcu_chain *call_rcu_chain_list; /* Forward reference. */ static void lock_torture_cleanup(void); @@ -89,13 +135,16 @@ static void lock_torture_cleanup(void); */ struct lock_torture_ops { void (*init)(void); - int (*writelock)(void); + void (*exit)(void); + int (*nested_lock)(int tid, u32 lockset); + int (*writelock)(int tid); void (*write_delay)(struct torture_random_state *trsp); void (*task_boost)(struct torture_random_state *trsp); - void (*writeunlock)(void); - int (*readlock)(void); + void (*writeunlock)(int tid); + void (*nested_unlock)(int tid, u32 lockset); + int (*readlock)(int tid); void (*read_delay)(struct torture_random_state *trsp); - void (*readunlock)(void); + void (*readunlock)(int tid); unsigned long flags; /* for irq spinlocks */ const char *name; @@ -105,51 +154,82 @@ struct lock_torture_cxt { int nrealwriters_stress; int nrealreaders_stress; bool debug_lock; + bool init_called; atomic_t n_lock_torture_errors; struct lock_torture_ops *cur_ops; struct lock_stress_stats *lwsa; /* writer statistics */ struct lock_stress_stats *lrsa; /* reader statistics */ }; -static struct lock_torture_cxt cxt = { 0, 0, false, +static struct lock_torture_cxt cxt = { 0, 0, false, false, ATOMIC_INIT(0), NULL, NULL}; /* * Definitions for lock torture testing. */ -static int torture_lock_busted_write_lock(void) +static int torture_lock_busted_write_lock(int tid __maybe_unused) { return 0; /* BUGGY, do not use in real life!!! */ } static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); -#ifdef CONFIG_PREEMPT + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_lock_busted_write_unlock(void) +static void torture_lock_busted_write_unlock(int tid __maybe_unused) { /* BUGGY, do not use in real life!!! */ } -static void torture_boost_dummy(struct torture_random_state *trsp) +static void __torture_rt_boost(struct torture_random_state *trsp) { - /* Only rtmutexes care about priority */ + const unsigned int factor = rt_boost_factor; + + if (!rt_task(current)) { + /* + * Boost priority once every rt_boost_factor operations. When + * the task tries to take the lock, the rtmutex it will account + * for the new priority, and do any corresponding pi-dance. + */ + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { + sched_set_fifo(current); + } else /* common case, do nothing */ + return; + } else { + /* + * The task will remain boosted for another 10 * rt_boost_factor + * operations, then restored back to its original prio, and so + * forth. + * + * When @trsp is nil, we want to force-reset the task for + * stopping the kthread. + */ + if (!trsp || !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor * 2))) { + sched_set_normal(current, 0); + } else /* common case, do nothing */ + return; + } +} + +static void torture_rt_boost(struct torture_random_state *trsp) +{ + if (rt_boost != 2) + return; + + __torture_rt_boost(trsp); } static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_lock_busted_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -159,7 +239,8 @@ static struct lock_torture_ops lock_busted_ops = { static DEFINE_SPINLOCK(torture_spinlock); -static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) +static int torture_spin_lock_write_lock(int tid __maybe_unused) +__acquires(torture_spinlock) { spin_lock(&torture_spinlock); return 0; @@ -168,24 +249,24 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) { + j = jiffies; + mdelay(long_hold); + pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); + } + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) +static void torture_spin_lock_write_unlock(int tid __maybe_unused) +__releases(torture_spinlock) { spin_unlock(&torture_spinlock); } @@ -193,7 +274,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_spin_lock_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -201,7 +282,7 @@ static struct lock_torture_ops spin_lock_ops = { .name = "spin_lock" }; -static int torture_spin_lock_write_lock_irq(void) +static int torture_spin_lock_write_lock_irq(int tid __maybe_unused) __acquires(torture_spinlock) { unsigned long flags; @@ -211,7 +292,7 @@ __acquires(torture_spinlock) return 0; } -static void torture_lock_spin_write_unlock_irq(void) +static void torture_lock_spin_write_unlock_irq(int tid __maybe_unused) __releases(torture_spinlock) { spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags); @@ -220,7 +301,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_lock_spin_write_unlock_irq, .readlock = NULL, .read_delay = NULL, @@ -228,9 +309,117 @@ static struct lock_torture_ops spin_lock_irq_ops = { .name = "spin_lock_irq" }; +static DEFINE_RAW_SPINLOCK(torture_raw_spinlock); + +static int torture_raw_spin_lock_write_lock(int tid __maybe_unused) +__acquires(torture_raw_spinlock) +{ + raw_spin_lock(&torture_raw_spinlock); + return 0; +} + +static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused) +__releases(torture_raw_spinlock) +{ + raw_spin_unlock(&torture_raw_spinlock); +} + +static struct lock_torture_ops raw_spin_lock_ops = { + .writelock = torture_raw_spin_lock_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_spin_lock_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_spin_lock" +}; + +static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused) +__acquires(torture_raw_spinlock) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&torture_raw_spinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused) +__releases(torture_raw_spinlock) +{ + raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_spin_lock_irq_ops = { + .writelock = torture_raw_spin_lock_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_spin_lock_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_spin_lock_irq" +}; + +#ifdef CONFIG_BPF_SYSCALL + +#include <asm/rqspinlock.h> +static rqspinlock_t rqspinlock; + +static int torture_raw_res_spin_write_lock(int tid __maybe_unused) +{ + raw_res_spin_lock(&rqspinlock); + return 0; +} + +static void torture_raw_res_spin_write_unlock(int tid __maybe_unused) +{ + raw_res_spin_unlock(&rqspinlock); +} + +static struct lock_torture_ops raw_res_spin_lock_ops = { + .writelock = torture_raw_res_spin_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock" +}; + +static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused) +{ + unsigned long flags; + + raw_res_spin_lock_irqsave(&rqspinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused) +{ + raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_res_spin_lock_irq_ops = { + .writelock = torture_raw_res_spin_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock_irq" +}; + +#endif + static DEFINE_RWLOCK(torture_rwlock); -static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) +static int torture_rwlock_write_lock(int tid __maybe_unused) +__acquires(torture_rwlock) { write_lock(&torture_rwlock); return 0; @@ -239,24 +428,24 @@ static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } -static void torture_rwlock_write_unlock(void) __releases(torture_rwlock) +static void torture_rwlock_write_unlock(int tid __maybe_unused) +__releases(torture_rwlock) { write_unlock(&torture_rwlock); } -static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) +static int torture_rwlock_read_lock(int tid __maybe_unused) +__acquires(torture_rwlock) { read_lock(&torture_rwlock); return 0; @@ -265,19 +454,18 @@ static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) static void torture_rwlock_read_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 10; - const unsigned long longdelay_ms = 100; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealreaders_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } -static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) +static void torture_rwlock_read_unlock(int tid __maybe_unused) +__releases(torture_rwlock) { read_unlock(&torture_rwlock); } @@ -285,7 +473,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) static struct lock_torture_ops rw_lock_ops = { .writelock = torture_rwlock_write_lock, .write_delay = torture_rwlock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_rwlock_write_unlock, .readlock = torture_rwlock_read_lock, .read_delay = torture_rwlock_read_delay, @@ -293,7 +481,8 @@ static struct lock_torture_ops rw_lock_ops = { .name = "rw_lock" }; -static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) +static int torture_rwlock_write_lock_irq(int tid __maybe_unused) +__acquires(torture_rwlock) { unsigned long flags; @@ -302,13 +491,14 @@ static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) return 0; } -static void torture_rwlock_write_unlock_irq(void) +static void torture_rwlock_write_unlock_irq(int tid __maybe_unused) __releases(torture_rwlock) { write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); } -static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) +static int torture_rwlock_read_lock_irq(int tid __maybe_unused) +__acquires(torture_rwlock) { unsigned long flags; @@ -317,7 +507,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) return 0; } -static void torture_rwlock_read_unlock_irq(void) +static void torture_rwlock_read_unlock_irq(int tid __maybe_unused) __releases(torture_rwlock) { read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); @@ -326,7 +516,7 @@ __releases(torture_rwlock) static struct lock_torture_ops rw_lock_irq_ops = { .writelock = torture_rwlock_write_lock_irq, .write_delay = torture_rwlock_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_rwlock_write_unlock_irq, .readlock = torture_rwlock_read_lock_irq, .read_delay = torture_rwlock_read_delay, @@ -335,8 +525,31 @@ static struct lock_torture_ops rw_lock_irq_ops = { }; static DEFINE_MUTEX(torture_mutex); +static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS]; +static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS]; + +static void torture_mutex_init(void) +{ + int i; + + for (i = 0; i < MAX_NESTED_LOCKS; i++) + __mutex_init(&torture_nested_mutexes[i], __func__, + &nested_mutex_keys[i]); +} + +static int torture_mutex_nested_lock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = 0; i < nested_locks; i++) + if (lockset & (1 << i)) + mutex_lock(&torture_nested_mutexes[i]); + return 0; +} -static int torture_mutex_lock(void) __acquires(torture_mutex) +static int torture_mutex_lock(int tid __maybe_unused) +__acquires(torture_mutex) { mutex_lock(&torture_mutex); return 0; @@ -344,30 +557,37 @@ static int torture_mutex_lock(void) __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 5); - else - mdelay(longdelay_ms / 5); -#ifdef CONFIG_PREEMPT + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_mutex_unlock(void) __releases(torture_mutex) +static void torture_mutex_unlock(int tid __maybe_unused) +__releases(torture_mutex) { mutex_unlock(&torture_mutex); } +static void torture_mutex_nested_unlock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = nested_locks - 1; i >= 0; i--) + if (lockset & (1 << i)) + mutex_unlock(&torture_nested_mutexes[i]); +} + static struct lock_torture_ops mutex_lock_ops = { + .init = torture_mutex_init, + .nested_lock = torture_mutex_nested_lock, .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_mutex_unlock, + .nested_unlock = torture_mutex_nested_unlock, .readlock = NULL, .read_delay = NULL, .readunlock = NULL, @@ -375,12 +595,34 @@ static struct lock_torture_ops mutex_lock_ops = { }; #include <linux/ww_mutex.h> -static DEFINE_WW_CLASS(torture_ww_class); -static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); -static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); -static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); +/* + * The torture ww_mutexes should belong to the same lock class as + * torture_ww_class to avoid lockdep problem. The ww_mutex_init() + * function is called for initialization to ensure that. + */ +static DEFINE_WD_CLASS(torture_ww_class); +static struct ww_mutex torture_ww_mutex_0, torture_ww_mutex_1, torture_ww_mutex_2; +static struct ww_acquire_ctx *ww_acquire_ctxs; + +static void torture_ww_mutex_init(void) +{ + ww_mutex_init(&torture_ww_mutex_0, &torture_ww_class); + ww_mutex_init(&torture_ww_mutex_1, &torture_ww_class); + ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class); + + ww_acquire_ctxs = kmalloc_array(cxt.nrealwriters_stress, + sizeof(*ww_acquire_ctxs), + GFP_KERNEL); + if (!ww_acquire_ctxs) + VERBOSE_TOROUT_STRING("ww_acquire_ctx: Out of memory"); +} + +static void torture_ww_mutex_exit(void) +{ + kfree(ww_acquire_ctxs); +} -static int torture_ww_mutex_lock(void) +static int torture_ww_mutex_lock(int tid) __acquires(torture_ww_mutex_0) __acquires(torture_ww_mutex_1) __acquires(torture_ww_mutex_2) @@ -390,7 +632,7 @@ __acquires(torture_ww_mutex_2) struct list_head link; struct ww_mutex *lock; } locks[3], *ll, *ln; - struct ww_acquire_ctx ctx; + struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid]; locks[0].lock = &torture_ww_mutex_0; list_add(&locks[0].link, &list); @@ -401,12 +643,12 @@ __acquires(torture_ww_mutex_2) locks[2].lock = &torture_ww_mutex_2; list_add(&locks[2].link, &list); - ww_acquire_init(&ctx, &torture_ww_class); + ww_acquire_init(ctx, &torture_ww_class); list_for_each_entry(ll, &list, link) { int err; - err = ww_mutex_lock(ll->lock, &ctx); + err = ww_mutex_lock(ll->lock, ctx); if (!err) continue; @@ -417,28 +659,32 @@ __acquires(torture_ww_mutex_2) if (err != -EDEADLK) return err; - ww_mutex_lock_slow(ll->lock, &ctx); + ww_mutex_lock_slow(ll->lock, ctx); list_move(&ll->link, &list); } - ww_acquire_fini(&ctx); return 0; } -static void torture_ww_mutex_unlock(void) +static void torture_ww_mutex_unlock(int tid) __releases(torture_ww_mutex_0) __releases(torture_ww_mutex_1) __releases(torture_ww_mutex_2) { + struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid]; + ww_mutex_unlock(&torture_ww_mutex_0); ww_mutex_unlock(&torture_ww_mutex_1); ww_mutex_unlock(&torture_ww_mutex_2); + ww_acquire_fini(ctx); } static struct lock_torture_ops ww_mutex_lock_ops = { + .init = torture_ww_mutex_init, + .exit = torture_ww_mutex_exit, .writelock = torture_ww_mutex_lock, .write_delay = torture_mutex_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_ww_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -448,81 +694,85 @@ static struct lock_torture_ops ww_mutex_lock_ops = { #ifdef CONFIG_RT_MUTEXES static DEFINE_RT_MUTEX(torture_rtmutex); +static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS]; +static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS]; -static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) +static void torture_rtmutex_init(void) { - rt_mutex_lock(&torture_rtmutex); - return 0; + int i; + + for (i = 0; i < MAX_NESTED_LOCKS; i++) + __rt_mutex_init(&torture_nested_rtmutexes[i], __func__, + &nested_rtmutex_keys[i]); } -static void torture_rtmutex_boost(struct torture_random_state *trsp) +static int torture_rtmutex_nested_lock(int tid __maybe_unused, + u32 lockset) { - int policy; - struct sched_param param; - const unsigned int factor = 50000; /* yes, quite arbitrary */ + int i; - if (!rt_task(current)) { - /* - * Boost priority once every ~50k operations. When the - * task tries to take the lock, the rtmutex it will account - * for the new priority, and do any corresponding pi-dance. - */ - if (trsp && !(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { - policy = SCHED_FIFO; - param.sched_priority = MAX_RT_PRIO - 1; - } else /* common case, do nothing */ - return; - } else { - /* - * The task will remain boosted for another ~500k operations, - * then restored back to its original prio, and so forth. - * - * When @trsp is nil, we want to force-reset the task for - * stopping the kthread. - */ - if (!trsp || !(torture_random(trsp) % - (cxt.nrealwriters_stress * factor * 2))) { - policy = SCHED_NORMAL; - param.sched_priority = 0; - } else /* common case, do nothing */ - return; - } + for (i = 0; i < nested_locks; i++) + if (lockset & (1 << i)) + rt_mutex_lock(&torture_nested_rtmutexes[i]); + return 0; +} - sched_setscheduler_nocheck(current, policy, ¶m); +static int torture_rtmutex_lock(int tid __maybe_unused) +__acquires(torture_rtmutex) +{ + rt_mutex_lock(&torture_rtmutex); + return 0; } static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; /* * We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) +static void torture_rtmutex_unlock(int tid __maybe_unused) +__releases(torture_rtmutex) { rt_mutex_unlock(&torture_rtmutex); } +static void torture_rt_boost_rtmutex(struct torture_random_state *trsp) +{ + if (!rt_boost) + return; + + __torture_rt_boost(trsp); +} + +static void torture_rtmutex_nested_unlock(int tid __maybe_unused, + u32 lockset) +{ + int i; + + for (i = nested_locks - 1; i >= 0; i--) + if (lockset & (1 << i)) + rt_mutex_unlock(&torture_nested_rtmutexes[i]); +} + static struct lock_torture_ops rtmutex_lock_ops = { + .init = torture_rtmutex_init, + .nested_lock = torture_rtmutex_nested_lock, .writelock = torture_rtmutex_lock, .write_delay = torture_rtmutex_delay, - .task_boost = torture_rtmutex_boost, + .task_boost = torture_rt_boost_rtmutex, .writeunlock = torture_rtmutex_unlock, + .nested_unlock = torture_rtmutex_nested_unlock, .readlock = NULL, .read_delay = NULL, .readunlock = NULL, @@ -531,7 +781,8 @@ static struct lock_torture_ops rtmutex_lock_ops = { #endif static DECLARE_RWSEM(torture_rwsem); -static int torture_rwsem_down_write(void) __acquires(torture_rwsem) +static int torture_rwsem_down_write(int tid __maybe_unused) +__acquires(torture_rwsem) { down_write(&torture_rwsem); return 0; @@ -539,26 +790,21 @@ static int torture_rwsem_down_write(void) __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 10); - else - mdelay(longdelay_ms / 10); -#ifdef CONFIG_PREEMPT + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_rwsem_up_write(void) __releases(torture_rwsem) +static void torture_rwsem_up_write(int tid __maybe_unused) +__releases(torture_rwsem) { up_write(&torture_rwsem); } -static int torture_rwsem_down_read(void) __acquires(torture_rwsem) +static int torture_rwsem_down_read(int tid __maybe_unused) +__acquires(torture_rwsem) { down_read(&torture_rwsem); return 0; @@ -566,21 +812,17 @@ static int torture_rwsem_down_read(void) __acquires(torture_rwsem) static void torture_rwsem_read_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 2); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold * 2); else - mdelay(longdelay_ms / 2); -#ifdef CONFIG_PREEMPT + mdelay(long_hold / 2); if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_rwsem_up_read(void) __releases(torture_rwsem) +static void torture_rwsem_up_read(int tid __maybe_unused) +__releases(torture_rwsem) { up_read(&torture_rwsem); } @@ -588,7 +830,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem) static struct lock_torture_ops rwsem_lock_ops = { .writelock = torture_rwsem_down_write, .write_delay = torture_rwsem_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_rwsem_up_write, .readlock = torture_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -599,38 +841,48 @@ static struct lock_torture_ops rwsem_lock_ops = { #include <linux/percpu-rwsem.h> static struct percpu_rw_semaphore pcpu_rwsem; -void torture_percpu_rwsem_init(void) +static void torture_percpu_rwsem_init(void) { BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } -static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) +static void torture_percpu_rwsem_exit(void) +{ + percpu_free_rwsem(&pcpu_rwsem); +} + +static int torture_percpu_rwsem_down_write(int tid __maybe_unused) +__acquires(pcpu_rwsem) { percpu_down_write(&pcpu_rwsem); return 0; } -static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) +static void torture_percpu_rwsem_up_write(int tid __maybe_unused) +__releases(pcpu_rwsem) { percpu_up_write(&pcpu_rwsem); } -static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) +static int torture_percpu_rwsem_down_read(int tid __maybe_unused) +__acquires(pcpu_rwsem) { percpu_down_read(&pcpu_rwsem); return 0; } -static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) +static void torture_percpu_rwsem_up_read(int tid __maybe_unused) +__releases(pcpu_rwsem) { percpu_up_read(&pcpu_rwsem); } static struct lock_torture_ops percpu_rwsem_lock_ops = { .init = torture_percpu_rwsem_init, + .exit = torture_percpu_rwsem_exit, .writelock = torture_percpu_rwsem_down_write, .write_delay = torture_rwsem_write_delay, - .task_boost = torture_boost_dummy, + .task_boost = torture_rt_boost, .writeunlock = torture_percpu_rwsem_up_write, .readlock = torture_percpu_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -644,28 +896,63 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { */ static int lock_torture_writer(void *arg) { + unsigned long j; + unsigned long j1; + u32 lockset_mask; struct lock_stress_stats *lwsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); + bool skip_main_lock; + int tid = lwsp - cxt.lwsa; VERBOSE_TOROUT_STRING("lock_torture_writer task started"); - set_user_nice(current, MAX_NICE); + if (!rt_task(current)) + set_user_nice(current, MAX_NICE); do { if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cxt.cur_ops->task_boost(&rand); - cxt.cur_ops->writelock(); - if (WARN_ON_ONCE(lock_is_write_held)) - lwsp->n_lock_fail++; - lock_is_write_held = 1; - if (WARN_ON_ONCE(lock_is_read_held)) - lwsp->n_lock_fail++; /* rare, but... */ + lockset_mask = torture_random(&rand); + /* + * When using nested_locks, we want to occasionally + * skip the main lock so we can avoid always serializing + * the lock chains on that central lock. By skipping the + * main lock occasionally, we can create different + * contention patterns (allowing for multiple disjoint + * blocked trees) + */ + skip_main_lock = (nested_locks && + !(torture_random(&rand) % 100)); - lwsp->n_lock_acquired++; - cxt.cur_ops->write_delay(&rand); - lock_is_write_held = 0; - cxt.cur_ops->writeunlock(); + cxt.cur_ops->task_boost(&rand); + if (cxt.cur_ops->nested_lock) + cxt.cur_ops->nested_lock(tid, lockset_mask); + + if (!skip_main_lock) { + if (acq_writer_lim > 0) + j = jiffies; + cxt.cur_ops->writelock(tid); + if (WARN_ON_ONCE(lock_is_write_held)) + lwsp->n_lock_fail++; + lock_is_write_held = true; + if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) + lwsp->n_lock_fail++; /* rare, but... */ + if (acq_writer_lim > 0) { + j1 = jiffies; + WARN_ONCE(time_after(j1, j + acq_writer_lim), + "%s: Lock acquisition took %lu jiffies.\n", + __func__, j1 - j); + } + lwsp->n_lock_acquired++; + + cxt.cur_ops->write_delay(&rand); + + lock_is_write_held = false; + WRITE_ONCE(last_lock_release, jiffies); + cxt.cur_ops->writeunlock(tid); + } + if (cxt.cur_ops->nested_unlock) + cxt.cur_ops->nested_unlock(tid, lockset_mask); stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); @@ -682,7 +969,8 @@ static int lock_torture_writer(void *arg) static int lock_torture_reader(void *arg) { struct lock_stress_stats *lrsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + int tid = lrsp - cxt.lrsa; + DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_reader task started"); set_user_nice(current, MAX_NICE); @@ -691,15 +979,15 @@ static int lock_torture_reader(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cxt.cur_ops->readlock(); - lock_is_read_held = 1; + cxt.cur_ops->readlock(tid); + atomic_inc(&lock_is_read_held); if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = 0; - cxt.cur_ops->readunlock(); + atomic_dec(&lock_is_read_held); + cxt.cur_ops->readunlock(tid); stutter_wait("lock_torture_reader"); } while (!torture_must_stop()); @@ -713,26 +1001,28 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { - bool fail = 0; + long cur; + bool fail = false; int i, n_stress; - long max = 0; - long min = statp[0].n_lock_acquired; + long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0; long long sum = 0; n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; for (i = 0; i < n_stress; i++) { - if (statp[i].n_lock_fail) + if (data_race(statp[i].n_lock_fail)) fail = true; - sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_fail) - max = statp[i].n_lock_fail; - if (min > statp[i].n_lock_fail) - min = statp[i].n_lock_fail; + cur = data_race(statp[i].n_lock_acquired); + sum += cur; + if (max < cur) + max = cur; + if (min > cur) + min = cur; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", write ? "Writes" : "Reads ", - sum, max, min, max / 2 > min ? "???" : "", + sum, max, min, + !onoff_interval && max / 2 > min ? "???" : "", fail, fail ? "!!!" : ""); if (fail) atomic_inc(&cxt.n_lock_torture_errors); @@ -798,16 +1088,69 @@ static int lock_torture_stats(void *arg) return 0; } + static inline void lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { + static cpumask_t cpumask_all; + cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all; + cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all; + + cpumask_setall(&cpumask_all); pr_alert("%s" TORTURE_FLAG - "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", - cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval, - verbose, shuffle_interval, stutter, shutdown_secs, - onoff_interval, onoff_holdoff); + acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp), + call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress, + cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost, + rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter, + verbose, writer_fifo); +} + +// If requested, maintain call_rcu() chains to keep a grace period always +// in flight. These increase the probability of getting an RCU CPU stall +// warning and associated diagnostics when a locking primitive stalls. + +static void call_rcu_chain_cb(struct rcu_head *rhp) +{ + struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh); + + if (!smp_load_acquire(&crcp->crc_stop)) { + (void)start_poll_synchronize_rcu(); // Start one grace period... + call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another. + } +} + +// Start the requested number of call_rcu() chains. +static int call_rcu_chain_init(void) +{ + int i; + + if (call_rcu_chains <= 0) + return 0; + call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL); + if (!call_rcu_chain_list) + return -ENOMEM; + for (i = 0; i < call_rcu_chains; i++) { + call_rcu_chain_list[i].crc_stop = false; + call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb); + } + return 0; +} + +// Stop all of the call_rcu() chains. +static void call_rcu_chain_cleanup(void) +{ + int i; + + if (!call_rcu_chain_list) + return; + for (i = 0; i < call_rcu_chains; i++) + smp_store_release(&call_rcu_chain_list[i].crc_stop, true); + rcu_barrier(); + kfree(call_rcu_chain_list); + call_rcu_chain_list = NULL; } static void lock_torture_cleanup(void) @@ -819,17 +1162,17 @@ static void lock_torture_cleanup(void) /* * Indicates early cleanup, meaning that the test has not run, - * such as when passing bogus args when loading the module. As - * such, only perform the underlying torture-specific cleanups, - * and avoid anything related to locktorture. + * such as when passing bogus args when loading the module. + * However cxt->cur_ops.init() may have been invoked, so beside + * perform the underlying torture-specific cleanups, cur_ops.exit() + * will be invoked if needed. */ - if (!cxt.lwsa) + if (!cxt.lwsa && !cxt.lrsa) goto end; if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) - torture_stop_kthread(lock_torture_writer, - writer_tasks[i]); + torture_stop_kthread(lock_torture_writer, writer_tasks[i]); kfree(writer_tasks); writer_tasks = NULL; } @@ -856,9 +1199,22 @@ static void lock_torture_cleanup(void) "End of test: SUCCESS"); kfree(cxt.lwsa); + cxt.lwsa = NULL; kfree(cxt.lrsa); + cxt.lrsa = NULL; + + call_rcu_chain_cleanup(); end: + if (cxt.init_called) { + if (cxt.cur_ops->exit) + cxt.cur_ops->exit(); + cxt.init_called = false; + } + + free_cpumask_var(bind_readers); + free_cpumask_var(bind_writers); + torture_cleanup_end(); } @@ -869,6 +1225,10 @@ static int __init lock_torture_init(void) static struct lock_torture_ops *torture_ops[] = { &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + &raw_spin_lock_ops, &raw_spin_lock_irq_ops, +#ifdef CONFIG_BPF_SYSCALL + &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops, +#endif &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, &ww_mutex_lock_ops, @@ -879,7 +1239,7 @@ static int __init lock_torture_init(void) &percpu_rwsem_lock_ops, }; - if (!torture_init_begin(torture_type, verbose, &torture_runnable)) + if (!torture_init_begin(torture_type, verbose)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ @@ -898,40 +1258,54 @@ static int __init lock_torture_init(void) firsterr = -EINVAL; goto unwind; } - if (cxt.cur_ops->init) - cxt.cur_ops->init(); + + if (nwriters_stress == 0 && + (!cxt.cur_ops->readlock || nreaders_stress == 0)) { + pr_alert("lock-torture: must run at least one locking thread\n"); + firsterr = -EINVAL; + goto unwind; + } if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; else cxt.nrealwriters_stress = 2 * num_online_cpus(); + if (cxt.cur_ops->init) { + cxt.cur_ops->init(); + cxt.init_called = true; + } + #ifdef CONFIG_DEBUG_MUTEXES - if (strncmp(torture_type, "mutex", 5) == 0) + if (str_has_prefix(torture_type, "mutex")) cxt.debug_lock = true; #endif #ifdef CONFIG_DEBUG_RT_MUTEXES - if (strncmp(torture_type, "rtmutex", 7) == 0) + if (str_has_prefix(torture_type, "rtmutex")) cxt.debug_lock = true; #endif #ifdef CONFIG_DEBUG_SPINLOCK - if ((strncmp(torture_type, "spin", 4) == 0) || - (strncmp(torture_type, "rw_lock", 7) == 0)) + if ((str_has_prefix(torture_type, "spin")) || + (str_has_prefix(torture_type, "rw_lock"))) cxt.debug_lock = true; #endif /* Initialize the statistics so that each run gets its own numbers. */ + if (nwriters_stress) { + lock_is_write_held = false; + cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, + sizeof(*cxt.lwsa), + GFP_KERNEL); + if (cxt.lwsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } - lock_is_write_held = 0; - cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); - if (cxt.lwsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < cxt.nrealwriters_stress; i++) { - cxt.lwsa[i].n_lock_fail = 0; - cxt.lwsa[i].n_lock_acquired = 0; + for (i = 0; i < cxt.nrealwriters_stress; i++) { + cxt.lwsa[i].n_lock_fail = 0; + cxt.lwsa[i].n_lock_acquired = 0; + } } if (cxt.cur_ops->readlock) { @@ -948,61 +1322,76 @@ static int __init lock_torture_init(void) cxt.nrealreaders_stress = cxt.nrealwriters_stress; } - lock_is_read_held = 0; - cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); - if (cxt.lrsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); - firsterr = -ENOMEM; - kfree(cxt.lwsa); - cxt.lwsa = NULL; - goto unwind; - } - - for (i = 0; i < cxt.nrealreaders_stress; i++) { - cxt.lrsa[i].n_lock_fail = 0; - cxt.lrsa[i].n_lock_acquired = 0; + if (nreaders_stress) { + cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, + sizeof(*cxt.lrsa), + GFP_KERNEL); + if (cxt.lrsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); + firsterr = -ENOMEM; + kfree(cxt.lwsa); + cxt.lwsa = NULL; + goto unwind; + } + + for (i = 0; i < cxt.nrealreaders_stress; i++) { + cxt.lrsa[i].n_lock_fail = 0; + cxt.lrsa[i].n_lock_acquired = 0; + } } } + firsterr = call_rcu_chain_init(); + if (torture_init_error(firsterr)) + goto unwind; + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, - onoff_interval * HZ); - if (firsterr) + onoff_interval * HZ, NULL); + if (torture_init_error(firsterr)) goto unwind; } if (shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (shutdown_secs > 0) { firsterr = torture_shutdown_init(shutdown_secs, lock_torture_cleanup); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stutter > 0) { - firsterr = torture_stutter_init(stutter); - if (firsterr) + firsterr = torture_stutter_init(stutter, stutter); + if (torture_init_error(firsterr)) goto unwind; } - writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), - GFP_KERNEL); - if (writer_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nwriters_stress) { + writer_tasks = kcalloc(cxt.nrealwriters_stress, + sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } + /* cap nested_locks to MAX_NESTED_LOCKS */ + if (nested_locks > MAX_NESTED_LOCKS) + nested_locks = MAX_NESTED_LOCKS; + if (cxt.cur_ops->readlock) { - reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]), + reader_tasks = kcalloc(cxt.nrealreaders_stress, + sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + TOROUT_ERRSTRING("reader_tasks: Out of memory"); kfree(writer_tasks); writer_tasks = NULL; firsterr = -ENOMEM; @@ -1024,10 +1413,13 @@ static int __init lock_torture_init(void) goto create_reader; /* Create writer. */ - firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], - writer_tasks[i]); - if (firsterr) + firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i], + writer_tasks[i], + writer_fifo ? sched_set_fifo : NULL); + if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(bind_writers)) + torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers, true); create_reader: if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) @@ -1035,13 +1427,15 @@ static int __init lock_torture_init(void) /* Create reader. */ firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j], reader_tasks[j]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(bind_readers)) + torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers, true); } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, stats_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } torture_init_end(); @@ -1050,6 +1444,10 @@ static int __init lock_torture_init(void) unwind: torture_init_end(); lock_torture_cleanup(); + if (shutdown_secs) { + WARN_ON(!IS_MODULE(CONFIG_LOCK_TORTURE_TEST)); + kernel_power_off(); + } return firsterr; } diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 6a385aabcce7..5c92ba199b90 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * MCS lock defines * @@ -6,7 +7,7 @@ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock * with the desirable properties of being fair, and with each cpu trying * to acquire the lock spinning on a local variable. - * It avoids expensive cache bouncings that common test-and-set spin-lock + * It avoids expensive cache bounces that common test-and-set spin-lock * implementations incur. */ #ifndef __LINUX_MCS_SPINLOCK_H @@ -14,22 +15,16 @@ #include <asm/mcs_spinlock.h> -struct mcs_spinlock { - struct mcs_spinlock *next; - int locked; /* 1 if lock acquired */ - int count; /* nesting count, see qspinlock.c */ -}; - #ifndef arch_mcs_spin_lock_contended /* - * Using smp_load_acquire() provides a memory barrier that ensures - * subsequent operations happen after the lock is acquired. + * Using smp_cond_load_acquire() provides the acquire semantics + * required so that subsequent operations happen after the + * lock is acquired. Additionally, some architectures such as + * ARM64 would like to do spin-waiting instead of purely + * spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ -do { \ - while (!(smp_load_acquire(l))) \ - cpu_relax(); \ -} while (0) + smp_cond_load_acquire(l, VAL) #endif #ifndef arch_mcs_spin_unlock_contended diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 9aa713629387..2c6b02d4699b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -1,6 +1,4 @@ /* - * kernel/mutex-debug.c - * * Debugging code for mutexes * * Started by Ingo Molnar: @@ -14,6 +12,7 @@ */ #include <linux/mutex.h> #include <linux/delay.h> +#include <linux/device.h> #include <linux/export.h> #include <linux/poison.h> #include <linux/sched.h> @@ -22,7 +21,7 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> -#include "mutex-debug.h" +#include "mutex.h" /* * Must be called with lock->wait_lock held. @@ -32,11 +31,12 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; INIT_LIST_HEAD(&waiter->list); + waiter->ww_ctx = MUTEX_POISON_WW_CTX; } void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) { - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + lockdep_assert_held(&lock->wait_lock); DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); @@ -51,21 +51,22 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + lockdep_assert_held(&lock->wait_lock); - /* Mark the current thread as blocked on the lock: */ - task->blocked_on = waiter; + /* Current thread can't be already blocked (since it's executing!) */ + DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task)); } -void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, +void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { + struct mutex *blocked_on = __get_task_blocked_on(task); + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); - task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); - list_del_init(&waiter->list); + INIT_LIST_HEAD(&waiter->list); waiter->task = NULL; } @@ -77,19 +78,22 @@ void debug_mutex_unlock(struct mutex *lock) } } -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) +void debug_mutex_init(struct mutex *lock) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif lock->magic = lock; } +static void devm_mutex_release(void *res) +{ + mutex_destroy(res); +} + +int __devm_mutex_init(struct device *dev, struct mutex *lock) +{ + return devm_add_action_or_reset(dev, devm_mutex_release, lock); +} +EXPORT_SYMBOL_GPL(__devm_mutex_init); + /*** * mutex_destroy - mark a mutex unusable * @lock: the mutex to be destroyed diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h deleted file mode 100644 index 4174417d5309..000000000000 --- a/kernel/locking/mutex-debug.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Mutexes: blocking mutual exclusion locks - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * - * This file contains mutex debugging related internal declarations, - * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. - * More details are in kernel/mutex-debug.c. - */ - -/* - * This must be called with lock->wait_lock held. - */ -extern void debug_mutex_lock_common(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_wake_waiter(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); -extern void debug_mutex_add_waiter(struct mutex *lock, - struct mutex_waiter *waiter, - struct task_struct *task); -extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct task_struct *task); -extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 858a07590e39..2a1d165b3167 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/locking/mutex.c * @@ -15,7 +16,7 @@ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale * and Sven Dietrich. * - * Also see Documentation/locking/mutex-design.txt. + * Also see Documentation/locking/mutex-design.rst. */ #include <linux/mutex.h> #include <linux/ww_mutex.h> @@ -28,101 +29,108 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#include <linux/hung_task.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/lock.h> + +#ifndef CONFIG_PREEMPT_RT +#include "mutex.h" #ifdef CONFIG_DEBUG_MUTEXES -# include "mutex-debug.h" +# define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond) #else -# include "mutex.h" +# define MUTEX_WARN_ON(cond) #endif -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); - spin_lock_init(&lock->wait_lock); + raw_spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif - - debug_mutex_init(lock, name, key); + debug_mutex_init(lock); } -EXPORT_SYMBOL(__mutex_init); - -/* - * @owner: contains: 'struct task_struct *' to the current lock owner, - * NULL means not owned. Since task_struct pointers are aligned at - * at least L1_CACHE_BYTES, we have low bits to store extra state. - * - * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. - * Bit1 indicates unlock needs to hand the lock to the top-waiter - * Bit2 indicates handoff has been done and we're waiting for pickup. - */ -#define MUTEX_FLAG_WAITERS 0x01 -#define MUTEX_FLAG_HANDOFF 0x02 -#define MUTEX_FLAG_PICKUP 0x04 - -#define MUTEX_FLAGS 0x07 static inline struct task_struct *__owner_task(unsigned long owner) { return (struct task_struct *)(owner & ~MUTEX_FLAGS); } +bool mutex_is_locked(struct mutex *lock) +{ + return __mutex_owner(lock) != NULL; +} +EXPORT_SYMBOL(mutex_is_locked); + static inline unsigned long __owner_flags(unsigned long owner) { return owner & MUTEX_FLAGS; } +/* Do not use the return value as a pointer directly. */ +unsigned long mutex_get_owner(struct mutex *lock) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + return (unsigned long)__owner_task(owner); +} + /* - * Trylock variant that retuns the owning task on failure. + * Returns: __mutex_owner(lock) on failure or NULL on success. */ -static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) +static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ - unsigned long old, flags = __owner_flags(owner); + unsigned long flags = __owner_flags(owner); unsigned long task = owner & ~MUTEX_FLAGS; if (task) { - if (likely(task != curr)) - break; - - if (likely(!(flags & MUTEX_FLAG_PICKUP))) + if (flags & MUTEX_FLAG_PICKUP) { + if (task != curr) + break; + flags &= ~MUTEX_FLAG_PICKUP; + } else if (handoff) { + if (flags & MUTEX_FLAG_HANDOFF) + break; + flags |= MUTEX_FLAG_HANDOFF; + } else { break; - - flags &= ~MUTEX_FLAG_PICKUP; + } } else { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP)); + task = curr; } - /* - * We set the HANDOFF bit, we must make sure it doesn't live - * past the point where we acquire it. This would be possible - * if we (accidentally) set the bit on an unlocked mutex. - */ - flags &= ~MUTEX_FLAG_HANDOFF; - - old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); - if (old == owner) - return NULL; - - owner = old; + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) { + if (task == curr) + return NULL; + break; + } } return __owner_task(owner); } /* + * Trylock or set HANDOFF + */ +static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff) +{ + return !__mutex_trylock_common(lock, handoff); +} + +/* * Actual trylock that will work on any unlocked state. */ static inline bool __mutex_trylock(struct mutex *lock) { - return !__mutex_trylock_or_owner(lock); + return !__mutex_trylock_common(lock, false); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -131,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock) * There is nothing that would stop spreading the lockdep annotations outwards * except more code. */ +void mutex_init_generic(struct mutex *lock) +{ + __mutex_init_generic(lock); +} +EXPORT_SYMBOL(mutex_init_generic); /* * Optimistic trylock that only works in the uncontended case. Make sure to @@ -139,8 +152,11 @@ static inline bool __mutex_trylock(struct mutex *lock) static __always_inline bool __mutex_trylock_fast(struct mutex *lock) { unsigned long curr = (unsigned long)current; + unsigned long zero = 0UL; + + MUTEX_WARN_ON(lock->magic != lock); - if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; return false; @@ -150,12 +166,23 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) { unsigned long curr = (unsigned long)current; - if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) - return true; + return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); +} - return false; +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key) +{ + __mutex_init_generic(lock); + + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); } -#endif +EXPORT_SYMBOL(mutex_init_lockep); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) { @@ -173,8 +200,35 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait } /* + * Add @waiter to a given location in the lock wait_list and set the + * FLAG_WAITERS flag if it's the first waiter. + */ +static void +__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct list_head *list) +{ + hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); + debug_mutex_add_waiter(lock, waiter, current); + + list_add_tail(&waiter->list, list); + if (__mutex_waiter_is_first(lock, waiter)) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); +} + +static void +__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) +{ + list_del(&waiter->list); + if (likely(list_empty(&lock->wait_list))) + __mutex_clear_flag(lock, MUTEX_FLAGS); + + debug_mutex_remove_waiter(lock, waiter, current); + hung_task_clear_blocker(); +} + +/* * Give up ownership to a specific task, when @task = NULL, this is equivalent - * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves + * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves * WAITERS. Provides RELEASE semantics like a regular unlock, the * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff. */ @@ -183,23 +237,18 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) unsigned long owner = atomic_long_read(&lock->owner); for (;;) { - unsigned long old, new; + unsigned long new; -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); - DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(__owner_task(owner) != current); + MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); new = (owner & MUTEX_FLAG_WAITERS); new |= (unsigned long)task; if (task) new |= MUTEX_FLAG_PICKUP; - old = atomic_long_cmpxchg_release(&lock->owner, owner, new); - if (old == owner) + if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new)) break; - - owner = old; } } @@ -243,135 +292,18 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -static __always_inline void -ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) -{ -#ifdef CONFIG_DEBUG_MUTEXES - /* - * If this WARN_ON triggers, you used ww_mutex_lock to acquire, - * but released with a normal mutex_unlock in this call. - * - * This should never happen, always use ww_mutex_unlock. - */ - DEBUG_LOCKS_WARN_ON(ww->ctx); - - /* - * Not quite done after calling ww_acquire_done() ? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); - - if (ww_ctx->contending_lock) { - /* - * After -EDEADLK you tried to - * acquire a different ww_mutex? Bad! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - - /* - * You called ww_mutex_lock after receiving -EDEADLK, - * but 'forgot' to unlock everything else first? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); - ww_ctx->contending_lock = NULL; - } - - /* - * Naughty, using a different class will lead to undefined behavior! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); -#endif - ww_ctx->acquired++; -} - -static inline bool __sched -__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) -{ - return a->stamp - b->stamp <= LONG_MAX && - (a->stamp != b->stamp || a > b); -} - -/* - * Wake up any waiters that may have to back off when the lock is held by the - * given context. - * - * Due to the invariants on the wait list, this can only affect the first - * waiter with a context. - * - * The current task must not be on the wait list. - */ -static void __sched -__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) -{ - struct mutex_waiter *cur; - - lockdep_assert_held(&lock->wait_lock); +#include "ww_mutex.h" - list_for_each_entry(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - if (cur->ww_ctx->acquired > 0 && - __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } - - break; - } -} - -/* - * After acquiring lock with fastpath or when we lost out in contested - * slowpath, set ctx and wake up any waiters so they can recheck. - */ -static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - ww_mutex_lock_acquired(lock, ctx); - - lock->ctx = ctx; - - /* - * The lock->ctx update should be visible on all cores before - * the atomic read is done, otherwise contended waiters might be - * missed. The contended waiters will either see ww_ctx == NULL - * and keep spinning, or it will acquire wait_lock, add itself - * to waiter list and sleep. - */ - smp_mb(); /* ^^^ */ - - /* - * Check if lock is contended, if not there is nobody to wake up - */ - if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) - return; - - /* - * Uh oh, we raced in fastpath, wake up everyone in this case, - * so they can see the new lock->ctx. - */ - spin_lock(&lock->base.wait_lock); - __ww_mutex_wakeup_for_backoff(&lock->base, ctx); - spin_unlock(&lock->base.wait_lock); -} +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * After acquiring lock in the slowpath set ctx. - * - * Unlike for the fast path, the caller ensures that waiters are woken up where - * necessary. - * - * Callers must hold the mutex wait_lock. + * Trylock variant that returns the owning task on failure. */ -static __always_inline void -ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) { - ww_mutex_lock_acquired(lock, ctx); - lock->ctx = ctx; + return __mutex_trylock_common(lock, false); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - static inline bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter) @@ -426,21 +358,23 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, { bool ret = true; - rcu_read_lock(); + lockdep_assert_preemption_disabled(); + while (__mutex_owner(lock) == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. + * checking lock->owner still matches owner. And we already + * disabled preemption which is equal to the RCU read-side + * crital section in optimistic spinning code. Thus the + * task_strcut structure won't go away during the spinning + * period */ barrier(); /* * Use vcpu_is_preempted to detect lock holder preemption issue. */ - if (!owner->on_cpu || need_resched() || - vcpu_is_preempted(task_cpu(owner))) { + if (!owner_on_cpu(owner) || need_resched()) { ret = false; break; } @@ -452,7 +386,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, cpu_relax(); } - rcu_read_unlock(); return ret; } @@ -465,19 +398,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + lockdep_assert_preemption_disabled(); + if (need_resched()) return 0; - rcu_read_lock(); - owner = __mutex_owner(lock); - /* - * As lock holder preemption issue, we both skip spinning if task is not - * on cpu or its cpu is preempted + * We already disabled preemption which is equal to the RCU read-side + * crital section in optimistic spinning code. Thus the task_strcut + * structure won't go away during the spinning period. */ + owner = __mutex_owner(lock); if (owner) - retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); - rcu_read_unlock(); + retval = owner_on_cpu(owner); /* * If lock->owner is not set, the mutex has been released. Return true @@ -510,7 +443,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) */ static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, struct mutex_waiter *waiter) + struct mutex_waiter *waiter) { if (!waiter) { /* @@ -586,7 +519,7 @@ fail: #else static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, struct mutex_waiter *waiter) + struct mutex_waiter *waiter) { return false; } @@ -603,6 +536,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne * This function must not be used in interrupt context. Unlocking * of a not locked mutex is not allowed. * + * The caller must ensure that the mutex stays alive until this function has + * returned - mutex_unlock() can NOT directly be used to release an object such + * that another concurrent task can free it. + * Mutexes are different from spinlocks & refcounts in this aspect. + * * This function is similar to (but not equivalent to) up(). */ void __sched mutex_unlock(struct mutex *lock) @@ -628,182 +566,102 @@ EXPORT_SYMBOL(mutex_unlock); */ void __sched ww_mutex_unlock(struct ww_mutex *lock) { - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ - if (lock->ctx) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); -#endif - if (lock->ctx->acquired > 0) - lock->ctx->acquired--; - lock->ctx = NULL; - } - + __ww_mutex_unlock(lock); mutex_unlock(&lock->base); } EXPORT_SYMBOL(ww_mutex_unlock); -static inline int __sched -__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ctx) -{ - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); - struct mutex_waiter *cur; - - if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) - goto deadlock; - - /* - * If there is a waiter in front of us that has a context, then its - * stamp is earlier than ours and we must back off. - */ - cur = waiter; - list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { - if (cur->ww_ctx) - goto deadlock; - } - - return 0; - -deadlock: -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; -#endif - return -EDEADLK; -} - -static inline int __sched -__ww_mutex_add_waiter(struct mutex_waiter *waiter, - struct mutex *lock, - struct ww_acquire_ctx *ww_ctx) -{ - struct mutex_waiter *cur; - struct list_head *pos; - - if (!ww_ctx) { - list_add_tail(&waiter->list, &lock->wait_list); - return 0; - } - - /* - * Add the waiter before the first waiter with a higher stamp. - * Waiters without a context are skipped to avoid starving - * them. - */ - pos = &lock->wait_list; - list_for_each_entry_reverse(cur, &lock->wait_list, list) { - if (!cur->ww_ctx) - continue; - - if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { - /* Back off immediately if necessary. */ - if (ww_ctx->acquired > 0) { -#ifdef CONFIG_DEBUG_MUTEXES - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); - ww_ctx->contending_lock = ww; -#endif - return -EDEADLK; - } - - break; - } - - pos = &cur->list; - - /* - * Wake up the waiter so that it gets a chance to back - * off. - */ - if (cur->ww_ctx->acquired > 0) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } - } - - list_add_tail(&waiter->list, pos); - return 0; -} - /* * Lock a mutex (possibly interruptible), slowpath: */ static __always_inline int __sched -__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, +__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { + DEFINE_WAKE_Q(wake_q); struct mutex_waiter waiter; - bool first = false; struct ww_mutex *ww; + unsigned long flags; int ret; + if (!use_ww_ctx) + ww_ctx = NULL; + might_sleep(); + MUTEX_WARN_ON(lock->magic != lock); + ww = container_of(lock, struct ww_mutex, base); - if (use_ww_ctx && ww_ctx) { + if (ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + nest_lock = &ww_ctx->dep_map; +#endif } preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); if (__mutex_trylock(lock) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) { + mutex_optimistic_spin(lock, ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx && ww_ctx) + if (ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); + trace_contention_end(lock, 0); preempt_enable(); return 0; } - spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* * After waiting to acquire the wait_lock, try again. */ if (__mutex_trylock(lock)) { - if (use_ww_ctx && ww_ctx) - __ww_mutex_wakeup_for_backoff(lock, ww_ctx); + if (ww_ctx) + __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); goto skip_wait; } debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, current); + waiter.task = current; + if (use_ww_ctx) + waiter.ww_ctx = ww_ctx; lock_contended(&lock->dep_map, ip); if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); - -#ifdef CONFIG_DEBUG_MUTEXES - waiter.ww_ctx = MUTEX_POISON_WW_CTX; -#endif + __mutex_add_waiter(lock, &waiter, &lock->wait_list); } else { - /* Add in stamp order, waking up waiters that must back off. */ - ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); + /* + * Add in stamp order, waking up waiters that must kill + * themselves. + */ + ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx, &wake_q); if (ret) - goto err_early_backoff; - - waiter.ww_ctx = ww_ctx; + goto err_early_kill; } - waiter.task = current; - - if (__mutex_waiter_is_first(lock, &waiter)) - __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); - + __set_task_blocked_on(current, lock); set_current_state(state); + trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { + bool first; + /* * Once we hold wait_lock, we're serialized against * mutex_unlock() handing the lock off to us, do a trylock @@ -814,92 +672,154 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto acquired; /* - * Check for signals and wound conditions while holding + * Check for signals and kill conditions while holding * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { ret = -EINTR; goto err; } - if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { - ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx); + if (ww_ctx) { + ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx); if (ret) goto err; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); + schedule_preempt_disabled(); + first = __mutex_waiter_is_first(lock, &waiter); + /* - * ww_mutex needs to always recheck its position since its waiter - * list is not FIFO ordered. + * As we likely have been woken up by task + * that has cleared our blocked_on state, re-set + * it to the lock we are trying to acquire. */ - if ((use_ww_ctx && ww_ctx) || !first) { - first = __mutex_waiter_is_first(lock, &waiter); - if (first) - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); - } - + set_task_blocked_on(current, lock); set_current_state(state); /* * Here we order against unlock; we must either see it change * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if (__mutex_trylock(lock) || - (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) + if (__mutex_trylock_or_handoff(lock, first)) break; - spin_lock(&lock->wait_lock); + if (first) { + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + /* + * mutex_optimistic_spin() can call schedule(), so + * clear blocked on so we don't become unselectable + * to run. + */ + clear_task_blocked_on(current, lock); + if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) + break; + set_task_blocked_on(current, lock); + trace_contention_begin(lock, LCB_F_MUTEX); + } + + raw_spin_lock_irqsave(&lock->wait_lock, flags); } - spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); acquired: + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current); - if (likely(list_empty(&lock->wait_list))) - __mutex_clear_flag(lock, MUTEX_FLAGS); + if (ww_ctx) { + /* + * Wound-Wait; we stole the lock (!first_waiter), check the + * waiters as anyone might want to wound us. + */ + if (!ww_ctx->is_wait_die && + !__mutex_waiter_is_first(lock, &waiter)) + __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); + } + + __mutex_remove_waiter(lock, &waiter); debug_mutex_free_waiter(&waiter); skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); + trace_contention_end(lock, 0); - if (use_ww_ctx && ww_ctx) - ww_mutex_set_context_slowpath(ww, ww_ctx); + if (ww_ctx) + ww_mutex_lock_acquired(ww, ww_ctx); - spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); preempt_enable(); return 0; err: + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current); -err_early_backoff: - spin_unlock(&lock->wait_lock); + __mutex_remove_waiter(lock, &waiter); +err_early_kill: + WARN_ON(__get_task_blocked_on(current)); + trace_contention_end(lock, ret); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); - mutex_release(&lock->dep_map, 1, ip); + mutex_release(&lock->dep_map, ip); preempt_enable(); return ret; } static int __sched -__mutex_lock(struct mutex *lock, long state, unsigned int subclass, +__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip) { return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); } static int __sched -__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass, - struct lockdep_map *nest_lock, unsigned long ip, - struct ww_acquire_ctx *ww_ctx) +__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, + unsigned long ip, struct ww_acquire_ctx *ww_ctx) { - return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true); + return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); +} + +/** + * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context + * @ww: mutex to lock + * @ww_ctx: optional w/w acquire context + * + * Trylocks a mutex with the optional acquire context; no deadlock detection is + * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise. + * + * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is + * specified, -EALREADY handling may happen in calls to ww_mutex_trylock. + * + * A mutex acquired with this function must be released with ww_mutex_unlock. + */ +int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) +{ + if (!ww_ctx) + return mutex_trylock(&ww->base); + + MUTEX_WARN_ON(ww->base.magic != &ww->base); + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + + if (__mutex_trylock(&ww->base)) { + ww_mutex_set_context_fastpath(ww, ww_ctx); + mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); + return 1; + } + + return 0; } +EXPORT_SYMBOL(ww_mutex_trylock); #ifdef CONFIG_DEBUG_LOCK_ALLOC void __sched @@ -918,11 +838,12 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched -mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) +_mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest) { - return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock(lock, TASK_KILLABLE, subclass, nest, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) @@ -978,8 +899,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx); + 0, _RET_IP_, ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -994,8 +914,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx); + 0, _RET_IP_, ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -1014,8 +933,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne struct task_struct *next = NULL; DEFINE_WAKE_Q(wake_q); unsigned long owner; + unsigned long flags; - mutex_release(&lock->dep_map, 1, ip); + mutex_release(&lock->dep_map, ip); /* * Release the lock before (potentially) taking the spinlock such that @@ -1026,29 +946,21 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne */ owner = atomic_long_read(&lock->owner); for (;;) { - unsigned long old; - -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); - DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); -#endif + MUTEX_WARN_ON(__owner_task(owner) != current); + MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); if (owner & MUTEX_FLAG_HANDOFF) break; - old = atomic_long_cmpxchg_release(&lock->owner, owner, - __owner_flags(owner)); - if (old == owner) { + if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) { if (owner & MUTEX_FLAG_WAITERS) break; return; } - - owner = old; } - spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -1059,15 +971,14 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne next = waiter->task; debug_mutex_wake_waiter(lock, waiter); + __clear_task_blocked_on(next, lock); wake_q_add(&wake_q, next); } if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - spin_unlock(&lock->wait_lock); - - wake_up_q(&wake_q); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -1082,15 +993,16 @@ static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock); /** - * mutex_lock_interruptible - acquire the mutex, interruptible - * @lock: the mutex to be acquired + * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals. + * @lock: The mutex to be acquired. * - * Lock the mutex like mutex_lock(), and return 0 if the mutex has - * been acquired or sleep until the mutex becomes available. If a - * signal arrives while waiting for the lock then this function - * returns -EINTR. + * Lock the mutex like mutex_lock(). If a signal is delivered while the + * process is sleeping, this function will return without acquiring the + * mutex. * - * This function is similar to (but not equivalent to) down_interruptible(). + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * signal arrived. */ int __sched mutex_lock_interruptible(struct mutex *lock) { @@ -1104,6 +1016,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock) EXPORT_SYMBOL(mutex_lock_interruptible); +/** + * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals. + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). If a signal which will be fatal to + * the current process is delivered while the process is sleeping, this + * function will return without acquiring the mutex. + * + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * fatal signal arrived. + */ int __sched mutex_lock_killable(struct mutex *lock) { might_sleep(); @@ -1115,6 +1039,16 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); +/** + * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). While the task is waiting for this + * mutex, it will be accounted as being in the IO wait state by the + * scheduler. + * + * Context: Process context. + */ void __sched mutex_lock_io(struct mutex *lock) { int token; @@ -1146,7 +1080,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock) static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL, + return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, _RET_IP_, ctx); } @@ -1154,12 +1088,13 @@ static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL, + return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, _RET_IP_, ctx); } #endif +#ifndef CONFIG_DEBUG_LOCK_ALLOC /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -1176,14 +1111,24 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock); + MUTEX_WARN_ON(lock->magic != lock); + return __mutex_trylock(lock); +} +EXPORT_SYMBOL(mutex_trylock); +#else +int __sched _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) +{ + bool locked; + MUTEX_WARN_ON(lock->magic != lock); + locked = __mutex_trylock(lock); if (locked) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); return locked; } -EXPORT_SYMBOL(mutex_trylock); +EXPORT_SYMBOL(_mutex_trylock_nest_lock); +#endif #ifndef CONFIG_DEBUG_LOCK_ALLOC int __sched @@ -1216,7 +1161,11 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } EXPORT_SYMBOL(ww_mutex_lock_interruptible); -#endif +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* !CONFIG_PREEMPT_RT */ + +EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin); +EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end); /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 6ebc1902f779..9ad4da8cea00 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -1,24 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Mutexes: blocking mutual exclusion locks * * started by Ingo Molnar: * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * - * This file contains mutex debugging related internal prototypes, for the - * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ +#ifndef CONFIG_PREEMPT_RT +/* + * This is the control structure for tasks blocked on mutex, which resides + * on the blocked task's kernel stack: + */ +struct mutex_waiter { + struct list_head list; + struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; +#ifdef CONFIG_DEBUG_MUTEXES + void *magic; +#endif +}; -#define mutex_remove_waiter(lock, waiter, task) \ - __list_del((waiter)->list.prev, (waiter)->list.next) +/* + * @owner: contains: 'struct task_struct *' to the current lock owner, + * NULL means not owned. Since task_struct pointers are aligned at + * at least L1_CACHE_BYTES, we have low bits to store extra state. + * + * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + * Bit1 indicates unlock needs to hand the lock to the top-waiter + * Bit2 indicates handoff has been done and we're waiting for pickup. + */ +#define MUTEX_FLAG_WAITERS 0x01 +#define MUTEX_FLAG_HANDOFF 0x02 +#define MUTEX_FLAG_PICKUP 0x04 -#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) -#define debug_mutex_free_waiter(waiter) do { } while (0) -#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) -#define debug_mutex_unlock(lock) do { } while (0) -#define debug_mutex_init(lock, name, key) do { } while (0) +#define MUTEX_FLAGS 0x07 -static inline void -debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) +/* + * Internal helper function; C doesn't allow us to hide it :/ + * + * DO NOT USE (outside of mutex & scheduler code). + */ +static inline struct task_struct *__mutex_owner(struct mutex *lock) { + if (!lock) + return NULL; + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); } + +#ifdef CONFIG_DEBUG_MUTEXES +extern void debug_mutex_lock_common(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_wake_waiter(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_add_waiter(struct mutex *lock, + struct mutex_waiter *waiter, + struct task_struct *task); +extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct task_struct *task); +extern void debug_mutex_unlock(struct mutex *lock); +extern void debug_mutex_init(struct mutex *lock); +#else /* CONFIG_DEBUG_MUTEXES */ +# define debug_mutex_lock_common(lock, waiter) do { } while (0) +# define debug_mutex_wake_waiter(lock, waiter) do { } while (0) +# define debug_mutex_free_waiter(waiter) do { } while (0) +# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) +# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) +# define debug_mutex_unlock(lock) do { } while (0) +# define debug_mutex_init(lock) do { } while (0) +#endif /* !CONFIG_DEBUG_MUTEXES */ +#endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index a3167941093b..b4233dc2c2b0 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/percpu.h> #include <linux/sched.h> #include <linux/osq_lock.h> @@ -10,6 +11,13 @@ * called from interrupt context and we have preemption disabled while * spinning. */ + +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ +}; + static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); /* @@ -36,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) /* * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. * Can return NULL in case we were the last queued and we updated @lock instead. + * + * If osq_lock() is being cancelled there must be a previous node + * and 'old_cpu' is its CPU #. + * For osq_unlock() there is never a previous node and old_cpu is + * set to OSQ_UNLOCKED_VAL. */ static inline struct optimistic_spin_node * osq_wait_next(struct optimistic_spin_queue *lock, struct optimistic_spin_node *node, - struct optimistic_spin_node *prev) + int old_cpu) { - struct optimistic_spin_node *next = NULL; int curr = encode_cpu(smp_processor_id()); - int old; - - /* - * If there is a prev node in queue, then the 'old' value will be - * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if - * we're currently last in queue, then the queue will then become empty. - */ - old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; for (;;) { if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { + atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its * unlock()/unqueue(). */ - break; + return NULL; } /* @@ -75,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock, * wait for a new @node->next from its Step-C. */ if (node->next) { + struct optimistic_spin_node *next; + next = xchg(&node->next, NULL); if (next) - break; + return next; } cpu_relax(); } - - return next; } bool osq_lock(struct optimistic_spin_queue *lock) @@ -109,6 +113,19 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; + + /* + * osq_lock() unqueue + * + * node->prev = prev osq_wait_next() + * WMB MB + * prev->next = node next->prev = prev // unqueue-C + * + * Here 'node->prev' and 'next->prev' are the same variable and we need + * to ensure these stores happen in-order to avoid corrupting the list. + */ + smp_wmb(); + WRITE_ONCE(prev->next, node); /* @@ -120,20 +137,17 @@ bool osq_lock(struct optimistic_spin_queue *lock) * cmpxchg in an attempt to undo our queueing. */ - while (!READ_ONCE(node->locked)) { - /* - * If we need to reschedule bail... so we can block. - * Use vcpu_is_preempted() to avoid waiting for a preempted - * lock holder: - */ - if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) - goto unqueue; - - cpu_relax(); - } - return true; + /* + * Wait to acquire the lock or cancellation. Note that need_resched() + * will come with an IPI, which will wake smp_cond_load_relaxed() if it + * is implemented with a monitor-wait. vcpu_is_preempted() relies on + * polling, be careful. + */ + if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() || + vcpu_is_preempted(node_cpu(node->prev)))) + return true; -unqueue: + /* unqueue */ /* * Step - A -- stabilize @prev * @@ -143,13 +157,17 @@ unqueue: */ for (;;) { - if (prev->next == node && + /* + * cpu_relax() below implies a compiler barrier which would + * prevent this comparison being optimized away. + */ + if (data_race(prev->next) == node && cmpxchg(&prev->next, node, NULL) == node) break; /* * We can only fail the cmpxchg() racing against an unlock(), - * in which case we should observe @node->locked becomming + * in which case we should observe @node->locked becoming * true. */ if (smp_load_acquire(&node->locked)) @@ -171,7 +189,7 @@ unqueue: * back to @prev. */ - next = osq_wait_next(lock, node, prev); + next = osq_wait_next(lock, node, prev->cpu); if (!next) return false; @@ -197,8 +215,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) /* * Fast path for the uncontended case. */ - if (likely(atomic_cmpxchg_release(&lock->tail, curr, - OSQ_UNLOCKED_VAL) == curr)) + if (atomic_try_cmpxchg_release(&lock->tail, &curr, OSQ_UNLOCKED_VAL)) return; /* @@ -211,7 +228,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) return; } - next = osq_wait_next(lock, node, NULL); + next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL); if (next) WRITE_ONCE(next->locked, 1); } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..ef234469baac 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,24 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/atomic.h> -#include <linux/rwsem.h> #include <linux/percpu.h> +#include <linux/wait.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/debug.h> #include <linux/errno.h> +#include <trace/events/lock.h> int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, - const char *name, struct lock_class_key *rwsem_key) + const char *name, struct lock_class_key *key) { sem->read_count = alloc_percpu(int); if (unlikely(!sem->read_count)) return -ENOMEM; - /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); - __init_rwsem(&sem->rw_sem, name, rwsem_key); + rcu_sync_init(&sem->rss); rcuwait_init(&sem->writer); - sem->readers_block = 0; + init_waitqueue_head(&sem->waiters); + atomic_set(&sem->block, 0); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); @@ -38,77 +45,149 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem) } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) +static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { + this_cpu_inc(*sem->read_count); + /* * Due to having preemption disabled the decrement happens on * the same CPU as the increment, avoiding the * increment-on-one-CPU-and-decrement-on-another problem. * - * If the reader misses the writer's assignment of readers_block, then - * the writer is guaranteed to see the reader's increment. + * If the reader misses the writer's assignment of sem->block, then the + * writer is guaranteed to see the reader's increment. * * Conversely, any readers that increment their sem->read_count after - * the writer looks are guaranteed to see the readers_block value, - * which in turn means that they are guaranteed to immediately - * decrement their sem->read_count, so that it doesn't matter that the - * writer missed them. + * the writer looks are guaranteed to see the sem->block value, which + * in turn means that they are guaranteed to immediately decrement + * their sem->read_count, so that it doesn't matter that the writer + * missed them. */ smp_mb(); /* A matches D */ /* - * If !readers_block the critical section starts here, matched by the + * If !sem->block the critical section starts here, matched by the * release in percpu_up_write(). */ - if (likely(!smp_load_acquire(&sem->readers_block))) + if (likely(!atomic_read_acquire(&sem->block))) + return true; + + this_cpu_dec(*sem->read_count); + + /* Prod writer to re-evaluate readers_active_check() */ + rcuwait_wake_up(&sem->writer); + + return false; +} + +static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem) +{ + if (atomic_read(&sem->block)) + return false; + + return atomic_xchg(&sem->block, 1) == 0; +} + +static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader) +{ + if (reader) { + bool ret; + + preempt_disable(); + ret = __percpu_down_read_trylock(sem); + preempt_enable(); + + return ret; + } + return __percpu_down_write_trylock(sem); +} + +/* + * The return value of wait_queue_entry::func means: + * + * <0 - error, wakeup is terminated and the error is returned + * 0 - no wakeup, a next waiter is tried + * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive. + * + * We use EXCLUSIVE for both readers and writers to preserve FIFO order, + * and play games with the return value to allow waking multiple readers. + * + * Specifically, we wake readers until we've woken a single writer, or until a + * trylock fails. + */ +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int wake_flags, + void *key) +{ + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM; + struct percpu_rw_semaphore *sem = key; + struct task_struct *p; + + /* concurrent against percpu_down_write(), can get stolen */ + if (!__percpu_rwsem_trylock(sem, reader)) return 1; - /* - * Per the above comment; we still have preemption disabled and - * will thus decrement on the same CPU as we incremented. - */ - __percpu_up_read(sem); + p = get_task_struct(wq_entry->private); + list_del_init(&wq_entry->entry); + smp_store_release(&wq_entry->private, NULL); - if (try) - return 0; + wake_up_process(p); + put_task_struct(p); - /* - * We either call schedule() in the wait, or we'll fall through - * and reschedule on the preempt_enable() in percpu_down_read(). - */ - preempt_enable_no_resched(); + return !reader; /* wake (readers until) 1 writer */ +} + +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader, + bool freeze) +{ + DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); + bool wait; + spin_lock_irq(&sem->waiters.lock); /* - * Avoid lockdep for the down/up_read() we already have them. + * Serialize against the wakeup in percpu_up_write(), if we fail + * the trylock, the wakeup must see us on the list. */ - __down_read(&sem->rw_sem); - this_cpu_inc(*sem->read_count); - __up_read(&sem->rw_sem); + wait = !__percpu_rwsem_trylock(sem, reader); + if (wait) { + wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM; + __add_wait_queue_entry_tail(&sem->waiters, &wq_entry); + } + spin_unlock_irq(&sem->waiters.lock); - preempt_disable(); - return 1; + while (wait) { + set_current_state(TASK_UNINTERRUPTIBLE | + (freeze ? TASK_FREEZABLE : 0)); + if (!smp_load_acquire(&wq_entry.private)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL_GPL(__percpu_down_read); -void __percpu_up_read(struct percpu_rw_semaphore *sem) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try, + bool freeze) { - smp_mb(); /* B matches C */ - /* - * In other words, if they see our decrement (presumably to aggregate - * zero, as that is the only time it matters) they will also see our - * critical section. - */ - __this_cpu_dec(*sem->read_count); + if (__percpu_down_read_trylock(sem)) + return true; - /* Prod writer to recheck readers_active */ - rcuwait_wake_up(&sem->writer); + if (try) + return false; + + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); + preempt_enable(); + percpu_rwsem_wait(sem, /* .reader = */ true, freeze); + preempt_disable(); + trace_contention_end(sem, 0); + + return true; } -EXPORT_SYMBOL_GPL(__percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ - typeof(var) __sum = 0; \ + TYPEOF_UNQUAL(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ @@ -116,11 +195,19 @@ EXPORT_SYMBOL_GPL(__percpu_up_read); __sum; \ }) +bool percpu_is_read_locked(struct percpu_rw_semaphore *sem) +{ + return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block); +} +EXPORT_SYMBOL_GPL(percpu_is_read_locked); + /* * Return true if the modular sum of the sem->read_count per-CPU variable is * zero. If this sum is zero, then it is stable due to the fact that if any * newly arriving readers increment a given counter, they will immediately * decrement that same counter. + * + * Assumes sem->block is set. */ static bool readers_active_check(struct percpu_rw_semaphore *sem) { @@ -137,34 +224,45 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) return true; } -void percpu_down_write(struct percpu_rw_semaphore *sem) +void __sched percpu_down_write(struct percpu_rw_semaphore *sem) { + bool contended = false; + + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); - down_write(&sem->rw_sem); - /* - * Notify new readers to block; up until now, and thus throughout the - * longish rcu_sync_enter() above, new readers could still come in. + * Try set sem->block; this provides writer-writer exclusion. + * Having sem->block set makes new readers block. */ - WRITE_ONCE(sem->readers_block, 1); + if (!__percpu_down_write_trylock(sem)) { + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); + percpu_rwsem_wait(sem, /* .reader = */ false, false); + contended = true; + } - smp_mb(); /* D matches A */ + /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */ /* - * If they don't see our writer of readers_block, then we are - * guaranteed to see their sem->read_count increment, and therefore - * will wait for them. + * If they don't see our store of sem->block, then we are guaranteed to + * see their sem->read_count increment, and therefore will wait for + * them. */ - /* Wait for all now active readers to complete. */ - rcuwait_wait_event(&sem->writer, readers_active_check(sem)); + /* Wait for all active readers to complete. */ + rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); + if (contended) + trace_contention_end(sem, 0); } EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *sem) { + rwsem_release(&sem->dep_map, _RET_IP_); + /* * Signal the writer is done, no fast path yet. * @@ -175,12 +273,12 @@ void percpu_up_write(struct percpu_rw_semaphore *sem) * Therefore we force it through the slow path which guarantees an * acquire and thereby guarantees the critical section's consistency. */ - smp_store_release(&sem->readers_block, 0); + atomic_set_release(&sem->block, 0); /* - * Release the write lock, this will allow readers back in the game. + * Prod any pending reader/writer to make progress. */ - up_write(&sem->rw_sem); + __wake_up(&sem->waiters, TASK_NORMAL, 1, sem); /* * Once this completes (at least one RCU-sched grace period hence) the diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 2655f26ec882..d2ef312a8611 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued read/write locks * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. * * Authors: Waiman Long <waiman.long@hp.com> @@ -21,51 +12,13 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/spinlock.h> -#include <asm/qrwlock.h> - -/* - * This internal data structure is used for optimizing access to some of - * the subfields within the atomic_t cnts. - */ -struct __qrwlock { - union { - atomic_t cnts; - struct { -#ifdef __LITTLE_ENDIAN - u8 wmode; /* Writer mode */ - u8 rcnts[3]; /* Reader counts */ -#else - u8 rcnts[3]; /* Reader counts */ - u8 wmode; /* Writer mode */ -#endif - }; - }; - arch_spinlock_t lock; -}; - -/** - * rspin_until_writer_unlock - inc reader count & spin until writer is gone - * @lock : Pointer to queue rwlock structure - * @writer: Current queue rwlock writer status byte - * - * In interrupt context or at the head of the queue, the reader will just - * increment the reader count & wait until the writer releases the lock. - */ -static __always_inline void -rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) -{ - while ((cnts & _QW_WMASK) == _QW_LOCKED) { - cpu_relax(); - cnts = atomic_read_acquire(&lock->cnts); - } -} +#include <trace/events/lock.h> /** - * queued_read_lock_slowpath - acquire read lock of a queue rwlock - * @lock: Pointer to queue rwlock structure - * @cnts: Current qrwlock lock value + * queued_read_lock_slowpath - acquire read lock of a queued rwlock + * @lock: Pointer to queued rwlock structure */ -void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) +void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock) { /* * Readers come here when they cannot get the lock without waiting @@ -73,78 +26,67 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) if (unlikely(in_interrupt())) { /* * Readers in interrupt context will get the lock immediately - * if the writer is just waiting (not holding the lock yet). - * The rspin_until_writer_unlock() function returns immediately - * in this case. Otherwise, they will spin (with ACQUIRE - * semantics) until the lock is available without waiting in - * the queue. + * if the writer is just waiting (not holding the lock yet), + * so spin with ACQUIRE semantics until the lock is available + * without waiting in the queue. */ - rspin_until_writer_unlock(lock, cnts); + atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED)); return; } atomic_sub(_QR_BIAS, &lock->cnts); + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ); + /* * Put the reader into the wait queue */ arch_spin_lock(&lock->wait_lock); + atomic_add(_QR_BIAS, &lock->cnts); /* * The ACQUIRE semantics of the following spinning code ensure * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); - rspin_until_writer_unlock(lock, cnts); + atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED)); /* * Signal the next one in queue to become queue head */ arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queued_write_lock_slowpath - acquire write lock of a queue rwlock - * @lock : Pointer to queue rwlock structure + * queued_write_lock_slowpath - acquire write lock of a queued rwlock + * @lock : Pointer to queued rwlock structure */ -void queued_write_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock) { - u32 cnts; + int cnts; + + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE); /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); /* Try to acquire the lock directly if no reader is present */ - if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) + if (!(cnts = atomic_read(&lock->cnts)) && + atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)) goto unlock; - /* - * Set the waiting flag to notify readers that a writer is pending, - * or wait for a previous writer to go away. - */ - for (;;) { - struct __qrwlock *l = (struct __qrwlock *)lock; - - if (!READ_ONCE(l->wmode) && - (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) - break; + /* Set the waiting flag to notify readers that a writer is pending */ + atomic_or(_QW_WAITING, &lock->cnts); - cpu_relax(); - } - - /* When no more readers, set the locked flag */ - for (;;) { - cnts = atomic_read(&lock->cnts); - if ((cnts == _QW_WAITING) && - (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) - break; - - cpu_relax(); - } + /* When no more readers or writers, set the locked flag */ + do { + cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING); + } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)); unlock: arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index fd24153e8a48..af8d122bb649 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -1,22 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued spinlock * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. - * (C) Copyright 2013-2014 Red Hat, Inc. + * (C) Copyright 2013-2014,2018 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long <waiman.long@hpe.com> + * Authors: Waiman Long <longman@redhat.com> * Peter Zijlstra <peterz@infradead.org> */ @@ -31,18 +22,26 @@ #include <linux/prefetch.h> #include <asm/byteorder.h> #include <asm/qspinlock.h> +#include <trace/events/lock.h> + +/* + * Include queued spinlock definitions and statistics code + */ +#include "qspinlock.h" +#include "qspinlock_stat.h" /* * The basic principle of a queue-based spinlock can best be understood * by studying a classic queue-based spinlock implementation called the - * MCS lock. The paper below provides a good description for this kind - * of lock. + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and + * Scott") is available at * - * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 * - * This queued spinlock implementation is based on the MCS lock, however to make - * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing - * API, we must modify it somehow. + * This queued spinlock implementation is based on the MCS lock, however to + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its + * existing API, we must modify it somehow. * * In particular; where the traditional MCS lock consists of a tail pointer * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to @@ -70,12 +69,6 @@ #include "mcs_spinlock.h" -#ifdef CONFIG_PARAVIRT_SPINLOCKS -#define MAX_NODES 8 -#else -#define MAX_NODES 4 -#endif - /* * Per-CPU queue node structures; we can never have more than 4 nested * contexts: task, softirq, hardirq, nmi. @@ -84,164 +77,7 @@ * * PV doubles the storage and uses the second cacheline for PV state. */ -static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); - -/* - * We must be able to distinguish between no-tail and the tail at 0:0, - * therefore increment the cpu number by one. - */ - -static inline __pure u32 encode_tail(int cpu, int idx) -{ - u32 tail; - -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(idx > 3); -#endif - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ - - return tail; -} - -static inline __pure struct mcs_spinlock *decode_tail(u32 tail) -{ - int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; - int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - - return per_cpu_ptr(&mcs_nodes[idx], cpu); -} - -#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) - -/* - * By using the whole 2nd least significant byte for the pending bit, we - * can allow better optimization of the lock acquisition for the pending - * bit holder. - * - * This internal structure is also used by the set_locked function which - * is not restricted to _Q_PENDING_BITS == 8. - */ -struct __qspinlock { - union { - atomic_t val; -#ifdef __LITTLE_ENDIAN - struct { - u8 locked; - u8 pending; - }; - struct { - u16 locked_pending; - u16 tail; - }; -#else - struct { - u16 tail; - u16 locked_pending; - }; - struct { - u8 reserved[2]; - u8 pending; - u8 locked; - }; -#endif - }; -}; - -#if _Q_PENDING_BITS == 8 -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - * - * Lock stealing is not allowed if this function is used. - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); -} - -/* - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - struct __qspinlock *l = (void *)lock; - - /* - * Use release semantics to make sure that the MCS node is properly - * initialized before changing the tail code. - */ - return (u32)xchg_release(&l->tail, - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; -} - -#else /* _Q_PENDING_BITS == 8 */ - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); -} - -/** - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - u32 old, new, val = atomic_read(&lock->val); - - for (;;) { - new = (val & _Q_LOCKED_PENDING_MASK) | tail; - /* - * Use release semantics to make sure that the MCS node is - * properly initialized before changing the tail code. - */ - old = atomic_cmpxchg_release(&lock->val, val, new); - if (old == val) - break; - - val = old; - } - return old; -} -#endif /* _Q_PENDING_BITS == 8 */ - -/** - * set_locked - Set the lock bit and own the lock - * @lock: Pointer to queued spinlock structure - * - * *,*,0 -> *,0,1 - */ -static __always_inline void set_locked(struct qspinlock *lock) -{ - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); -} - +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]); /* * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for @@ -268,123 +104,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif -/* - * Various notes on spin_is_locked() and spin_unlock_wait(), which are - * 'interesting' functions: - * - * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE - * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, - * PPC). Also qspinlock has a similar issue per construction, the setting of - * the locked byte can be unordered acquiring the lock proper. - * - * This gets to be 'interesting' in the following cases, where the /should/s - * end up false because of this issue. - * - * - * CASE 1: - * - * So the spin_is_locked() correctness issue comes from something like: - * - * CPU0 CPU1 - * - * global_lock(); local_lock(i) - * spin_lock(&G) spin_lock(&L[i]) - * for (i) if (!spin_is_locked(&G)) { - * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); - * return; - * } - * // deal with fail - * - * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such - * that there is exclusion between the two critical sections. - * - * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from - * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) - * /should/ be constrained by the ACQUIRE from spin_lock(&G). - * - * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. - * - * - * CASE 2: - * - * For spin_unlock_wait() there is a second correctness issue, namely: - * - * CPU0 CPU1 - * - * flag = set; - * smp_mb(); spin_lock(&l) - * spin_unlock_wait(&l); if (!flag) - * // add to lockless list - * spin_unlock(&l); - * // iterate lockless list - * - * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 - * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE - * semantics etc..) - * - * Where flag /should/ be ordered against the locked store of l. - */ - -/* - * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before - * issuing an _unordered_ store to set _Q_LOCKED_VAL. - * - * This means that the store can be delayed, but no later than the - * store-release from the unlock. This means that simply observing - * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. - * - * There are two paths that can issue the unordered store: - * - * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 - * - * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 - * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 - * - * However, in both cases we have other !0 state we've set before to queue - * ourseves: - * - * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our - * load is constrained by that ACQUIRE to not pass before that, and thus must - * observe the store. - * - * For (2) we have a more intersting scenario. We enqueue ourselves using - * xchg_tail(), which ends up being a RELEASE. This in itself is not - * sufficient, however that is followed by an smp_cond_acquire() on the same - * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and - * guarantees we must observe that store. - * - * Therefore both cases have other !0 state that is observable before the - * unordered locked byte store comes through. This means we can use that to - * wait for the lock store, and then wait for an unlock. - */ -#ifndef queued_spin_unlock_wait -void queued_spin_unlock_wait(struct qspinlock *lock) -{ - u32 val; - - for (;;) { - val = atomic_read(&lock->val); - - if (!val) /* not locked, we're done */ - goto done; - - if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ - break; - - /* not locked, but pending, wait until we observe the lock */ - cpu_relax(); - } - - /* any unlock is good */ - while (atomic_read(&lock->val) & _Q_LOCKED_MASK) - cpu_relax(); - -done: - smp_acquire__after_ctrl_dep(); -} -EXPORT_SYMBOL(queued_spin_unlock_wait); -#endif - #endif /* _GEN_PV_LOCK_SLOWPATH */ /** @@ -408,82 +127,82 @@ EXPORT_SYMBOL(queued_spin_unlock_wait); * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : * queue : ^--' : */ -void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; - u32 new, old, tail; + u32 old, tail; int idx; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); if (pv_enabled()) - goto queue; + goto pv_queue; if (virt_spin_lock(lock)) return; /* - * wait for in-progress pending->locked hand-overs + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. * * 0,1,0 -> 0,0,1 */ if (val == _Q_PENDING_VAL) { - while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL) - cpu_relax(); + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--); } /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* * trylock || pending * - * 0,0,0 -> 0,0,1 ; trylock - * 0,0,1 -> 0,1,1 ; pending + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock */ - for (;;) { - /* - * If we observe any contention; queue. - */ - if (val & ~_Q_LOCKED_MASK) - goto queue; + val = queued_fetch_set_pending_acquire(lock); - new = _Q_LOCKED_VAL; - if (val == new) - new |= _Q_PENDING_VAL; + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { - /* - * Acquire semantic is required here as the function may - * return immediately if the lock was free. - */ - old = atomic_cmpxchg_acquire(&lock->val, val, new); - if (old == val) - break; + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); - val = old; + goto queue; } /* - * we won the trylock - */ - if (new == _Q_LOCKED_VAL) - return; - - /* - * we're pending, wait for the owner to go away. + * We're pending, wait for the owner to go away. * - * *,1,1 -> *,1,0 + * 0,1,1 -> *,1,0 * * this wait loop must be a load-acquire such that we match the * store-release that clears the locked bit and create lock - * sequentiality; this is because not all clear_pending_set_locked() - * implementations imply full barriers. + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. */ - smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); + if (val & _Q_LOCKED_MASK) + smp_cond_load_acquire(&lock->locked, !VAL); /* * take ownership and clear the pending bit. * - * *,1,0 -> *,0,1 + * 0,1,0 -> 0,0,1 */ clear_pending_set_locked(lock); + lockevent_inc(lock_pending); return; /* @@ -491,11 +210,44 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * queuing. */ queue: - node = this_cpu_ptr(&mcs_nodes[0]); + lockevent_inc(lock_slowpath); +pv_queue: + node = this_cpu_ptr(&qnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); - node += idx; + trace_contention_begin(lock, LCB_F_SPIN); + + /* + * 4 nodes are allocated based on the assumption that there will + * not be nested NMIs taking spinlocks. That may not be true in + * some architectures even though the chance of needing more than + * 4 nodes will still be extremely unlikely. When that happens, + * we fall back to spinning on the lock directly without using + * any MCS node. This is not the most elegant solution, but is + * simple enough. + */ + if (unlikely(idx >= _Q_MAX_NODES)) { + lockevent_inc(lock_no_node); + while (!queued_spin_trylock(lock)) + cpu_relax(); + goto release; + } + + node = grab_mcs_node(node, idx); + + /* + * Keep counts of non-zero index values: + */ + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + node->locked = 0; node->next = NULL; pv_init_node(node); @@ -509,12 +261,18 @@ queue: goto release; /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. * We have already touched the queueing cacheline; don't bother with * pending stuff. * * p,*,* -> n,*,* - * - * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -524,16 +282,9 @@ queue: * head of the waitqueue. */ if (old & _Q_TAIL_MASK) { - prev = decode_tail(old); - /* - * The above xchg_tail() is also a load of @lock which generates, - * through decode_tail(), a pointer. - * - * The address dependency matches the RELEASE of xchg_tail() - * such that the access to @prev must happen after. - */ - smp_read_barrier_depends(); + prev = decode_tail(old, qnodes); + /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); @@ -563,8 +314,8 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_load_acquire() call. As the next PV queue head hasn't been - * designated yet, there is no way for the locked value to become + * atomic_cond_read_acquire() call. As the next PV queue head hasn't + * been designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. * @@ -574,53 +325,58 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); + val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* * claim the lock: * * n,0,0 -> 0,0,1 : lock, uncontended - * *,0,0 -> *,0,1 : lock, contended + * *,*,0 -> *,*,1 : lock, contended * - * If the queue head is the only one in the queue (lock value == tail), - * clear the tail code and grab the lock. Otherwise, we only need - * to grab the lock. + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. */ - for (;;) { - /* In the PV case we might already have _Q_LOCKED_VAL set */ - if ((val & _Q_TAIL_MASK) != tail) { - set_locked(lock); - break; - } - /* - * The smp_cond_load_acquire() call above has provided the - * necessary acquire semantics required for locking. At most - * two iterations of this loop may be ran. - */ - old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); - if (old == val) - goto release; /* No contention */ - val = old; + /* + * In the PV case we might already have _Q_LOCKED_VAL set, because + * of lock stealing; therefore we must also allow: + * + * n,0,1 -> 0,0,1 + * + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. + */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ } /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ + set_locked(lock); + + /* * contended path; wait for next if not observed yet, release. */ - if (!next) { - while (!(next = READ_ONCE(node->next))) - cpu_relax(); - } + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); release: + trace_contention_end(lock, 0); + /* * release the node */ - __this_cpu_dec(mcs_nodes[0].count); + __this_cpu_dec(qnodes[0].mcs.count); } EXPORT_SYMBOL(queued_spin_lock_slowpath); @@ -644,4 +400,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" +bool nopvspin; +static __init int parse_nopvspin(char *arg) +{ + nopvspin = true; + return 0; +} +early_param("nopvspin", parse_nopvspin); #endif diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h new file mode 100644 index 000000000000..d69958a844f7 --- /dev/null +++ b/kernel/locking/qspinlock.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Queued spinlock defines + * + * This file contains macro definitions and functions shared between different + * qspinlock slow path implementations. + */ +#ifndef __LINUX_QSPINLOCK_H +#define __LINUX_QSPINLOCK_H + +#include <asm-generic/percpu.h> +#include <linux/percpu-defs.h> +#include <asm-generic/qspinlock.h> +#include <asm-generic/mcs_spinlock.h> + +#define _Q_MAX_NODES 4 + +/* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in + * size and four of them will fit nicely in one 64-byte cacheline. For + * pvqspinlock, however, we need more space for extra data. To accommodate + * that, we insert two more long words to pad it up to 32 bytes. IOW, only + * two of them can fit in a cacheline in this case. That is OK as it is rare + * to have more than 2 levels of slowpath nesting in actual use. We don't + * want to penalize pvqspinlocks to optimize for a rare case in native + * qspinlocks. + */ +struct qnode { + struct mcs_spinlock mcs; +#ifdef CONFIG_PARAVIRT_SPINLOCKS + long reserved[2]; +#endif +}; + +/* + * We must be able to distinguish between no-tail and the tail at 0:0, + * therefore increment the cpu number by one. + */ + +static inline __pure u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline __pure struct mcs_spinlock *decode_tail(u32 tail, + struct qnode __percpu *qnodes) +{ + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return per_cpu_ptr(&qnodes[idx].mcs, cpu); +} + +static inline __pure +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) +{ + return &((struct qnode *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/* + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail), which heads an address dependency + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + /* + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. + */ + return (u32)xchg_relaxed(&lock->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); +} + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. + */ + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/** + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending + * @lock : Pointer to queued spinlock structure + * Return: The previous lock value + * + * *,*,* -> *,1,* + */ +#ifndef queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); +} +#endif + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +#endif /* __LINUX_QSPINLOCK_H */ diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 4ccfcaae5b89..dc1cb90e3644 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -1,9 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _GEN_PV_LOCK_SLOWPATH #error "do not include this file" #endif #include <linux/hash.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/debug_locks.h> /* @@ -37,44 +38,65 @@ #define PV_PREV_CHECK_MASK 0xff /* - * Queue node uses: vcpu_running & vcpu_halted. - * Queue head uses: vcpu_running & vcpu_hashed. + * Queue node uses: VCPU_RUNNING & VCPU_HALTED. + * Queue head uses: VCPU_RUNNING & VCPU_HASHED. */ enum vcpu_state { - vcpu_running = 0, - vcpu_halted, /* Used only in pv_wait_node */ - vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ + VCPU_RUNNING = 0, + VCPU_HALTED, /* Used only in pv_wait_node */ + VCPU_HASHED, /* = pv_hash'ed + VCPU_HALTED */ }; struct pv_node { struct mcs_spinlock mcs; - struct mcs_spinlock __res[3]; - int cpu; u8 state; }; /* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - -/* + * Hybrid PV queued/unfair lock + * * By replacing the regular queued_spin_trylock() with the function below, * it will be called once when a lock waiter enter the PV slowpath before - * being queued. By allowing one lock stealing attempt here when the pending - * bit is off, it helps to reduce the performance impact of lock waiter - * preemption without the drawback of lock starvation. + * being queued. + * + * The pending bit is set by the queue head vCPU of the MCS wait queue in + * pv_wait_head_or_lock() to signal that it is ready to spin on the lock. + * When that bit becomes visible to the incoming waiters, no lock stealing + * is allowed. The function will return immediately to make the waiters + * enter the MCS wait queue. So lock starvation shouldn't happen as long + * as the queued mode vCPUs are actively running to set the pending bit + * and hence disabling lock stealing. + * + * When the pending bit isn't set, the lock waiters will stay in the unfair + * mode spinning on the lock unless the MCS wait queue is empty. In this + * case, the lock waiters will enter the queued mode slowpath trying to + * become the queue head and set the pending bit. + * + * This hybrid PV queued/unfair lock combines the best attributes of a + * queued lock (no lock starvation) and an unfair lock (good performance + * on not heavily contended locks). */ -#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) -static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l) +static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; + /* + * Stay in unfair lock mode as long as queued mode waiters are + * present in the MCS wait queue but the pending bit isn't set. + */ + for (;;) { + int val = atomic_read(&lock->val); + u8 old = 0; + + if (!(val & _Q_LOCKED_PENDING_MASK) && + try_cmpxchg_acquire(&lock->locked, &old, _Q_LOCKED_VAL)) { + lockevent_inc(pv_lock_stealing); + return true; + } + if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) + break; - if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { - qstat_inc(qstat_pv_lock_stealing, true); - return true; + cpu_relax(); } return false; @@ -87,30 +109,20 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) #if _Q_PENDING_BITS == 8 static __always_inline void set_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 1); -} - -static __always_inline void clear_pending(struct qspinlock *lock) -{ - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 0); + WRITE_ONCE(lock->pending, 1); } /* * The pending bit check in pv_queued_spin_steal_lock() isn't a memory - * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock - * just to be sure that it will get it. + * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the + * lock just to be sure that it will get it. */ -static __always_inline int trylock_clear_pending(struct qspinlock *lock) +static __always_inline bool trylock_clear_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; + u16 old = _Q_PENDING_VAL; - return !READ_ONCE(l->locked) && - (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) - == _Q_PENDING_VAL); + return !READ_ONCE(lock->locked) && + try_cmpxchg_acquire(&lock->locked_pending, &old, _Q_LOCKED_VAL); } #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) @@ -118,32 +130,21 @@ static __always_inline void set_pending(struct qspinlock *lock) atomic_or(_Q_PENDING_VAL, &lock->val); } -static __always_inline void clear_pending(struct qspinlock *lock) +static __always_inline bool trylock_clear_pending(struct qspinlock *lock) { - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - -static __always_inline int trylock_clear_pending(struct qspinlock *lock) -{ - int val = atomic_read(&lock->val); - - for (;;) { - int old, new; - - if (val & _Q_LOCKED_MASK) - break; + int old, new; + old = atomic_read(&lock->val); + do { + if (old & _Q_LOCKED_MASK) + return false; /* * Try to clear pending bit & set locked bit */ - old = val; - new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; - val = atomic_cmpxchg(&lock->val, old, new); + new = (old & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + } while (!atomic_try_cmpxchg_acquire (&lock->val, &old, new)); - if (val == old) - return 1; - } - return 0; + return true; } #endif /* _Q_PENDING_BITS == 8 */ @@ -211,10 +212,11 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) int hopcnt = 0; for_each_hash_entry(he, offset, hash) { + struct qspinlock *old = NULL; hopcnt++; - if (!cmpxchg(&he->lock, NULL, lock)) { + if (try_cmpxchg(&he->lock, &old, lock)) { WRITE_ONCE(he->node, node); - qstat_hop(hopcnt); + lockevent_pv_hop(hopcnt); return &he->lock; } } @@ -264,7 +266,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); + return READ_ONCE(prev->state) != VCPU_RUNNING; } /* @@ -274,10 +276,10 @@ static void pv_init_node(struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock)); + BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode)); pn->cpu = smp_processor_id(); - pn->state = vcpu_running; + pn->state = VCPU_RUNNING; } /* @@ -289,8 +291,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; struct pv_node *pp = (struct pv_node *)prev; - int loop; bool wait_early; + int loop; for (;;) { for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { @@ -306,26 +308,26 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) /* * Order pn->state vs pn->locked thusly: * - * [S] pn->state = vcpu_halted [S] next->locked = 1 + * [S] pn->state = VCPU_HALTED [S] next->locked = 1 * MB MB - * [L] pn->locked [RmW] pn->state = vcpu_hashed + * [L] pn->locked [RmW] pn->state = VCPU_HASHED * * Matches the cmpxchg() from pv_kick_node(). */ - smp_store_mb(pn->state, vcpu_halted); + smp_store_mb(pn->state, VCPU_HALTED); if (!READ_ONCE(node->locked)) { - qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_early, wait_early); - pv_wait(&pn->state, vcpu_halted); + lockevent_inc(pv_wait_node); + lockevent_cond_inc(pv_wait_early, wait_early); + pv_wait(&pn->state, VCPU_HALTED); } /* - * If pv_kick_node() changed us to vcpu_hashed, retain that + * If pv_kick_node() changed us to VCPU_HASHED, retain that * value so that pv_wait_head_or_lock() knows to not also try * to hash this lock. */ - cmpxchg(&pn->state, vcpu_halted, vcpu_running); + cmpxchg(&pn->state, VCPU_HALTED, VCPU_RUNNING); /* * If the locked flag is still not set after wakeup, it is a @@ -334,7 +336,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. */ - qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); + lockevent_cond_inc(pv_spurious_wakeup, + !READ_ONCE(node->locked)); } /* @@ -354,16 +357,24 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; - + u8 old = VCPU_HALTED; /* * If the vCPU is indeed halted, advance its state to match that of * pv_wait_node(). If OTOH this fails, the vCPU was running and will * observe its next->locked value and advance itself. * * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + * + * The write to next->locked in arch_mcs_spin_unlock_contended() + * must be ordered before the read of pn->state in the cmpxchg() + * below for the code to work correctly. To guarantee full ordering + * irrespective of the success or failure of the cmpxchg(), + * a relaxed version with explicit barrier is used. The control + * dependency will order the reading of pn->state before any + * subsequent writes. */ - if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + smp_mb__before_atomic(); + if (!try_cmpxchg_relaxed(&pn->state, &old, VCPU_HASHED)) return; /* @@ -373,7 +384,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * the hash table later on at unlock time, no atomic instruction is * needed. */ - WRITE_ONCE(l->locked, _Q_SLOW_VAL); + WRITE_ONCE(lock->locked, _Q_SLOW_VAL); (void)pv_hash(lock, pn); } @@ -388,7 +399,6 @@ static u32 pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; int waitcnt = 0; int loop; @@ -397,20 +407,20 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * If pv_kick_node() already advanced our state, we don't need to * insert ourselves into the hash table anymore. */ - if (READ_ONCE(pn->state) == vcpu_hashed) + if (READ_ONCE(pn->state) == VCPU_HASHED) lp = (struct qspinlock **)1; /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_pv_lock_slowpath, true); + lockevent_inc(lock_slowpath); for (;; waitcnt++) { /* * Set correct vCPU state to be used by queue node wait-early * mechanism. */ - WRITE_ONCE(pn->state, vcpu_running); + WRITE_ONCE(pn->state, VCPU_RUNNING); /* * Set the pending bit in the active lock spinning loop to @@ -439,21 +449,21 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { + if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) { /* * The lock was free and now we own the lock. * Change the lock value back to _Q_LOCKED_VAL * and unhash the table. */ - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); goto gotlock; } } - WRITE_ONCE(pn->state, vcpu_hashed); - qstat_inc(qstat_pv_wait_head, true); - qstat_inc(qstat_pv_wait_again, waitcnt); - pv_wait(&l->locked, _Q_SLOW_VAL); + WRITE_ONCE(pn->state, VCPU_HASHED); + lockevent_inc(pv_wait_head); + lockevent_cond_inc(pv_wait_again, waitcnt); + pv_wait(&lock->locked, _Q_SLOW_VAL); /* * Because of lock stealing, the queue head vCPU may not be @@ -472,13 +482,22 @@ gotlock: } /* + * Include the architecture specific callee-save thunk of the + * __pv_queued_spin_unlock(). This thunk is put together with + * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, architecture specific version of __pv_queued_spin_unlock() + * can be defined. + */ +#include <asm/qspinlock_paravirt.h> + +/* * PV versions of the unlock fastpath and slowpath functions to be used * instead of queued_spin_unlock(). */ -__visible void +__visible __lockfunc void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { - struct __qspinlock *l = (void *)lock; struct pv_node *node; if (unlikely(locked != _Q_SLOW_VAL)) { @@ -507,7 +526,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * Now that we have a reference to the (likely) blocked pv_node, * release the lock. */ - smp_store_release(&l->locked, 0); + smp_store_release(&lock->locked, 0); /* * At this point the memory pointed at by lock can be freed/reused, @@ -516,33 +535,21 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * vCPU is harmless other than the additional latency in completing * the unlock. */ - qstat_inc(qstat_pv_kick_unlock, true); + lockevent_inc(pv_kick_unlock); pv_kick(node->cpu); } -/* - * Include the architecture specific callee-save thunk of the - * __pv_queued_spin_unlock(). This thunk is put together with - * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock - * function close to each other sharing consecutive instruction cachelines. - * Alternatively, architecture specific version of __pv_queued_spin_unlock() - * can be defined. - */ -#include <asm/qspinlock_paravirt.h> - #ifndef __pv_queued_spin_unlock -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - u8 locked; + u8 locked = _Q_LOCKED_VAL; /* * We must not unlock if SLOW, because in that case we must first * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); - if (likely(locked == _Q_LOCKED_VAL)) + if (try_cmpxchg_release(&lock->locked, &locked, 0)) return; __pv_queued_spin_unlock_slowpath(lock, locked); diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 4a30ef63c607..e625bb410aa2 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -1,252 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * Authors: Waiman Long <waiman.long@hpe.com> + * Authors: Waiman Long <longman@redhat.com> */ -/* - * When queued spinlock statistical counters are enabled, the following - * debugfs files will be created for reporting the counter values: - * - * <debugfs>/qlockstat/ - * pv_hash_hops - average # of hops per hashing operation - * pv_kick_unlock - # of vCPU kicks issued at unlock time - * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake - * pv_latency_kick - average latency (ns) of vCPU kick operation - * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_slowpath - # of locking operations via the slowpath - * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs - * pv_wait_again - # of wait's after a queue head vCPU kick - * pv_wait_early - # of early vCPU wait's - * pv_wait_head - # of vCPU wait's at the queue head - * pv_wait_node - # of vCPU wait's at a non-head queue node - * - * Writing to the "reset_counters" file will reset all the above counter - * values. - * - * These statistical counters are implemented as per-cpu variables which are - * summed and computed whenever the corresponding debugfs files are read. This - * minimizes added overhead making the counters usable even in a production - * environment. - * - * There may be slight difference between pv_kick_wake and pv_kick_unlock. - */ -enum qlock_stats { - qstat_pv_hash_hops, - qstat_pv_kick_unlock, - qstat_pv_kick_wake, - qstat_pv_latency_kick, - qstat_pv_latency_wake, - qstat_pv_lock_slowpath, - qstat_pv_lock_stealing, - qstat_pv_spurious_wakeup, - qstat_pv_wait_again, - qstat_pv_wait_early, - qstat_pv_wait_head, - qstat_pv_wait_node, - qstat_num, /* Total number of statistical counters */ - qstat_reset_cnts = qstat_num, -}; +#include "lock_events.h" -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS +#ifdef CONFIG_PARAVIRT_SPINLOCKS /* - * Collect pvqspinlock statistics + * Collect pvqspinlock locking event counts */ -#include <linux/debugfs.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/fs.h> -static const char * const qstat_names[qstat_num + 1] = { - [qstat_pv_hash_hops] = "pv_hash_hops", - [qstat_pv_kick_unlock] = "pv_kick_unlock", - [qstat_pv_kick_wake] = "pv_kick_wake", - [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", - [qstat_pv_latency_kick] = "pv_latency_kick", - [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_slowpath] = "pv_lock_slowpath", - [qstat_pv_lock_stealing] = "pv_lock_stealing", - [qstat_pv_wait_again] = "pv_wait_again", - [qstat_pv_wait_early] = "pv_wait_early", - [qstat_pv_wait_head] = "pv_wait_head", - [qstat_pv_wait_node] = "pv_wait_node", - [qstat_reset_cnts] = "reset_counters", -}; +#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] /* - * Per-cpu counters + * PV specific per-cpu counter */ -static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); static DEFINE_PER_CPU(u64, pv_kick_time); /* - * Function to read and return the qlock statistical counter values + * Function to read and return the PV qspinlock counts. * * The following counters are handled specially: - * 1. qstat_pv_latency_kick + * 1. pv_latency_kick * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock - * 2. qstat_pv_latency_wake + * 2. pv_latency_wake * Average wake latency (ns) = pv_latency_wake/pv_kick_wake - * 3. qstat_pv_hash_hops + * 3. pv_hash_hops * Average hops/hash = pv_hash_hops/pv_kick_unlock */ -static ssize_t qstat_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[64]; - int cpu, counter, len; - u64 stat = 0, kicks = 0; + int cpu, id, len; + u64 sum = 0, kicks = 0; /* * Get the counter ID stored in file->f_inode->i_private */ - counter = (long)file_inode(file)->i_private; + id = (long)file_inode(file)->i_private; - if (counter >= qstat_num) + if (id >= lockevent_num) return -EBADF; for_each_possible_cpu(cpu) { - stat += per_cpu(qstats[counter], cpu); + sum += per_cpu(lockevents[id], cpu); /* - * Need to sum additional counter for some of them + * Need to sum additional counters for some of them */ - switch (counter) { + switch (id) { - case qstat_pv_latency_kick: - case qstat_pv_hash_hops: - kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + case LOCKEVENT_pv_latency_kick: + case LOCKEVENT_pv_hash_hops: + kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu); break; - case qstat_pv_latency_wake: - kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + case LOCKEVENT_pv_latency_wake: + kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu); break; } } - if (counter == qstat_pv_hash_hops) { + if (id == LOCKEVENT_pv_hash_hops) { u64 frac = 0; if (kicks) { - frac = 100ULL * do_div(stat, kicks); + frac = 100ULL * do_div(sum, kicks); frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); } /* * Return a X.XX decimal number */ - len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", + sum, frac); } else { /* * Round to the nearest ns */ - if ((counter == qstat_pv_latency_kick) || - (counter == qstat_pv_latency_wake)) { + if ((id == LOCKEVENT_pv_latency_kick) || + (id == LOCKEVENT_pv_latency_wake)) { if (kicks) - stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + sum = DIV_ROUND_CLOSEST_ULL(sum, kicks); } - len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); } return simple_read_from_buffer(user_buf, count, ppos, buf, len); } /* - * Function to handle write request - * - * When counter = reset_cnts, reset all the counter values. - * Since the counter updates aren't atomic, the resetting is done twice - * to make sure that the counters are very likely to be all cleared. - */ -static ssize_t qstat_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - int cpu; - - /* - * Get the counter ID stored in file->f_inode->i_private - */ - if ((long)file_inode(file)->i_private != qstat_reset_cnts) - return count; - - for_each_possible_cpu(cpu) { - int i; - unsigned long *ptr = per_cpu_ptr(qstats, cpu); - - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); - } - return count; -} - -/* - * Debugfs data structures - */ -static const struct file_operations fops_qstat = { - .read = qstat_read, - .write = qstat_write, - .llseek = default_llseek, -}; - -/* - * Initialize debugfs for the qspinlock statistical counters - */ -static int __init init_qspinlock_stat(void) -{ - struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); - int i; - - if (!d_qstat) - goto out; - - /* - * Create the debugfs files - * - * As reading from and writing to the stat files can be slow, only - * root is allowed to do the read/write to limit impact to system - * performance. - */ - for (i = 0; i < qstat_num; i++) - if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat)) - goto fail_undo; - - if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat)) - goto fail_undo; - - return 0; -fail_undo: - debugfs_remove_recursive(d_qstat); -out: - pr_warn("Could not create 'qlockstat' debugfs entries\n"); - return -ENOMEM; -} -fs_initcall(init_qspinlock_stat); - -/* - * Increment the PV qspinlock statistical counters - */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) -{ - if (cond) - this_cpu_inc(qstats[stat]); -} - -/* * PV hash hop count */ -static inline void qstat_hop(int hopcnt) +static inline void lockevent_pv_hop(int hopcnt) { - this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); + this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt); } /* @@ -258,7 +111,7 @@ static inline void __pv_kick(int cpu) per_cpu(pv_kick_time, cpu) = start; pv_kick(cpu); - this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); + this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start); } /* @@ -271,18 +124,19 @@ static inline void __pv_wait(u8 *ptr, u8 val) *pkick_time = 0; pv_wait(ptr, val); if (*pkick_time) { - this_cpu_add(qstats[qstat_pv_latency_wake], + this_cpu_add(EVENT_COUNT(pv_latency_wake), sched_clock() - *pkick_time); - qstat_inc(qstat_pv_kick_wake, true); + lockevent_inc(pv_kick_wake); } } #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_LOCK_EVENT_COUNTS */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -static inline void qstat_hop(int hopcnt) { } +static inline void lockevent_pv_hop(int hopcnt) { } -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c deleted file mode 100644 index ac35e648b0e5..000000000000 --- a/kernel/locking/rtmutex-debug.c +++ /dev/null @@ -1,181 +0,0 @@ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * This code is based on the rt.c implementation in the preempt-rt tree. - * Portions of said code are - * - * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Copyright (C) 2006 Esben Nielsen - * Copyright (C) 2006 Kihon Technologies Inc., - * Steven Rostedt <rostedt@goodmis.org> - * - * See rt.c in preempt-rt for proper credits and further information - */ -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/sched/debug.h> -#include <linux/delay.h> -#include <linux/export.h> -#include <linux/spinlock.h> -#include <linux/kallsyms.h> -#include <linux/syscalls.h> -#include <linux/interrupt.h> -#include <linux/rbtree.h> -#include <linux/fs.h> -#include <linux/debug_locks.h> - -#include "rtmutex_common.h" - -static void printk_task(struct task_struct *p) -{ - if (p) - printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); - else - printk("<none>"); -} - -static void printk_lock(struct rt_mutex *lock, int print_owner) -{ - if (lock->name) - printk(" [%p] {%s}\n", - lock, lock->name); - else - printk(" [%p] {%s:%d}\n", - lock, lock->file, lock->line); - - if (print_owner && rt_mutex_owner(lock)) { - printk(".. ->owner: %p\n", lock->owner); - printk(".. held by: "); - printk_task(rt_mutex_owner(lock)); - printk("\n"); - } -} - -void rt_mutex_debug_task_free(struct task_struct *task) -{ - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); - DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); -} - -/* - * We fill out the fields in the waiter to store the information about - * the deadlock. We print when we return. act_waiter can be NULL in - * case of a remove waiter operation. - */ -void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *act_waiter, - struct rt_mutex *lock) -{ - struct task_struct *task; - - if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) - return; - - task = rt_mutex_owner(act_waiter->lock); - if (task && task != current) { - act_waiter->deadlock_task_pid = get_pid(task_pid(task)); - act_waiter->deadlock_lock = lock; - } -} - -void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) -{ - struct task_struct *task; - - if (!waiter->deadlock_lock || !debug_locks) - return; - - rcu_read_lock(); - task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); - if (!task) { - rcu_read_unlock(); - return; - } - - if (!debug_locks_off()) { - rcu_read_unlock(); - return; - } - - pr_warn("\n"); - pr_warn("============================================\n"); - pr_warn("WARNING: circular locking deadlock detected!\n"); - pr_warn("%s\n", print_tainted()); - pr_warn("--------------------------------------------\n"); - printk("%s/%d is deadlocking current task %s/%d\n\n", - task->comm, task_pid_nr(task), - current->comm, task_pid_nr(current)); - - printk("\n1) %s/%d is trying to acquire this lock:\n", - current->comm, task_pid_nr(current)); - printk_lock(waiter->lock, 1); - - printk("\n2) %s/%d is blocked on this lock:\n", - task->comm, task_pid_nr(task)); - printk_lock(waiter->deadlock_lock, 1); - - debug_show_held_locks(current); - debug_show_held_locks(task); - - printk("\n%s/%d's [blocked] stackdump:\n\n", - task->comm, task_pid_nr(task)); - show_stack(task, NULL); - printk("\n%s/%d's [current] stackdump:\n\n", - current->comm, task_pid_nr(current)); - dump_stack(); - debug_show_all_locks(); - rcu_read_unlock(); - - printk("[ turning off deadlock detection." - "Please report this trace. ]\n\n"); -} - -void debug_rt_mutex_lock(struct rt_mutex *lock) -{ -} - -void debug_rt_mutex_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); -} - -void -debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) -{ -} - -void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); -} - -void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) -{ - memset(waiter, 0x11, sizeof(*waiter)); - waiter->deadlock_task_pid = NULL; -} - -void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) -{ - put_pid(waiter->deadlock_task_pid); - memset(waiter, 0x22, sizeof(*waiter)); -} - -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) -{ - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lock->name = name; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif -} - diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h deleted file mode 100644 index 5078c6ddf4a5..000000000000 --- a/kernel/locking/rtmutex-debug.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * This file contains macros used solely by rtmutex.c. Debug version. - */ - -extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); -extern void debug_rt_mutex_lock(struct rt_mutex *lock); -extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); -extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *waiter, - struct rt_mutex *lock); -extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ - do { (w)->deadlock_lock = NULL; } while (0) - -static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - enum rtmutex_chainwalk walk) -{ - return (waiter != NULL); -} - -static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) -{ - debug_rt_mutex_print_deadlock(w); -} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 649dc9d3951a..c80902eacd79 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RT-Mutexes: simple blocking mutual exclusion locks with PI support * @@ -7,19 +8,62 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> * - * See Documentation/locking/rt-mutex-design.txt for details. + * See Documentation/locking/rt-mutex-design.rst for details. */ -#include <linux/spinlock.h> -#include <linux/export.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/deadline.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> -#include <linux/sched/deadline.h> #include <linux/sched/wake_q.h> -#include <linux/sched/debug.h> -#include <linux/timer.h> +#include <linux/ww_mutex.h> + +#include <trace/events/lock.h> #include "rtmutex_common.h" +#include "lock_events.h" + +#ifndef WW_RT +# define build_ww_mutex() (false) +# define ww_container_of(rtm) NULL + +static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter, + struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) +{ + return 0; +} + +static inline void __ww_mutex_check_waiters(struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) +{ +} + +static inline void ww_mutex_lock_acquired(struct ww_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ +} + +static inline int __ww_mutex_check_kill(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + return 0; +} + +#else +# define build_ww_mutex() (true) +# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base) +# include "ww_mutex.h" +#endif /* * lock->owner state tracking: @@ -48,24 +92,41 @@ * set this bit before looking at the lock. */ -static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) +static __always_inline struct task_struct * +rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) { unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; - lock->owner = (struct task_struct *)val; + return (struct task_struct *)val; +} + +static __always_inline void +rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) +{ + /* + * lock->wait_lock is held but explicit acquire semantics are needed + * for a new lock owner so WRITE_ONCE is insufficient. + */ + xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner)); +} + +static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock) +{ + /* lock->wait_lock is held so the unlock provides release semantics. */ + WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL)); } -static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } -static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void +fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -131,8 +192,21 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * still set. */ owner = READ_ONCE(*p); - if (owner & RT_MUTEX_HAS_WAITERS) - WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + if (owner & RT_MUTEX_HAS_WAITERS) { + /* + * See rt_mutex_set_owner() and rt_mutex_clear_owner() on + * why xchg_acquire() is used for updating owner for + * locking and WRITE_ONCE() for unlocking. + * + * WRITE_ONCE() would work for the acquire case too, but + * in case that the lock acquisition failed it might + * force other lockers into the slow path unnecessarily. + */ + if (acquire_lock) + xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS); + else + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + } } /* @@ -140,23 +214,46 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) -# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) -# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return try_cmpxchg_acquire(&lock->owner, &old, new); +} + +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + return rt_mutex_cmpxchg_acquire(lock, NULL, current); +} + +static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return try_cmpxchg_release(&lock->owner, &old, new); +} /* * Callers must hold the ->wait_lock -- which is the whole purpose as we force * all future threads that attempt to [Rmw] the lock to the slowpath. As such * relaxed semantics suffice. */ -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) { - unsigned long owner, *p = (unsigned long *) &lock->owner; + unsigned long *p = (unsigned long *) &lock->owner; + unsigned long owner, new; + owner = READ_ONCE(*p); do { - owner = *p; - } while (cmpxchg_relaxed(p, owner, - owner | RT_MUTEX_HAS_WAITERS) != owner); + new = owner | RT_MUTEX_HAS_WAITERS; + } while (!try_cmpxchg_relaxed(p, &owner, new)); + + /* + * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE + * operations in the event of contention. Ensure the successful + * cmpxchg is visible. + */ + smp_mb__after_atomic(); } /* @@ -165,8 +262,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - unsigned long flags) +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); @@ -201,11 +298,36 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) -# define rt_mutex_cmpxchg_acquire(l,c,n) (0) -# define rt_mutex_cmpxchg_release(l,c,n) (0) +static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return false; -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +} + +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock); + +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + /* + * With debug enabled rt_mutex_cmpxchg trylock() will always fail. + * + * Avoid unconditionally taking the slow path by using + * rt_mutex_slow_trylock() which is covered by the debug code and can + * acquire a non-contended rtmutex. + */ + return rt_mutex_slowtrylock(lock); +} + +static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, + struct task_struct *old, + struct task_struct *new) +{ + return false; +} + +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); @@ -214,8 +336,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - unsigned long flags) +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; @@ -224,15 +346,53 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #endif +static __always_inline int __waiter_prio(struct task_struct *task) +{ + int prio = task->prio; + + if (!rt_or_dl_prio(prio)) + return DEFAULT_PRIO; + + return prio; +} + +/* + * Update the waiter->tree copy of the sort keys. + */ +static __always_inline void +waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry)); + + waiter->tree.prio = __waiter_prio(task); + waiter->tree.deadline = task->dl.deadline; +} + +/* + * Update the waiter->pi_tree copy of the sort keys (from the tree copy). + */ +static __always_inline void +waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert_held(&task->pi_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry)); + + waiter->pi_tree.prio = waiter->tree.prio; + waiter->pi_tree.deadline = waiter->tree.deadline; +} + /* - * Only use with rt_mutex_waiter_{less,equal}() + * Only use with rt_waiter_node_{less,equal}() */ +#define task_to_waiter_node(p) \ + &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) } -static inline int -rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio < right->prio) return 1; @@ -249,9 +409,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, return 0; } -static inline int -rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio != right->prio) return 0; @@ -268,88 +427,110 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, return 1; } -static void -rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, + struct rt_mutex_waiter *top_waiter) { - struct rb_node **link = &lock->waiters.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - int leftmost = 1; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = 0; - } + if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree)) + return true; + +#ifdef RT_MUTEX_BUILD_SPINLOCKS + /* + * Note that RT tasks are excluded from same priority (lateral) + * steals to prevent the introduction of an unbounded latency. + */ + if (rt_or_dl_prio(waiter->tree.prio)) + return false; + + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); +#else + return false; +#endif +} + +#define __node_2_waiter(node) \ + rb_entry((node), struct rt_mutex_waiter, tree.entry) + +static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) +{ + struct rt_mutex_waiter *aw = __node_2_waiter(a); + struct rt_mutex_waiter *bw = __node_2_waiter(b); + + if (rt_waiter_node_less(&aw->tree, &bw->tree)) + return 1; + + if (!build_ww_mutex()) + return 0; + + if (rt_waiter_node_less(&bw->tree, &aw->tree)) + return 0; + + /* NOTE: relies on waiter->ww_ctx being set before insertion */ + if (aw->ww_ctx) { + if (!bw->ww_ctx) + return 1; + + return (signed long)(aw->ww_ctx->stamp - + bw->ww_ctx->stamp) < 0; } - if (leftmost) - lock->waiters_leftmost = &waiter->tree_entry; + return 0; +} - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color(&waiter->tree_entry, &lock->waiters); +static __always_inline void +rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) +{ + lockdep_assert_held(&lock->wait_lock); + + rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less); } -static void -rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +static __always_inline void +rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->tree_entry)) + lockdep_assert_held(&lock->wait_lock); + + if (RB_EMPTY_NODE(&waiter->tree.entry)) return; - if (lock->waiters_leftmost == &waiter->tree_entry) - lock->waiters_leftmost = rb_next(&waiter->tree_entry); + rb_erase_cached(&waiter->tree.entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree.entry); +} + +#define __node_2_rt_node(node) \ + rb_entry((node), struct rt_waiter_node, entry) - rb_erase(&waiter->tree_entry, &lock->waiters); - RB_CLEAR_NODE(&waiter->tree_entry); +static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) +{ + return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b)); } -static void +static __always_inline void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - int leftmost = 1; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = 0; - } - } - - if (leftmost) - task->pi_waiters_leftmost = &waiter->pi_tree_entry; + lockdep_assert_held(&task->pi_lock); - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); + rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less); } -static void +static __always_inline void rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) - return; + lockdep_assert_held(&task->pi_lock); - if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) - task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); + if (RB_EMPTY_NODE(&waiter->pi_tree.entry)) + return; - rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); - RB_CLEAR_NODE(&waiter->pi_tree_entry); + rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree.entry); } -static void rt_mutex_adjust_prio(struct task_struct *p) +static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock, + struct task_struct *p) { struct task_struct *pi_task = NULL; + lockdep_assert_held(&lock->wait_lock); + lockdep_assert(rt_mutex_owner(lock) == p); lockdep_assert_held(&p->pi_lock); if (task_has_pi_waiters(p)) @@ -358,6 +539,42 @@ static void rt_mutex_adjust_prio(struct task_struct *p) rt_mutex_setprio(p, pi_task); } +/* RT mutex specific wake_q wrappers */ +static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh, + struct task_struct *task, + unsigned int wake_state) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) { + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(wqh->rtlock_task); + get_task_struct(task); + wqh->rtlock_task = task; + } else { + wake_q_add(&wqh->head, task); + } +} + +static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, + struct rt_mutex_waiter *w) +{ + rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state); +} + +static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) { + wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT); + put_task_struct(wqh->rtlock_task); + wqh->rtlock_task = NULL; + } + + if (!wake_q_empty(&wqh->head)) + wake_up_q(&wqh->head); + + /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ + preempt_enable(); +} + /* * Deadlock detection is conditional: * @@ -371,25 +588,16 @@ static void rt_mutex_adjust_prio(struct task_struct *p) * deadlock detection is disabled independent of the detect argument * and the config settings. */ -static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, - enum rtmutex_chainwalk chwalk) +static __always_inline bool +rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk chwalk) { - /* - * This is just a wrapper function for the following call, - * because debug_rt_mutex_detect_deadlock() smells like a magic - * debug feature and I wanted to keep the cond function in the - * main source file along with the comments instead of having - * two of the same in the headers. - */ - return debug_rt_mutex_detect_deadlock(waiter, chwalk); + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + return waiter != NULL; + return chwalk == RT_MUTEX_FULL_CHAINWALK; } -/* - * Max number of times we'll walk the boosting chain: - */ -int max_lock_depth = 1024; - -static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p) { return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; } @@ -418,9 +626,14 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * Chain walk basics and protection scope * * [R] refcount on task - * [P] task->pi_lock held + * [Pn] task->pi_lock held * [L] rtmutex->wait_lock held * + * Normal locking order: + * + * rtmutex->wait_lock + * task->pi_lock + * * Step Description Protected by * function arguments: * @task [R] @@ -435,39 +648,44 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * again: * loop_sanity_check(); * retry: - * [1] lock(task->pi_lock); [R] acquire [P] - * [2] waiter = task->pi_blocked_on; [P] - * [3] check_exit_conditions_1(); [P] - * [4] lock = waiter->lock; [P] - * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] - * unlock(task->pi_lock); release [P] + * [1] lock(task->pi_lock); [R] acquire [P1] + * [2] waiter = task->pi_blocked_on; [P1] + * [3] check_exit_conditions_1(); [P1] + * [4] lock = waiter->lock; [P1] + * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L] + * unlock(task->pi_lock); release [P1] * goto retry; * } - * [6] check_exit_conditions_2(); [P] + [L] - * [7] requeue_lock_waiter(lock, waiter); [P] + [L] - * [8] unlock(task->pi_lock); release [P] + * [6] check_exit_conditions_2(); [P1] + [L] + * [7] requeue_lock_waiter(lock, waiter); [P1] + [L] + * [8] unlock(task->pi_lock); release [P1] * put_task_struct(task); release [R] * [9] check_exit_conditions_3(); [L] * [10] task = owner(lock); [L] * get_task_struct(task); [L] acquire [R] - * lock(task->pi_lock); [L] acquire [P] - * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] - * [12] check_exit_conditions_4(); [P] + [L] - * [13] unlock(task->pi_lock); release [P] + * lock(task->pi_lock); [L] acquire [P2] + * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L] + * [12] check_exit_conditions_4(); [P2] + [L] + * [13] unlock(task->pi_lock); release [P2] * unlock(lock->wait_lock); release [L] * goto again; + * + * Where P1 is the blocking task and P2 is the lock owner; going up one step + * the owner becomes the next blocked task etc.. + * +* */ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, - enum rtmutex_chainwalk chwalk, - struct rt_mutex *orig_lock, - struct rt_mutex *next_lock, - struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) +static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, + enum rtmutex_chainwalk chwalk, + struct rt_mutex_base *orig_lock, + struct rt_mutex_base *next_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task) { struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; struct rt_mutex_waiter *prerequeue_top_waiter; int ret = 0, depth = 0; - struct rt_mutex *lock; + struct rt_mutex_base *lock; bool detect_deadlock; bool requeue = true; @@ -550,6 +768,31 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; /* + * There could be 'spurious' loops in the lock graph due to ww_mutex, + * consider: + * + * P1: A, ww_A, ww_B + * P2: ww_B, ww_A + * P3: A + * + * P3 should not return -EDEADLK because it gets trapped in the cycle + * created by P1 and P2 (which will resolve -- and runs into + * max_lock_depth above). Therefore disable detect_deadlock such that + * the below termination condition can trigger once all relevant tasks + * are boosted. + * + * Even when we start with ww_mutex we can disable deadlock detection, + * since we would supress a ww_mutex induced deadlock at [6] anyway. + * Supressing it here however is not sufficient since we might still + * hit [6] due to adjustment driven iteration. + * + * NOTE: if someone were to create a deadlock between 2 ww_classes we'd + * utterly fail to report it; lockdep should. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock) + detect_deadlock = false; + + /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting * mode! @@ -578,7 +821,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. */ - if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -586,13 +829,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* - * [4] Get the next lock + * [4] Get the next lock; per holding task->pi_lock we can't unblock + * and guarantee @lock's existence. */ lock = waiter->lock; /* * [5] We need to trylock here as we are holding task->pi_lock, * which is the reverse lock order versus the other rtmutex * operations. + * + * Per the above, holding task->pi_lock guarantees lock exists, so + * inverting this lock order is infeasible from a life-time + * perspective. */ if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irq(&task->pi_lock); @@ -610,9 +858,21 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * walk, we detected a deadlock. */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); - raw_spin_unlock(&lock->wait_lock); ret = -EDEADLK; + + /* + * When the deadlock is due to ww_mutex; also see above. Don't + * report the deadlock and instead let the ww_mutex wound/die + * logic pick which of the contending threads gets -EDEADLK. + * + * NOTE: assumes the cycle only contains a single ww_class; any + * other configuration and we fail to report; also, see + * lockdep. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx) + ret = 0; + + raw_spin_unlock(&lock->wait_lock); goto out_unlock_pi; } @@ -639,8 +899,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* [10] Grab the next task, i.e. owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* @@ -685,18 +944,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * or * * DL CBS enforcement advancing the effective deadline. - * - * Even though pi_waiters also uses these fields, and that tree is only - * updated in [11], we can do this here, since we hold [L], which - * serializes all pi_waiters access and rb_erase() does not care about - * the values of the node being removed. */ - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); rt_mutex_enqueue(lock, waiter); - /* [8] Release the task */ + /* + * [8] Release the (blocking) task in preparation for + * taking the owner task in [10]. + * + * Since we hold lock->waiter_lock, task cannot unblock, even if we + * release task->pi_lock. + */ raw_spin_unlock(&task->pi_lock); put_task_struct(task); @@ -713,15 +972,20 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * then we need to wake the new top waiter up to try * to get the lock. */ - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); + top_waiter = rt_mutex_top_waiter(lock); + if (prerequeue_top_waiter != top_waiter) + wake_up_state(top_waiter->task, top_waiter->wake_state); raw_spin_unlock_irq(&lock->wait_lock); return 0; } - /* [10] Grab the next task, i.e. the owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + /* + * [10] Grab the next task, i.e. the owner of @lock + * + * Per holding lock->wait_lock and checking for !owner above, there + * must be an owner and it cannot go away. + */ + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ @@ -733,13 +997,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * and adjust the priority of the owner. */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else if (prerequeue_top_waiter == waiter) { /* * The waiter was the top waiter on the lock, but is - * no longer the top prority waiter. Replace waiter in + * no longer the top priority waiter. Replace waiter in * the owner tasks pi waiters tree with the new top * (highest priority) waiter and adjust the priority * of the owner. @@ -749,8 +1014,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else { /* * Nothing changed. No need to do any priority @@ -817,8 +1083,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * @waiter: The waiter that is queued to the lock's wait tree if the * callsite called task_blocked_on_lock(), otherwise NULL */ -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) +static int __sched +try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { lockdep_assert_held(&lock->wait_lock); @@ -853,19 +1120,21 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * trylock attempt. */ if (waiter) { - /* - * If waiter is not the highest priority waiter of - * @lock, give up. - */ - if (waiter != rt_mutex_top_waiter(lock)) - return 0; + struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock); /* - * We can acquire the lock. Remove the waiter from the - * lock waiters tree. + * If waiter is the highest priority waiter of @lock, + * or allowed to steal it, take it over. */ - rt_mutex_dequeue(lock, waiter); - + if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) { + /* + * We can acquire the lock. Remove the waiter from the + * lock waiters tree. + */ + rt_mutex_dequeue(lock, waiter); + } else { + return 0; + } } else { /* * If the lock has waiters already we check whether @task is @@ -876,13 +1145,9 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * not need to be dequeued. */ if (rt_mutex_has_waiters(lock)) { - /* - * If @task->prio is greater than or equal to - * the top waiter priority (kernel view), - * @task lost. - */ - if (!rt_mutex_waiter_less(task_to_waiter(task), - rt_mutex_top_waiter(lock))) + /* Check whether the trylock can steal it. */ + if (!rt_mutex_steal(task_to_waiter(task), + rt_mutex_top_waiter(lock))) return 0; /* @@ -919,9 +1184,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, raw_spin_unlock(&task->pi_lock); takeit: - /* We got the lock. */ - debug_rt_mutex_lock(lock); - /* * This either preserves the RT_MUTEX_HAS_WAITERS bit if there * are still waiters or clears it. @@ -938,14 +1200,16 @@ takeit: * * This must be called with lock->wait_lock held and interrupts disabled */ -static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task, - enum rtmutex_chainwalk chwalk) +static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + struct ww_acquire_ctx *ww_ctx, + enum rtmutex_chainwalk chwalk, + struct wake_q_head *wake_q) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - struct rt_mutex *next_lock; + struct rt_mutex_base *next_lock; int chain_walk = 0, res; lockdep_assert_held(&lock->wait_lock); @@ -958,15 +1222,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * the other will detect the deadlock and return -EDEADLOCK, * which is wrong, as the other waiter is not in a deadlock * situation. + * + * Except for ww_mutex, in that case the chain walk must already deal + * with spurious cycles, see the comments at [3] and [6]. */ - if (owner == task) + if (owner == task && !(build_ww_mutex() && ww_ctx)) return -EDEADLK; raw_spin_lock(&task->pi_lock); waiter->task = task; waiter->lock = lock; - waiter->prio = task->prio; - waiter->deadline = task->dl.deadline; + waiter_update_prio(waiter, task); + waiter_clone_prio(waiter, task); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -977,6 +1244,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&task->pi_lock); + if (build_ww_mutex() && ww_ctx) { + struct rt_mutex *rtm; + + /* Check whether the waiter should back out immediately */ + rtm = container_of(lock, struct rt_mutex, rtmutex); + res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); + if (res) { + raw_spin_lock(&task->pi_lock); + rt_mutex_dequeue(lock, waiter); + task->pi_blocked_on = NULL; + raw_spin_unlock(&task->pi_lock); + return res; + } + } + if (!owner) return 0; @@ -985,7 +1267,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1011,7 +1293,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock_irq(&lock->wait_lock); + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); @@ -1027,11 +1309,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * * Called with lock->wait_lock held and interrupts disabled. */ -static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, - struct rt_mutex *lock) +static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, + struct rt_mutex_base *lock) { struct rt_mutex_waiter *waiter; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1044,7 +1328,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, * task unblocks. */ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_adjust_prio(current); + rt_mutex_adjust_prio(lock, current); /* * As we are waking up the top waiter, and the waiter stays @@ -1064,240 +1348,22 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, * deboost but before waking our donor task, hence the preempt_disable() * before unlock. * - * Pairs with preempt_enable() in rt_mutex_postunlock(); + * Pairs with preempt_enable() in rt_mutex_wake_up_q(); */ preempt_disable(); - wake_q_add(wake_q, waiter->task); - raw_spin_unlock(¤t->pi_lock); -} - -/* - * Remove a waiter from a lock and give up - * - * Must be called with lock->wait_lock held and interrupts disabled. I must - * have just failed to try_to_take_rt_mutex(). - */ -static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) -{ - bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); - struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex *next_lock; - - lockdep_assert_held(&lock->wait_lock); - - raw_spin_lock(¤t->pi_lock); - rt_mutex_dequeue(lock, waiter); - current->pi_blocked_on = NULL; + rt_mutex_wake_q_add(wqh, waiter); raw_spin_unlock(¤t->pi_lock); - - /* - * Only update priority if the waiter was the highest priority - * waiter of the lock and there is an owner to update. - */ - if (!owner || !is_top_waiter) - return; - - raw_spin_lock(&owner->pi_lock); - - rt_mutex_dequeue_pi(owner, waiter); - - if (rt_mutex_has_waiters(lock)) - rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - - rt_mutex_adjust_prio(owner); - - /* Store the lock on which owner is blocked or NULL */ - next_lock = task_blocked_on_lock(owner); - - raw_spin_unlock(&owner->pi_lock); - - /* - * Don't walk the chain, if the owner task is not blocked - * itself. - */ - if (!next_lock) - return; - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - - raw_spin_unlock_irq(&lock->wait_lock); - - rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, - next_lock, NULL, current); - - raw_spin_lock_irq(&lock->wait_lock); -} - -/* - * Recheck the pi chain, in case we got a priority setting - * - * Called from sched_setscheduler - */ -void rt_mutex_adjust_pi(struct task_struct *task) -{ - struct rt_mutex_waiter *waiter; - struct rt_mutex *next_lock; - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return; - } - next_lock = waiter->lock; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); - - rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, - next_lock, NULL, task); } -void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) { - debug_rt_mutex_init_waiter(waiter); - RB_CLEAR_NODE(&waiter->pi_tree_entry); - RB_CLEAR_NODE(&waiter->tree_entry); - waiter->task = NULL; -} - -/** - * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop - * @lock: the rt_mutex to take - * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) - * @timeout: the pre-initialized and started timer, or NULL for none - * @waiter: the pre-initialized rt_mutex_waiter - * - * Must be called with lock->wait_lock held and interrupts disabled - */ -static int __sched -__rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) -{ - int ret = 0; - - for (;;) { - /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) - break; - - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (likely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? */ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - if (ret) - break; - } - - raw_spin_unlock_irq(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(waiter); - - schedule(); - - raw_spin_lock_irq(&lock->wait_lock); - set_current_state(state); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -static void rt_mutex_handle_deadlock(int res, int detect_deadlock, - struct rt_mutex_waiter *w) -{ - /* - * If the result is not -EDEADLOCK or the caller requested - * deadlock detection, nothing to do here. - */ - if (res != -EDEADLOCK || detect_deadlock) - return; - - /* - * Yell lowdly and stop the task right here. - */ - rt_mutex_print_deadlock(w); - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - } -} - -/* - * Slow path lock function: - */ -static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk) -{ - struct rt_mutex_waiter waiter; - unsigned long flags; - int ret = 0; - - rt_mutex_init_waiter(&waiter); - - /* - * Technically we could use raw_spin_[un]lock_irq() here, but this can - * be called in early boot if the cmpxchg() fast path is disabled - * (debug, no architecture support). In this case we will acquire the - * rtmutex with lock->wait_lock held. But we cannot unconditionally - * enable interrupts in that early boot case. So we need to use the - * irqsave/restore variants. - */ - raw_spin_lock_irqsave(&lock->wait_lock, flags); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - - ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); - - if (likely(!ret)) - /* sleep on the mutex */ - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); - - if (unlikely(ret)) { - __set_current_state(TASK_RUNNING); - if (rt_mutex_has_waiters(lock)) - remove_waiter(lock, &waiter); - rt_mutex_handle_deadlock(ret, chwalk, &waiter); - } + int ret = try_to_take_rt_mutex(lock, current, NULL); /* - * try_to_take_rt_mutex() sets the waiter bit - * unconditionally. We might have to fix that up. + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - - /* Remove pending timer: */ - if (unlikely(timeout)) - hrtimer_cancel(&timeout->timer); - - debug_rt_mutex_free_waiter(&waiter); + fixup_rt_mutex_waiters(lock, true); return ret; } @@ -1305,7 +1371,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* * Slow path try-lock function: */ -static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock) { unsigned long flags; int ret; @@ -1324,27 +1390,27 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = try_to_take_rt_mutex(lock, current, NULL); - - /* - * try_to_take_rt_mutex() sets the lock waiters bit - * unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); + ret = __rt_mutex_slowtrylock(lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } +static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock) +{ + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 1; + + return rt_mutex_slowtrylock(lock); +} + /* * Slow path to release a rt-mutex. - * - * Return whether the current task needs to call rt_mutex_postunlock(). */ -static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) +static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) { + DEFINE_RT_WAKE_Q(wqh); unsigned long flags; /* irqsave required to support early boot calls */ @@ -1386,7 +1452,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ if (unlock_rt_mutex_safe(lock, flags) == true) - return false; + return; /* Relock the rtmutex and try again */ raw_spin_lock_irqsave(&lock->wait_lock, flags); } @@ -1397,481 +1463,437 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * * Queue the next waiter for wakeup once we release the wait_lock. */ - mark_wakeup_next_waiter(wake_q, lock); + mark_wakeup_next_waiter(&wqh, lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - return true; /* call rt_mutex_postunlock() */ + rt_mutex_wake_up_q(&wqh); } -/* - * debug aware fast / slowpath lock,trylock,unlock - * - * The atomic acquire/release ops are compiled away, when either the - * architecture does not support cmpxchg or when debugging is enabled. - */ -static inline int -rt_mutex_fastlock(struct rt_mutex *lock, int state, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) +static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 0; + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); + rt_mutex_slowunlock(lock); } -static inline int -rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) +#ifdef CONFIG_SMP +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) { - if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 0; + bool res = true; - return slowfn(lock, state, timeout, chwalk); + rcu_read_lock(); + for (;;) { + /* If owner changed, trylock again. */ + if (owner != rt_mutex_owner(lock)) + break; + /* + * Ensure that @owner is dereferenced after checking that + * the lock owner still matches @owner. If that fails, + * @owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + /* + * Stop spinning when: + * - the lock owner has been scheduled out + * - current is not longer the top waiter + * - current is requested to reschedule (redundant + * for CONFIG_PREEMPT_RCU=y) + * - the VCPU on which owner runs is preempted + */ + if (!owner_on_cpu(owner) || need_resched() || + !rt_mutex_waiter_is_top_waiter(lock, waiter)) { + res = false; + break; + } + cpu_relax(); + } + rcu_read_unlock(); + return res; } - -static inline int -rt_mutex_fasttrylock(struct rt_mutex *lock, - int (*slowfn)(struct rt_mutex *lock)) +#else +static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *owner) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 1; - - return slowfn(lock); + return false; } +#endif + +#ifdef RT_MUTEX_BUILD_MUTEX +/* + * Functions required for: + * - rtmutex, futex on all kernels + * - mutex and rwsem substitutions on RT kernels + */ /* - * Performs the wakeup of the the top-waiter and re-enables preemption. + * Remove a waiter from a lock and give up + * + * Must be called with lock->wait_lock held and interrupts disabled. It must + * have just failed to try_to_take_rt_mutex(). */ -void rt_mutex_postunlock(struct wake_q_head *wake_q) +static void __sched remove_waiter(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) { - wake_up_q(wake_q); + bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); + struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex_base *next_lock; - /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ - preempt_enable(); -} + lockdep_assert_held(&lock->wait_lock); -static inline void -rt_mutex_fastunlock(struct rt_mutex *lock, - bool (*slowfn)(struct rt_mutex *lock, - struct wake_q_head *wqh)) -{ - DEFINE_WAKE_Q(wake_q); + raw_spin_lock(¤t->pi_lock); + rt_mutex_dequeue(lock, waiter); + current->pi_blocked_on = NULL; + raw_spin_unlock(¤t->pi_lock); - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + /* + * Only update priority if the waiter was the highest priority + * waiter of the lock and there is an owner to update. + */ + if (!owner || !is_top_waiter) return; - if (slowfn(lock, &wake_q)) - rt_mutex_postunlock(&wake_q); -} + raw_spin_lock(&owner->pi_lock); -/** - * rt_mutex_lock - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - */ -void __sched rt_mutex_lock(struct rt_mutex *lock) -{ - might_sleep(); + rt_mutex_dequeue_pi(owner, waiter); - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock); + if (rt_mutex_has_waiters(lock)) + rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); -/** - * rt_mutex_lock_interruptible - lock a rt_mutex interruptible - * - * @lock: the rt_mutex to be locked - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) -{ - int ret; + rt_mutex_adjust_prio(lock, owner); - might_sleep(); + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); - if (ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); + raw_spin_unlock(&owner->pi_lock); - return ret; -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + /* + * Don't walk the chain, if the owner task is not blocked + * itself. + */ + if (!next_lock) + return; -/* - * Futex variant, must not use fastpath. - */ -int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) -{ - return rt_mutex_slowtrylock(lock); + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + + raw_spin_unlock_irq(&lock->wait_lock); + + rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, + next_lock, NULL, current); + + raw_spin_lock_irq(&lock->wait_lock); } /** - * rt_mutex_timed_lock - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller - * - * @lock: the rt_mutex to be locked - * @timeout: timeout structure or NULL (no timeout) + * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @ww_ctx: WW mutex context pointer + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter + * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -ETIMEDOUT when the timeout expired + * Must be called with lock->wait_lock held and interrupts disabled */ -int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) +static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter, + struct wake_q_head *wake_q) + __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { - int ret; + struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); + struct task_struct *owner; + int ret = 0; - might_sleep(); + lockevent_inc(rtmutex_slow_block); + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock, current, waiter)) { + lockevent_inc(rtmutex_slow_acq3); + break; + } - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - RT_MUTEX_MIN_CHAINWALK, - rt_mutex_slowlock); - if (ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); + if (timeout && !timeout->task) { + ret = -ETIMEDOUT; + break; + } + if (signal_pending_state(state, current)) { + ret = -EINTR; + break; + } - return ret; -} -EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + if (build_ww_mutex() && ww_ctx) { + ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx); + if (ret) + break; + } -/** - * rt_mutex_trylock - try to lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * - * This function can only be called in thread context. It's safe to - * call it from atomic regions, but not from hard interrupt or soft - * interrupt context. - * - * Returns 1 on success and 0 on contention - */ -int __sched rt_mutex_trylock(struct rt_mutex *lock) -{ - int ret; + if (waiter == rt_mutex_top_waiter(lock)) + owner = rt_mutex_owner(lock); + else + owner = NULL; + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) - return 0; + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) { + lockevent_inc(rtmutex_slow_sleep); + rt_mutex_schedule(); + } - ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + raw_spin_lock_irq(&lock->wait_lock); + set_current_state(state); + } + __set_current_state(TASK_RUNNING); return ret; } -EXPORT_SYMBOL_GPL(rt_mutex_trylock); -/** - * rt_mutex_unlock - unlock a rt_mutex - * - * @lock: the rt_mutex to be unlocked - */ -void __sched rt_mutex_unlock(struct rt_mutex *lock) +static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_base *lock, + struct rt_mutex_waiter *w) { - mutex_release(&lock->dep_map, 1, _RET_IP_); - rt_mutex_fastunlock(lock, rt_mutex_slowunlock); + /* + * If the result is not -EDEADLOCK or the caller requested + * deadlock detection, nothing to do here. + */ + if (res != -EDEADLOCK || detect_deadlock) + return; + + if (build_ww_mutex() && w->ww_ctx) + return; + + raw_spin_unlock_irq(&lock->wait_lock); + + WARN(1, "rtmutex deadlock detected\n"); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + rt_mutex_schedule(); + } } -EXPORT_SYMBOL_GPL(rt_mutex_unlock); /** - * Futex variant, that since futex variants do not use the fast-path, can be - * simple and will not need to retry. + * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held + * @lock: The rtmutex to block lock + * @ww_ctx: WW mutex context pointer + * @state: The task state for sleeping + * @chwalk: Indicator whether full or partial chainwalk is requested + * @waiter: Initializer waiter for blocking + * @wake_q: The wake_q to wake tasks after we release the wait_lock */ -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) +static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state, + enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *waiter, + struct wake_q_head *wake_q) { + struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); + struct ww_mutex *ww = ww_container_of(rtm); + int ret; + lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtmutex_slowlock); - debug_rt_mutex_unlock(lock); + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock, current, NULL)) { + if (build_ww_mutex() && ww_ctx) { + __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); + ww_mutex_lock_acquired(ww, ww_ctx); + } + lockevent_inc(rtmutex_slow_acq1); + return 0; + } + + set_current_state(state); - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - return false; /* done */ + trace_contention_begin(lock, LCB_F_RT); + + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q); + if (likely(!ret)) + ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q); + + if (likely(!ret)) { + /* acquired the lock */ + if (build_ww_mutex() && ww_ctx) { + if (!ww_ctx->is_wait_die) + __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); + ww_mutex_lock_acquired(ww, ww_ctx); + } + lockevent_inc(rtmutex_slow_acq2); + } else { + __set_current_state(TASK_RUNNING); + remove_waiter(lock, waiter); + rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); + lockevent_inc(rtmutex_deadlock); } /* - * We've already deboosted, mark_wakeup_next_waiter() will - * retain preempt_disabled when we drop the wait_lock, to - * avoid inversion prior to the wakeup. preempt_disable() - * therein pairs with rt_mutex_postunlock(). + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. */ - mark_wakeup_next_waiter(wake_q, lock); + fixup_rt_mutex_waiters(lock, true); + + trace_contention_end(lock, ret); - return true; /* call postunlock() */ + return ret; } -void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state, + struct wake_q_head *wake_q) { - DEFINE_WAKE_Q(wake_q); - bool postunlock; + struct rt_mutex_waiter waiter; + int ret; - raw_spin_lock_irq(&lock->wait_lock); - postunlock = __rt_mutex_futex_unlock(lock, &wake_q); - raw_spin_unlock_irq(&lock->wait_lock); + rt_mutex_init_waiter(&waiter); + waiter.ww_ctx = ww_ctx; + + ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK, + &waiter, wake_q); - if (postunlock) - rt_mutex_postunlock(&wake_q); + debug_rt_mutex_free_waiter(&waiter); + lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q)); + return ret; } -/** - * rt_mutex_destroy - mark a mutex unusable - * @lock: the mutex to be destroyed - * - * This function marks the mutex uninitialized, and any subsequent - * use of the mutex is forbidden. The mutex must not be locked when - * this function is called. +/* + * rt_mutex_slowlock - Locking slowpath invoked when fast path fails + * @lock: The rtmutex to block lock + * @ww_ctx: WW mutex context pointer + * @state: The task state for sleeping */ -void rt_mutex_destroy(struct rt_mutex *lock) +static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, + struct ww_acquire_ctx *ww_ctx, + unsigned int state) { - WARN_ON(rt_mutex_is_locked(lock)); -#ifdef CONFIG_DEBUG_RT_MUTEXES - lock->magic = NULL; -#endif + DEFINE_WAKE_Q(wake_q); + unsigned long flags; + int ret; + + /* + * Do all pre-schedule work here, before we queue a waiter and invoke + * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would + * otherwise recurse back into task_blocks_on_rt_mutex() through + * rtlock_slowlock() and will then enqueue a second waiter for this + * same task and things get really confusing real fast. + */ + rt_mutex_pre_schedule(); + + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); + ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); + rt_mutex_post_schedule(); + + return ret; } -EXPORT_SYMBOL_GPL(rt_mutex_destroy); -/** - * __rt_mutex_init - initialize the rt lock - * - * @lock: the rt lock to be initialized - * - * Initialize the rt lock to unlocked state. - * - * Initializing of a locked rt lock is not allowed - */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name, - struct lock_class_key *key) +static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, + unsigned int state) { - lock->owner = NULL; - raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT; - lock->waiters_leftmost = NULL; + lockdep_assert(!current->pi_blocked_on); - if (name && key) - debug_rt_mutex_init(lock, name, key); + if (likely(rt_mutex_try_acquire(lock))) + return 0; + + return rt_mutex_slowlock(lock, NULL, state); } -EXPORT_SYMBOL_GPL(__rt_mutex_init); +#endif /* RT_MUTEX_BUILD_MUTEX */ -/** - * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a - * proxy owner - * - * @lock: the rt_mutex to be locked - * @proxy_owner:the task to set as owner - * - * No locking. Caller has to do serializing itself - * - * Special API call for PI-futex support. This initializes the rtmutex and - * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not - * possible at this point because the pi_state which contains the rtmutex - * is not yet visible to other tasks. +#ifdef RT_MUTEX_BUILD_SPINLOCKS +/* + * Functions required for spin/rw_lock substitution on RT kernels */ -void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - __rt_mutex_init(lock, NULL, NULL); - debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner); -} /** - * rt_mutex_proxy_unlock - release a lock on behalf of owner - * - * @lock: the rt_mutex to be locked - * - * No locking. Caller has to do serializing itself - * - * Special API call for PI-futex support. This merrily cleans up the rtmutex - * (debugging) state. Concurrent operations on this rt_mutex are not - * possible because it belongs to the pi_state which is about to be freed - * and it is not longer visible to other tasks. + * rtlock_slowlock_locked - Slow path lock acquisition for RT locks + * @lock: The underlying RT mutex + * @wake_q: The wake_q to wake tasks after we release the wait_lock */ -void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner) +static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, + struct wake_q_head *wake_q) + __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { - debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL); -} - -int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) -{ - int ret; - - if (try_to_take_rt_mutex(lock, task, NULL)) - return 1; + struct rt_mutex_waiter waiter; + struct task_struct *owner; - /* We enforce deadlock detection for futexes */ - ret = task_blocks_on_rt_mutex(lock, waiter, task, - RT_MUTEX_FULL_CHAINWALK); + lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtlock_slowlock); - if (ret && !rt_mutex_owner(lock)) { - /* - * Reset the return value. We might have - * returned with -EDEADLK and the owner - * released the lock while we were walking the - * pi chain. Let the waiter sort it out. - */ - ret = 0; + if (try_to_take_rt_mutex(lock, current, NULL)) { + lockevent_inc(rtlock_slow_acq1); + return; } - if (unlikely(ret)) - remove_waiter(lock, waiter); + rt_mutex_init_rtlock_waiter(&waiter); - debug_rt_mutex_print_deadlock(waiter); + /* Save current state and set state to TASK_RTLOCK_WAIT */ + current_save_and_set_rtlock_wait_state(); - return ret; -} + trace_contention_begin(lock, LCB_F_RT); -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for FUTEX_REQUEUE_PI support. - */ -int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) -{ - int ret; + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK, wake_q); - raw_spin_lock_irq(&lock->wait_lock); - ret = __rt_mutex_start_proxy_lock(lock, waiter, task); - raw_spin_unlock_irq(&lock->wait_lock); + for (;;) { + /* Try to acquire the lock again */ + if (try_to_take_rt_mutex(lock, current, &waiter)) { + lockevent_inc(rtlock_slow_acq2); + break; + } - return ret; -} + if (&waiter == rt_mutex_top_waiter(lock)) + owner = rt_mutex_owner(lock); + else + owner = NULL; + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); -/** - * rt_mutex_next_owner - return the next owner of the lock - * - * @lock: the rt lock query - * - * Returns the next owner of the lock or NULL - * - * Caller has to serialize against other accessors to the lock - * itself. - * - * Special API call for PI-futex support - */ -struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - return NULL; + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) { + lockevent_inc(rtlock_slow_sleep); + schedule_rtlock(); + } - return rt_mutex_top_waiter(lock)->task; -} + raw_spin_lock_irq(&lock->wait_lock); + set_current_state(TASK_RTLOCK_WAIT); + } -/** - * rt_mutex_wait_proxy_lock() - Wait for lock acquisition - * @lock: the rt_mutex we were woken on - * @to: the timeout, null if none. hrtimer should already have - * been started. - * @waiter: the pre-initialized rt_mutex_waiter - * - * Wait for the the lock acquisition started on our behalf by - * rt_mutex_start_proxy_lock(). Upon failure, the caller must call - * rt_mutex_cleanup_proxy_lock(). - * - * Returns: - * 0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT - * - * Special API call for PI-futex support - */ -int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter) -{ - int ret; + /* Restore the task state */ + current_restore_rtlock_saved_state(); - raw_spin_lock_irq(&lock->wait_lock); - /* sleep on the mutex */ - set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. + * try_to_take_rt_mutex() sets the waiter bit unconditionally. + * We might have to fix that up: */ - fixup_rt_mutex_waiters(lock); - raw_spin_unlock_irq(&lock->wait_lock); + fixup_rt_mutex_waiters(lock, true); + debug_rt_mutex_free_waiter(&waiter); - return ret; + trace_contention_end(lock, 0); + lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q)); } -/** - * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition - * @lock: the rt_mutex we were woken on - * @waiter: the pre-initialized rt_mutex_waiter - * - * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). - * - * Unless we acquired the lock; we're still enqueued on the wait-list and can - * in fact still be granted ownership until we're removed. Therefore we can - * find we are in fact the owner and must disregard the - * rt_mutex_wait_proxy_lock() failure. - * - * Returns: - * true - did the cleanup, we done. - * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, - * caller should disregards its return value. - * - * Special API call for PI-futex support - */ -bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) +static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) { - bool cleanup = false; - - raw_spin_lock_irq(&lock->wait_lock); - /* - * Do an unconditional try-lock, this deals with the lock stealing - * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() - * sets a NULL owner. - * - * We're not interested in the return value, because the subsequent - * test on rt_mutex_owner() will infer that. If the trylock succeeded, - * we will own the lock and it will have removed the waiter. If we - * failed the trylock, we're still not owner and we need to remove - * ourselves. - */ - try_to_take_rt_mutex(lock, current, waiter); - /* - * Unless we're the owner; we're still enqueued on the wait_list. - * So check if we became owner, if not, take us off the wait_list. - */ - if (rt_mutex_owner(lock) != current) { - remove_waiter(lock, waiter); - cleanup = true; - } - /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock_irq(&lock->wait_lock); + unsigned long flags; + DEFINE_WAKE_Q(wake_q); - return cleanup; + raw_spin_lock_irqsave(&lock->wait_lock, flags); + rtlock_slowlock_locked(lock, &wake_q); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); } + +#endif /* RT_MUTEX_BUILD_SPINLOCKS */ diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h deleted file mode 100644 index 5c253caffe91..000000000000 --- a/kernel/locking/rtmutex.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - * This file contains macros used solely by rtmutex.c. - * Non-debug version. - */ - -#define rt_mutex_deadlock_check(l) (0) -#define debug_rt_mutex_init_waiter(w) do { } while (0) -#define debug_rt_mutex_free_waiter(w) do { } while (0) -#define debug_rt_mutex_lock(l) do { } while (0) -#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) -#define debug_rt_mutex_proxy_unlock(l) do { } while (0) -#define debug_rt_mutex_unlock(l) do { } while (0) -#define debug_rt_mutex_init(m, n, k) do { } while (0) -#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) -#define debug_rt_mutex_print_deadlock(w) do { } while (0) -#define debug_rt_mutex_reset_waiter(w) do { } while (0) - -static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) -{ - WARN(1, "rtmutex deadlock detected\n"); -} - -static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, - enum rtmutex_chainwalk walk) -{ - return walk == RT_MUTEX_FULL_CHAINWALK; -} diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c new file mode 100644 index 000000000000..59dbd29cb219 --- /dev/null +++ b/kernel/locking/rtmutex_api.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * rtmutex API + */ +#include <linux/spinlock.h> +#include <linux/export.h> + +#define RT_MUTEX_BUILD_MUTEX +#include "rtmutex.c" + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +static const struct ctl_table rtmutex_sysctl_table[] = { + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_rtmutex_sysctl(void) +{ + register_sysctl_init("kernel", rtmutex_sysctl_table); + return 0; +} + +subsys_initcall(init_rtmutex_sysctl); + +/* + * Debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. + */ +static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, + unsigned int state, + struct lockdep_map *nest_lock, + unsigned int subclass) +{ + int ret; + + might_sleep(); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_); + ret = __rt_mutex_lock(&lock->rtmutex, state); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; +} + +void rt_mutex_base_init(struct rt_mutex_base *rtb) +{ + __rt_mutex_base_init(rtb); +} +EXPORT_SYMBOL(rt_mutex_base_init); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/** + * rt_mutex_lock_nested - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * @subclass: the lockdep subclass + */ +void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); + +void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0); +} +EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); +#endif + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * This function can only be called in thread context. It's safe to call it + * from atomic regions, but not from hard or soft interrupt context. + * + * Returns: + * 1 on success + * 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + mutex_release(&lock->dep_map, _RET_IP_); + __rt_mutex_unlock(&lock->rtmutex); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/* + * Futex variants, must not use fastpath. + */ +int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock) +{ + return rt_mutex_slowtrylock(lock); +} + +int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + +/** + * __rt_mutex_futex_unlock - Futex variant, that since futex variants + * do not use the fast-path, can be simple and will not need to retry. + * + * @lock: The rt_mutex to be unlocked + * @wqh: The wake queue head from which to get the next lock waiter + */ +bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock, + struct rt_wake_q_head *wqh) +{ + lockdep_assert_held(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + return false; /* done */ + } + + /* + * mark_wakeup_next_waiter() deboosts and retains preemption + * disabled when dropping the wait_lock, to avoid inversion prior + * to the wakeup. preempt_disable() therein pairs with the + * preempt_enable() in rt_mutex_postunlock(). + */ + mark_wakeup_next_waiter(wqh, lock); + + return true; /* call postunlock() */ +} + +void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock) +{ + DEFINE_RT_WAKE_Q(wqh); + unsigned long flags; + bool postunlock; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + postunlock = __rt_mutex_futex_unlock(lock, &wqh); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (postunlock) + rt_mutex_postunlock(&wqh); +} + +/** + * __rt_mutex_init - initialize the rt_mutex + * + * @lock: The rt_mutex to be initialized + * @name: The lock name used for debugging + * @key: The lock class key used for debugging + * + * Initialize the rt_mutex to unlocked state. + * + * Initializing of a locked rt_mutex is not allowed + */ +void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + __rt_mutex_base_init(&lock->rtmutex); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); + +/** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a + * proxy owner + * + * @lock: the rt_mutex to be locked + * @proxy_owner:the task to set as owner + * + * No locking. Caller has to do serializing itself + * + * Special API call for PI-futex support. This initializes the rtmutex and + * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not + * possible at this point because the pi_state which contains the rtmutex + * is not yet visible to other tasks. + */ +void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, + struct task_struct *proxy_owner) +{ + static struct lock_class_key pi_futex_key; + + __rt_mutex_base_init(lock); + /* + * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping' + * and rtmutex based. That causes a lockdep false positive, because + * some of the futex functions invoke spin_unlock(&hb->lock) with + * the wait_lock of the rtmutex associated to the pi_futex held. + * spin_unlock() in turn takes wait_lock of the rtmutex on which + * the spinlock is based, which makes lockdep notice a lock + * recursion. Give the futex/rtmutex wait_lock a separate key. + */ + lockdep_set_class(&lock->wait_lock, &pi_futex_key); + rt_mutex_set_owner(lock, proxy_owner); +} + +/** + * rt_mutex_proxy_unlock - release a lock on behalf of owner + * + * @lock: the rt_mutex to be locked + * + * No locking. Caller has to do serializing itself + * + * Special API call for PI-futex support. This just cleans up the rtmutex + * (debugging) state. Concurrent operations on this rt_mutex are not + * possible because it belongs to the pi_state which is about to be freed + * and it is not longer visible to other tasks. + */ +void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) +{ + debug_rt_mutex_proxy_unlock(lock); + rt_mutex_clear_owner(lock); +} + +/** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * @wake_q: the wake_q to wake tasks after we release the wait_lock + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: does _NOT_ remove the @waiter on failure; must either call + * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ +int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + struct wake_q_head *wake_q) +{ + int ret; + + lockdep_assert_held(&lock->wait_lock); + + if (try_to_take_rt_mutex(lock, task, NULL)) + return 1; + + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL, + RT_MUTEX_FULL_CHAINWALK, wake_q); + + if (ret && !rt_mutex_owner(lock)) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + + return ret; +} + +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter + * on failure. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ +int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irq(&lock->wait_lock); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q); + if (unlikely(ret)) + remove_waiter(lock, waiter); + preempt_disable(); + raw_spin_unlock_irq(&lock->wait_lock); + wake_up_q(&wake_q); + preempt_enable(); + + return ret; +} + +/** + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * + * Wait for the lock acquisition started on our behalf by + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call + * rt_mutex_cleanup_proxy_lock(). + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT + * + * Special API call for PI-futex support + */ +int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); + ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL); + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock, true); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition + * @lock: the rt_mutex we were woken on + * @waiter: the pre-initialized rt_mutex_waiter + * + * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or + * rt_mutex_wait_proxy_lock(). + * + * Unless we acquired the lock; we're still enqueued on the wait-list and can + * in fact still be granted ownership until we're removed. Therefore we can + * find we are in fact the owner and must disregard the + * rt_mutex_wait_proxy_lock() failure. + * + * Returns: + * true - did the cleanup, we done. + * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, + * caller should disregards its return value. + * + * Special API call for PI-futex support + */ +bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) +{ + bool cleanup = false; + + raw_spin_lock_irq(&lock->wait_lock); + /* + * Do an unconditional try-lock, this deals with the lock stealing + * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() + * sets a NULL owner. + * + * We're not interested in the return value, because the subsequent + * test on rt_mutex_owner() will infer that. If the trylock succeeded, + * we will own the lock and it will have removed the waiter. If we + * failed the trylock, we're still not owner and we need to remove + * ourselves. + */ + try_to_take_rt_mutex(lock, current, waiter); + /* + * Unless we're the owner; we're still enqueued on the wait_list. + * So check if we became owner, if not, take us off the wait_list. + */ + if (rt_mutex_owner(lock) != current) { + remove_waiter(lock, waiter); + cleanup = true; + } + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock, false); + + raw_spin_unlock_irq(&lock->wait_lock); + + return cleanup; +} + +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void __sched rt_mutex_adjust_pi(struct task_struct *task) +{ + struct rt_mutex_waiter *waiter; + struct rt_mutex_base *next_lock; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + next_lock = waiter->lock; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + + rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, + next_lock, NULL, task); +} + +/* + * Performs the wakeup of the top-waiter and re-enables preemption. + */ +void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh) +{ + rt_mutex_wake_up_q(wqh); +} + +#ifdef CONFIG_DEBUG_RT_MUTEXES +void rt_mutex_debug_task_free(struct task_struct *task) +{ + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); +} +#endif + +#ifdef CONFIG_PREEMPT_RT +/* Mutexes */ +static void __mutex_rt_init_generic(struct mutex *mutex) +{ + rt_mutex_base_init(&mutex->rtmutex); + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); +} + +static __always_inline int __mutex_lock_common(struct mutex *lock, + unsigned int state, + unsigned int subclass, + struct lockdep_map *nest_lock, + unsigned long ip) +{ + int ret; + + might_sleep(); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + ret = __rt_mutex_lock(&lock->rtmutex, state); + if (ret) + mutex_release(&lock->dep_map, ip); + else + lock_acquired(&lock->dep_map, ip); + return ret; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key) +{ + __mutex_rt_init_generic(mutex); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_rt_init_lockdep); + +void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_nested); + +void __sched _mutex_lock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_); +} +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); + +int __sched mutex_lock_interruptible_nested(struct mutex *lock, + unsigned int subclass) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + +int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest_lock) +{ + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_); +} +EXPORT_SYMBOL_GPL(_mutex_lock_killable); + +void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) +{ + int token; + + might_sleep(); + + token = io_schedule_prepare(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io_nested); + +int __sched _mutex_trylock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); +#else /* CONFIG_DEBUG_LOCK_ALLOC */ + +void mutex_rt_init_generic(struct mutex *mutex) +{ + __mutex_rt_init_generic(mutex); +} +EXPORT_SYMBOL(mutex_rt_init_generic); + +void __sched mutex_lock(struct mutex *lock) +{ + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock); + +int __sched mutex_lock_interruptible(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock_interruptible); + +int __sched mutex_lock_killable(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); +} +EXPORT_SYMBOL(mutex_lock_killable); + +void __sched mutex_lock_io(struct mutex *lock) +{ + int token = io_schedule_prepare(); + + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + io_schedule_finish(token); +} +EXPORT_SYMBOL(mutex_lock_io); + +int __sched mutex_trylock(struct mutex *lock) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + return __rt_mutex_trylock(&lock->rtmutex); +} +EXPORT_SYMBOL(mutex_trylock); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + +void __sched mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, _RET_IP_); + __rt_mutex_unlock(&lock->rtmutex); +} +EXPORT_SYMBOL(mutex_unlock); + +#endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 72ad45a9a794..cf6ddd1b23a2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * RT Mutexes: blocking mutual exclusion locks with PI support * @@ -12,73 +13,144 @@ #ifndef __KERNEL_RTMUTEX_COMMON_H #define __KERNEL_RTMUTEX_COMMON_H +#include <linux/debug_locks.h> #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> + +/* + * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two + * separate trees and they need their own copy of the sort keys because of + * different locking requirements. + * + * @entry: rbtree node to enqueue into the waiters tree + * @prio: Priority of the waiter + * @deadline: Deadline of the waiter if applicable + * + * See rt_waiter_node_less() and waiter_*_prio(). + */ +struct rt_waiter_node { + struct rb_node entry; + int prio; + u64 deadline; +}; + /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * - * @tree_entry: pi node to enqueue into the mutex waiters tree - * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree + * @tree: node to enqueue into the mutex waiters tree + * @pi_tree: node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task + * @lock: Pointer to the rt_mutex on which the waiter blocks + * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT) + * @ww_ctx: WW context pointer + * + * @tree is ordered by @lock->wait_lock + * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock */ struct rt_mutex_waiter { - struct rb_node tree_entry; - struct rb_node pi_tree_entry; + struct rt_waiter_node tree; + struct rt_waiter_node pi_tree; struct task_struct *task; - struct rt_mutex *lock; -#ifdef CONFIG_DEBUG_RT_MUTEXES - unsigned long ip; - struct pid *deadlock_task_pid; - struct rt_mutex *deadlock_lock; -#endif - int prio; - u64 deadline; + struct rt_mutex_base *lock; + unsigned int wake_state; + struct ww_acquire_ctx *ww_ctx; }; +/** + * struct rt_wake_q_head - Wrapper around regular wake_q_head to support + * "sleeping" spinlocks on RT + * @head: The regular wake_q_head for sleeping lock variants + * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups + */ +struct rt_wake_q_head { + struct wake_q_head head; + struct task_struct *rtlock_task; +}; + +#define DEFINE_RT_WAKE_Q(name) \ + struct rt_wake_q_head name = { \ + .head = WAKE_Q_HEAD_INITIALIZER(name.head), \ + .rtlock_task = NULL, \ + } + /* - * Various helpers to access the waiters-tree: + * PI-futex support (proxy locking functions, etc.): */ -static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); +extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + struct wake_q_head *); +extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter); +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter); + +extern int rt_mutex_futex_trylock(struct rt_mutex_base *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l); + +extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock); +extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock, + struct rt_wake_q_head *wqh); + +extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh); + +/* + * Must be guarded because this header is included from rcu/tree_plugin.h + * unconditionally. + */ +#ifdef CONFIG_RT_MUTEXES +static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) { - return !RB_EMPTY_ROOT(&lock->waiters); + return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } -static inline struct rt_mutex_waiter * -rt_mutex_top_waiter(struct rt_mutex *lock) +/* + * Lockless speculative check whether @waiter is still the top waiter on + * @lock. This is solely comparing pointers and not derefencing the + * leftmost entry which might be about to vanish. + */ +static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, + struct rt_mutex_waiter *waiter) { - struct rt_mutex_waiter *w; + struct rb_node *leftmost = rb_first_cached(&lock->waiters); - w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, - tree_entry); - BUG_ON(w->lock != lock); - - return w; + return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter; } -static inline int task_has_pi_waiters(struct task_struct *p) +static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) { - return !RB_EMPTY_ROOT(&p->pi_waiters); + struct rb_node *leftmost = rb_first_cached(&lock->waiters); + struct rt_mutex_waiter *w = NULL; + + lockdep_assert_held(&lock->wait_lock); + + if (leftmost) { + w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry); + BUG_ON(w->lock != lock); + } + return w; } -static inline struct rt_mutex_waiter * -task_top_pi_waiter(struct task_struct *p) +static inline int task_has_pi_waiters(struct task_struct *p) { - return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } -/* - * lock->owner state tracking: - */ -#define RT_MUTEX_HAS_WAITERS 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) { - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + lockdep_assert_held(&p->pi_lock); - return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); + return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, + pi_tree.entry); } /* @@ -96,39 +168,59 @@ enum rtmutex_chainwalk { RT_MUTEX_FULL_CHAINWALK, }; -/* - * PI-futex support (proxy locking functions, etc.): - */ -extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); -extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); -extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task); -extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task); -extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter); -extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter); +static inline void __rt_mutex_base_init(struct rt_mutex_base *lock) +{ + raw_spin_lock_init(&lock->wait_lock); + lock->waiters = RB_ROOT_CACHED; + lock->owner = NULL; +} -extern int rt_mutex_futex_trylock(struct rt_mutex *l); +/* Debug functions */ +static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); +} -extern void rt_mutex_futex_unlock(struct rt_mutex *lock); -extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh); +static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); +} -extern void rt_mutex_postunlock(struct wake_q_head *wake_q); +static inline void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + memset(waiter, 0x11, sizeof(*waiter)); +} -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif +static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + memset(waiter, 0x22, sizeof(*waiter)); +} + +static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree.entry); + RB_CLEAR_NODE(&waiter->tree.entry); + waiter->wake_state = TASK_NORMAL; + waiter->task = NULL; +} + +static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter) +{ + rt_mutex_init_waiter(waiter); + waiter->wake_state = TASK_RTLOCK_WAIT; +} + +#else /* CONFIG_RT_MUTEXES */ +/* Used in rcu/tree_plugin.h */ +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) +{ + return NULL; +} +#endif /* !CONFIG_RT_MUTEXES */ #endif diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c new file mode 100644 index 000000000000..9f4322c07486 --- /dev/null +++ b/kernel/locking/rwbase_rt.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * RT-specific reader/writer semaphores and reader/writer locks + * + * down_write/write_lock() + * 1) Lock rtmutex + * 2) Remove the reader BIAS to force readers into the slow path + * 3) Wait until all readers have left the critical section + * 4) Mark it write locked + * + * up_write/write_unlock() + * 1) Remove the write locked marker + * 2) Set the reader BIAS, so readers can use the fast path again + * 3) Unlock rtmutex, to release blocked readers + * + * down_read/read_lock() + * 1) Try fast path acquisition (reader BIAS is set) + * 2) Take tmutex::wait_lock, which protects the writelocked flag + * 3) If !writelocked, acquire it for read + * 4) If writelocked, block on tmutex + * 5) unlock rtmutex, goto 1) + * + * up_read/read_unlock() + * 1) Try fast path release (reader count != 1) + * 2) Wake the writer waiting in down_write()/write_lock() #3 + * + * down_read/read_lock()#3 has the consequence, that rw semaphores and rw + * locks on RT are not writer fair, but writers, which should be avoided in + * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL + * inheritance mechanism. + * + * It's possible to make the rw primitives writer fair by keeping a list of + * active readers. A blocked writer would force all newly incoming readers + * to block on the rtmutex, but the rtmutex would have to be proxy locked + * for one reader after the other. We can't use multi-reader inheritance + * because there is no way to support that with SCHED_DEADLINE. + * Implementing the one by one reader boosting/handover mechanism is a + * major surgery for a very dubious value. + * + * The risk of writer starvation is there, but the pathological use cases + * which trigger it are not necessarily the typical RT workloads. + * + * Fast-path orderings: + * The lock/unlock of readers can run in fast paths: lock and unlock are only + * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE + * semantics of rwbase_rt. Atomic ops should thus provide _acquire() + * and _release() (or stronger). + * + * Common code shared between RT rw_semaphore and rwlock + */ + +static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) +{ + int r; + + /* + * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is + * set. + */ + for (r = atomic_read(&rwb->readers); r < 0;) { + if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1))) + return 1; + } + return 0; +} + +static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + DEFINE_WAKE_Q(wake_q); + int ret; + + rwbase_pre_schedule(); + raw_spin_lock_irq(&rtm->wait_lock); + + /* + * Call into the slow lock path with the rtmutex->wait_lock + * held, so this can't result in the following race: + * + * Reader1 Reader2 Writer + * down_read() + * down_write() + * rtmutex_lock(m) + * wait() + * down_read() + * unlock(m->wait_lock) + * up_read() + * wake(Writer) + * lock(m->wait_lock) + * sem->writelocked=true + * unlock(m->wait_lock) + * + * up_write() + * sem->writelocked=false + * rtmutex_unlock(m) + * down_read() + * down_write() + * rtmutex_lock(m) + * wait() + * rtmutex_lock(m) + * + * That would put Reader1 behind the writer waiting on + * Reader2 to call up_read(), which might be unbound. + */ + + trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ); + + /* + * For rwlocks this returns 0 unconditionally, so the below + * !ret conditionals are optimized out. + */ + ret = rwbase_rtmutex_slowlock_locked(rtm, state, &wake_q); + + /* + * On success the rtmutex is held, so there can't be a writer + * active. Increment the reader count and immediately drop the + * rtmutex again. + * + * rtmutex->wait_lock has to be unlocked in any case of course. + */ + if (!ret) + atomic_inc(&rwb->readers); + + preempt_disable(); + raw_spin_unlock_irq(&rtm->wait_lock); + wake_up_q(&wake_q); + preempt_enable(); + + if (!ret) + rwbase_rtmutex_unlock(rtm); + + trace_contention_end(rwb, ret); + rwbase_post_schedule(); + return ret; +} + +static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + lockdep_assert(!current->pi_blocked_on); + + if (rwbase_read_trylock(rwb)) + return 0; + + return __rwbase_read_lock(rwb, state); +} + +static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + struct task_struct *owner; + DEFINE_RT_WAKE_Q(wqh); + + raw_spin_lock_irq(&rtm->wait_lock); + /* + * Wake the writer, i.e. the rtmutex owner. It might release the + * rtmutex concurrently in the fast path (due to a signal), but to + * clean up rwb->readers it needs to acquire rtm->wait_lock. The + * worst case which can happen is a spurious wakeup. + */ + owner = rt_mutex_owner(rtm); + if (owner) + rt_mutex_wake_q_add_task(&wqh, owner, state); + + /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */ + preempt_disable(); + raw_spin_unlock_irq(&rtm->wait_lock); + rt_mutex_wake_up_q(&wqh); +} + +static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, + unsigned int state) +{ + /* + * rwb->readers can only hit 0 when a writer is waiting for the + * active readers to leave the critical section. + * + * dec_and_test() is fully ordered, provides RELEASE. + */ + if (unlikely(atomic_dec_and_test(&rwb->readers))) + __rwbase_read_unlock(rwb, state); +} + +static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, + unsigned long flags) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + + /* + * _release() is needed in case that reader is in fast path, pairing + * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock(). + */ + (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers); + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_rtmutex_unlock(rtm); +} + +static inline void rwbase_write_unlock(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + __rwbase_write_unlock(rwb, WRITER_BIAS, flags); +} + +static inline void rwbase_write_downgrade(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + /* Release it and account current as reader */ + __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags); +} + +static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb) +{ + /* Can do without CAS because we're serialized by wait_lock. */ + lockdep_assert_held(&rwb->rtmutex.wait_lock); + + /* + * _acquire is needed in case the reader is in the fast path, pairing + * with rwbase_read_unlock(), provides ACQUIRE. + */ + if (!atomic_read_acquire(&rwb->readers)) { + atomic_set(&rwb->readers, WRITER_BIAS); + return 1; + } + + return 0; +} + +static int __sched rwbase_write_lock(struct rwbase_rt *rwb, + unsigned int state) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + /* Take the rtmutex as a first step */ + if (rwbase_rtmutex_lock_state(rtm, state)) + return -EINTR; + + /* Force readers into slow path */ + atomic_sub(READER_BIAS, &rwb->readers); + + rwbase_pre_schedule(); + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + if (__rwbase_write_trylock(rwb)) + goto out_unlock; + + rwbase_set_and_save_current_state(state); + trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE); + for (;;) { + /* Optimized out for rwlocks */ + if (rwbase_signal_pending_state(state, current)) { + rwbase_restore_current_state(); + __rwbase_write_unlock(rwb, 0, flags); + rwbase_post_schedule(); + trace_contention_end(rwb, -EINTR); + return -EINTR; + } + + if (__rwbase_write_trylock(rwb)) + break; + + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_schedule(); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + + set_current_state(state); + } + rwbase_restore_current_state(); + trace_contention_end(rwb, 0); + +out_unlock: + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_post_schedule(); + return 0; +} + +static inline int rwbase_write_trylock(struct rwbase_rt *rwb) +{ + struct rt_mutex_base *rtm = &rwb->rtmutex; + unsigned long flags; + + if (!rwbase_rtmutex_trylock(rtm)) + return 0; + + atomic_sub(READER_BIAS, &rwb->readers); + + raw_spin_lock_irqsave(&rtm->wait_lock, flags); + if (__rwbase_write_trylock(rwb)) { + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + return 1; + } + __rwbase_write_unlock(rwb, 0, flags); + return 0; +} diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index 20819df98125..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,319 +0,0 @@ -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> - * - Derived also from comments by Linus - */ -#include <linux/rwsem.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/export.h> - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->count != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). - */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->count += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -void __sched __down_read(struct rw_semaphore *sem) -{ - struct rwsem_waiter waiter; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - set_current_state(TASK_UNINTERRUPTIBLE); - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(current); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - - __set_current_state(TASK_RUNNING); - out: - ; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -int __sched __down_write_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. - */ - if (sem->count == 0) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->count = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; - -out_nolock: - list_del(&waiter.list); - if (!list_empty(&sem->wait_list) && sem->count >= 0) - __rwsem_do_wake(sem, 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return -EINTR; -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_write_killable(struct rw_semaphore *sem) -{ - return __down_write_common(sem, TASK_KILLABLE); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count == 0) { - /* got the lock */ - sem->count = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->count == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c deleted file mode 100644 index 34e727f18e49..000000000000 --- a/kernel/locking/rwsem-xadd.c +++ /dev/null @@ -1,652 +0,0 @@ -/* rwsem.c: R/W semaphores: contention handling functions - * - * Written by David Howells (dhowells@redhat.com). - * Derived from arch/i386/kernel/semaphore.c - * - * Writer lock-stealing by Alex Shi <alex.shi@intel.com> - * and Michel Lespinasse <walken@google.com> - * - * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> - * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. - */ -#include <linux/rwsem.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/sched/signal.h> -#include <linux/sched/rt.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/debug.h> -#include <linux/osq_lock.h> - -#include "rwsem.h" - -/* - * Guide to the rw_semaphore's count field for common values. - * (32-bit case illustrated, similar for 64-bit) - * - * 0x0000000X (1) X readers active or attempting lock, no writer waiting - * X = #active_readers + #readers attempting to lock - * (X*ACTIVE_BIAS) - * - * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or - * attempting to read lock or write lock. - * - * 0xffff000X (1) X readers active or attempting lock, with waiters for lock - * X = #active readers + # readers attempting lock - * (X*ACTIVE_BIAS + WAITING_BIAS) - * (2) 1 writer attempting lock, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * (3) 1 writer active, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * - * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock - * (WAITING_BIAS + ACTIVE_BIAS) - * (2) 1 writer active or attempting lock, no waiters for lock - * (ACTIVE_WRITE_BIAS) - * - * 0xffff0000 (1) There are writers or readers queued but none active - * or in the process of attempting lock. - * (WAITING_BIAS) - * Note: writer can attempt to steal lock for this count by adding - * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count - * - * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. - * (ACTIVE_WRITE_BIAS + WAITING_BIAS) - * - * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking - * the count becomes more than 0 for successful lock acquisition, - * i.e. the case where there are only readers or nobody has lock. - * (1st and 2nd case above). - * - * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and - * checking the count becomes ACTIVE_WRITE_BIAS for successful lock - * acquisition (i.e. nobody else has lock or attempts lock). If - * unsuccessful, in rwsem_down_write_failed, we'll check to see if there - * are only waiters but none active (5th case above), and attempt to - * steal the lock. - * - */ - -/* - * Initialize an rwsem: - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER - sem->owner = NULL; - osq_lock_init(&sem->osq); -#endif -} - -EXPORT_SYMBOL(__init_rwsem); - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -enum rwsem_wake_type { - RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ - RWSEM_WAKE_READERS, /* Wake readers only */ - RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ -}; - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then: - * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) - * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) - * - there must be someone on the queue - * - the wait_lock must be held by the caller - * - tasks are marked for wakeup, the caller must later invoke wake_up_q() - * to actually wakeup the blocked task(s) and drop the reference count, - * preferably when the wait_lock is released - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only marked woken if downgrading is false - */ -static void __rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, - struct wake_q_head *wake_q) -{ - struct rwsem_waiter *waiter, *tmp; - long oldcount, woken = 0, adjustment = 0; - - /* - * Take a peek at the queue head waiter such that we can determine - * the wakeup(s) to perform. - */ - waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) { - /* - * Mark writer at the front of the queue for wakeup. - * Until the task is actually later awoken later by - * the caller, other writers are able to steal it. - * Readers, on the other hand, will block as they - * will notice the queued writer. - */ - wake_q_add(wake_q, waiter->task); - } - - return; - } - - /* - * Writers might steal the lock before we grant it to the next reader. - * We prefer to do the first reader grant before counting readers - * so we can bail out early if a writer stole the lock. - */ - if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_ACTIVE_READ_BIAS; - try_reader_grant: - oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* - * If the count is still less than RWSEM_WAITING_BIAS - * after removing the adjustment, it is assumed that - * a writer has stolen the lock. We have to undo our - * reader grant. - */ - if (atomic_long_add_return(-adjustment, &sem->count) < - RWSEM_WAITING_BIAS) - return; - - /* Last active locker left. Retry waking readers. */ - goto try_reader_grant; - } - /* - * It is not really necessary to set it to reader-owned here, - * but it gives the spinners an early indication that the - * readers now have the lock. - */ - rwsem_set_reader_owned(sem); - } - - /* - * Grant an infinite number of read locks to the readers at the front - * of the queue. We know that woken will be at least 1 as we accounted - * for above. Note we increment the 'active part' of the count by the - * number of readers before waking any processes up. - */ - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { - struct task_struct *tsk; - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) - break; - - woken++; - tsk = waiter->task; - - wake_q_add(wake_q, tsk); - list_del(&waiter->list); - /* - * Ensure that the last operation is setting the reader - * waiter to nil such that rwsem_down_read_failed() cannot - * race with do_exit() by always holding a reference count - * to the task to wakeup. - */ - smp_store_release(&waiter->task, NULL); - } - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - } - - if (adjustment) - atomic_long_add(adjustment, &sem->count); -} - -/* - * Wait for the read lock to be granted - */ -__visible -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - DEFINE_WAKE_Q(wake_q); - - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - /* wait to be given the lock */ - while (true) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!waiter.task) - break; - schedule(); - } - - __set_current_state(TASK_RUNNING); - return sem; -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -/* - * This function must be called with the sem->wait_lock held to prevent - * race conditions between checking the rwsem wait list and setting the - * sem->count accordingly. - */ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) -{ - /* - * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. - */ - if (count != RWSEM_WAITING_BIAS) - return false; - - /* - * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there - * are other tasks on the wait list, we need to add on WAITING_BIAS. - */ - count = list_is_singular(&sem->wait_list) ? - RWSEM_ACTIVE_WRITE_BIAS : - RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; - - if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) - == RWSEM_WAITING_BIAS) { - rwsem_set_owner(sem); - return true; - } - - return false; -} - -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -/* - * Try to acquire write lock before the writer has been put on wait queue. - */ -static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) -{ - long old, count = atomic_long_read(&sem->count); - - while (true) { - if (!(count == 0 || count == RWSEM_WAITING_BIAS)) - return false; - - old = atomic_long_cmpxchg_acquire(&sem->count, count, - count + RWSEM_ACTIVE_WRITE_BIAS); - if (old == count) { - rwsem_set_owner(sem); - return true; - } - - count = old; - } -} - -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner; - bool ret = true; - - if (need_resched()) - return false; - - rcu_read_lock(); - owner = READ_ONCE(sem->owner); - if (!rwsem_owner_is_writer(owner)) { - /* - * Don't spin if the rwsem is readers owned. - */ - ret = !rwsem_owner_is_reader(owner); - goto done; - } - - /* - * As lock holder preemption issue, we both skip spinning if task is not - * on cpu or its cpu is preempted - */ - ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -done: - rcu_read_unlock(); - return ret; -} - -/* - * Return true only if we can still spin on the owner field of the rwsem. - */ -static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner = READ_ONCE(sem->owner); - - if (!rwsem_owner_is_writer(owner)) - goto out; - - rcu_read_lock(); - while (sem->owner == owner) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking sem->owner still matches owner, if that fails, - * owner might point to free()d memory, if it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - /* - * abort spinning when need_resched or owner is not running or - * owner's cpu is preempted. - */ - if (!owner->on_cpu || need_resched() || - vcpu_is_preempted(task_cpu(owner))) { - rcu_read_unlock(); - return false; - } - - cpu_relax(); - } - rcu_read_unlock(); -out: - /* - * If there is a new owner or the owner is not set, we continue - * spinning. - */ - return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); -} - -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - bool taken = false; - - preempt_disable(); - - /* sem->wait_lock should not be held when doing optimistic spinning */ - if (!rwsem_can_spin_on_owner(sem)) - goto done; - - if (!osq_lock(&sem->osq)) - goto done; - - /* - * Optimistically spin on the owner field and attempt to acquire the - * lock whenever the owner changes. Spinning will be stopped when: - * 1) the owning writer isn't running; or - * 2) readers own the lock as we can't determine if they are - * actively running or not. - */ - while (rwsem_spin_on_owner(sem)) { - /* - * Try to acquire the lock - */ - if (rwsem_try_write_lock_unqueued(sem)) { - taken = true; - break; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!sem->owner && (need_resched() || rt_task(current))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - cpu_relax(); - } - osq_unlock(&sem->osq); -done: - preempt_enable(); - return taken; -} - -/* - * Return true if the rwsem has active spinner - */ -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return osq_is_locked(&sem->osq); -} - -#else -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - return false; -} - -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return false; -} -#endif - -/* - * Wait until we successfully acquire the write lock - */ -static inline struct rw_semaphore * -__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) -{ - long count; - bool waiting = true; /* any queued threads before us */ - struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; - DEFINE_WAKE_Q(wake_q); - - /* undo write bias from down_write operation, stop active locking */ - count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); - - /* do optimistic spinning and steal lock if possible */ - if (rwsem_optimistic_spin(sem)) - return sem; - - /* - * Optimistic spinning failed, proceed to the slowpath - * and block until we can acquire the sem. - */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - - raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - if (list_empty(&sem->wait_list)) - waiting = false; - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - if (waiting) { - count = atomic_long_read(&sem->count); - - /* - * If there were already threads queued before us and there are - * no active writers, the lock must be read owned; so we try to - * wake any read locks that were queued ahead of us. - */ - if (count > RWSEM_WAITING_BIAS) { - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); - - /* - * Reinitialize wake_q after use. - */ - wake_q_init(&wake_q); - } - - } else - count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); - - /* wait until we successfully acquire the lock */ - set_current_state(state); - while (true) { - if (rwsem_try_write_lock(count, sem)) - break; - raw_spin_unlock_irq(&sem->wait_lock); - - /* Block until there are no active lockers. */ - do { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - set_current_state(state); - } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); - - raw_spin_lock_irq(&sem->wait_lock); - } - __set_current_state(TASK_RUNNING); - list_del(&waiter.list); - raw_spin_unlock_irq(&sem->wait_lock); - - return ret; - -out_nolock: - __set_current_state(TASK_RUNNING); - raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); - else - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed_killable); - -/* - * handle waking up a waiter on the semaphore - * - up_read/up_write has decremented the active part of count if we come here - */ -__visible -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - /* - * If a spinner is present, it is not necessary to do the wakeup. - * Try to do wakeup only if the trylock succeeds to minimize - * spinlock contention which may introduce too much delay in the - * unlock operation. - * - * spinning writer up_write/up_read caller - * --------------- ----------------------- - * [S] osq_unlock() [L] osq - * MB RMB - * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) - * - * Here, it is important to make sure that there won't be a missed - * wakeup while the rwsem is free and the only spinning writer goes - * to sleep without taking the rwsem. Even when the spinning writer - * is just going to break out of the waiting loop, it will still do - * a trylock in rwsem_down_write_failed() before sleeping. IOW, if - * rwsem_has_spinner() is true, it will guarantee at least one - * trylock attempt on the rwsem later on. - */ - if (rwsem_has_spinner(sem)) { - /* - * The smp_rmb() here is to make sure that the spinner - * state is consulted before reading the wait_lock. - */ - smp_rmb(); - if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) - return sem; - goto locked; - } - raw_spin_lock_irqsave(&sem->wait_lock, flags); -locked: - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_wake); - -/* - * downgrade a write lock into a read lock - * - caller incremented waiting part of count and discovered it still negative - * - just wake up any readers at the front of the queue - */ -__visible -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 4d48b1c4870d..24df4d98f7d2 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1,18 +1,1532 @@ +// SPDX-License-Identifier: GPL-2.0 /* kernel/rwsem.c: R/W semaphores, public implementation * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h + * + * Writer lock-stealing by Alex Shi <alex.shi@intel.com> + * and Michel Lespinasse <walken@google.com> + * + * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> + * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. + * + * Rwsem count bit fields re-definition and rwsem rearchitecture by + * Waiman Long <longman@redhat.com> and + * Peter Zijlstra <peterz@infradead.org>. */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/rt.h> +#include <linux/sched/task.h> #include <linux/sched/debug.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/signal.h> +#include <linux/sched/clock.h> #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <linux/hung_task.h> +#include <trace/events/lock.h> + +#ifndef CONFIG_PREEMPT_RT +#include "lock_events.h" + +/* + * The least significant 2 bits of the owner value has the following + * meanings when set. + * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint) + * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock + * + * When the rwsem is reader-owned and a spinning writer has timed out, + * the nonspinnable bit will be set to disable optimistic spinning. + + * When a writer acquires a rwsem, it puts its task_struct pointer + * into the owner field. It is cleared after an unlock. + * + * When a reader acquires a rwsem, it will also puts its task_struct + * pointer into the owner field with the RWSEM_READER_OWNED bit set. + * On unlock, the owner field will largely be left untouched. So + * for a free or reader-owned rwsem, the owner value may contain + * information about the last reader that acquires the rwsem. + * + * That information may be helpful in debugging cases where the system + * seems to hang on a reader owned rwsem especially if only one reader + * is involved. Ideally we would like to track all the readers that own + * a rwsem, but the overhead is simply too big. + * + * A fast path reader optimistic lock stealing is supported when the rwsem + * is previously owned by a writer and the following conditions are met: + * - rwsem is not currently writer owned + * - the handoff isn't set. + */ +#define RWSEM_READER_OWNED (1UL << 0) +#define RWSEM_NONSPINNABLE (1UL << 1) +#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) + +#ifdef CONFIG_DEBUG_RWSEMS +# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ + if (!debug_locks_silent && \ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + #c, atomic_long_read(&(sem)->count), \ + (unsigned long) sem->magic, \ + atomic_long_read(&(sem)->owner), (long)current, \ + list_empty(&(sem)->wait_list) ? "" : "not ")) \ + debug_locks_off(); \ + } while (0) +#else +# define DEBUG_RWSEMS_WARN_ON(c, sem) +#endif + +/* + * On 64-bit architectures, the bit definitions of the count are: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved + * Bits 8-62 - 55-bit reader count + * Bit 63 - read fail bit + * + * On 32-bit architectures, the bit definitions of the count are: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved + * Bits 8-30 - 23-bit reader count + * Bit 31 - read fail bit + * + * It is not likely that the most significant bit (read fail bit) will ever + * be set. This guard bit is still checked anyway in the down_read() fastpath + * just in case we need to use up more of the reader bits for other purpose + * in the future. + * + * atomic_long_fetch_add() is used to obtain reader lock, whereas + * atomic_long_cmpxchg() will be used to obtain writer lock. + * + * There are three places where the lock handoff bit may be set or cleared. + * 1) rwsem_mark_wake() for readers -- set, clear + * 2) rwsem_try_write_lock() for writers -- set, clear + * 3) rwsem_del_waiter() -- clear + * + * For all the above cases, wait_lock will be held. A writer must also + * be the first one in the wait_list to be eligible for setting the handoff + * bit. So concurrent setting/clearing of handoff bit is not possible. + */ +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define RWSEM_FLAG_WAITERS (1UL << 1) +#define RWSEM_FLAG_HANDOFF (1UL << 2) +#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1)) + +#define RWSEM_READER_SHIFT 8 +#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) +#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) +#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED +#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) +#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ + RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL) + +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + * + * Both rwsem_{set,clear}_owner() functions should be in the same + * preempt disable section as the atomic op that changes sem->count. + */ +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + lockdep_assert_preemption_disabled(); + atomic_long_set(&sem->owner, (long)current); +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + lockdep_assert_preemption_disabled(); + atomic_long_set(&sem->owner, 0); +} + +/* + * Test the flags in the owner field. + */ +static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) +{ + return atomic_long_read(&sem->owner) & flags; +} + +/* + * The task_struct pointer of the last owning reader will be left in + * the owner field. + * + * Note that the owner value just indicates the task has owned the rwsem + * previously, it may not be the real owner or one of the real owners + * anymore when that field is examined, so take it with a grain of salt. + * + * The reader non-spinnable bit is preserved. + */ +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | + (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE); + + atomic_long_set(&sem->owner, val); +} + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + __rwsem_set_reader_owned(sem, current); +} + +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) +/* + * Return just the real task structure pointer of the owner + */ +struct task_struct *rwsem_owner(struct rw_semaphore *sem) +{ + return (struct task_struct *) + (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); +} + +/* + * Return true if the rwsem is owned by a reader. + */ +bool is_rwsem_reader_owned(struct rw_semaphore *sem) +{ + /* + * Check the count to see if it is write-locked. + */ + long count = atomic_long_read(&sem->count); + + if (count & RWSEM_WRITER_MASK) + return false; + return rwsem_test_oflags(sem, RWSEM_READER_OWNED); +} + +/* + * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured, + * it will make sure that the owner field of a reader-owned rwsem either + * points to a real reader-owner(s) or gets cleared. The only exception is + * when the unlock is done by up_read_non_owner(). + */ +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ + unsigned long val = atomic_long_read(&sem->owner); + + while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { + if (atomic_long_try_cmpxchg(&sem->owner, &val, + val & RWSEM_OWNER_FLAGS_MASK)) + return; + } +} +#else +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ +} +#endif + +/* + * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag + * remains set. Otherwise, the operation will be aborted. + */ +static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + do { + if (!(owner & RWSEM_READER_OWNED)) + break; + if (owner & RWSEM_NONSPINNABLE) + break; + } while (!atomic_long_try_cmpxchg(&sem->owner, &owner, + owner | RWSEM_NONSPINNABLE)); +} + +static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) +{ + *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + + if (WARN_ON_ONCE(*cntp < 0)) + rwsem_set_nonspinnable(sem); + + if (!(*cntp & RWSEM_READ_FAILED_MASK)) { + rwsem_set_reader_owned(sem); + return true; + } + + return false; +} + +static inline bool rwsem_write_trylock(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + return true; + } + + return false; +} + +/* + * Return the real task structure pointer of the owner and the embedded + * flags in the owner. pflags must be non-NULL. + */ +static inline struct task_struct * +rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + *pflags = owner & RWSEM_OWNER_FLAGS_MASK; + return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); +} + +/* + * Guide to the rw_semaphore's count field. + * + * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned + * by a writer. + * + * The lock is owned by readers when + * (1) the RWSEM_WRITER_LOCKED isn't set in count, + * (2) some of the reader bits are set in count, and + * (3) the owner field has RWSEM_READ_OWNED bit set. + * + * Having some reader bits set is not enough to guarantee a readers owned + * lock as the readers may be in the process of backing out from the count + * and a writer has just released the lock. So another writer may steal + * the lock immediately after that. + */ + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); +#endif +#ifdef CONFIG_DEBUG_RWSEMS + sem->magic = sem; +#endif + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); + atomic_long_set(&sem->owner, 0L); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + osq_lock_init(&sem->osq); +#endif +} +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; + unsigned long timeout; + bool handoff_set; +}; +#define rwsem_first_waiter(sem) \ + list_first_entry(&sem->wait_list, struct rwsem_waiter, list) + +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; -#include "rwsem.h" +/* + * The typical HZ value is either 250 or 1000. So set the minimum waiting + * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait + * queue before initiating the handoff protocol. + */ +#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) + +/* + * Magic number to batch-wakeup waiting readers, even when writers are + * also present in the queue. This both limits the amount of work the + * waking thread must do and also prevents any potential counter overflow, + * however unlikely. + */ +#define MAX_READERS_WAKEUP 0x100 + +static inline void +rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_add_tail(&waiter->list, &sem->wait_list); + /* caller will set RWSEM_FLAG_WAITERS */ +} + +/* + * Remove a waiter from the wait_list and clear flags. + * + * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of + * this function. Modify with care. + * + * Return: true if wait_list isn't empty and false otherwise + */ +static inline bool +rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_del(&waiter->list); + if (likely(!list_empty(&sem->wait_list))) + return true; + + atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); + return false; +} + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must + * have been set. + * - there must be someone on the queue + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only marked woken if downgrading is false + * + * Implies rwsem_del_waiter() for all woken readers. + */ +static void rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) +{ + struct rwsem_waiter *waiter, *tmp; + long oldcount, woken = 0, adjustment = 0; + struct list_head wlist; + + lockdep_assert_held(&sem->wait_lock); + + /* + * Take a peek at the queue head waiter such that we can determine + * the wakeup(s) to perform. + */ + waiter = rwsem_first_waiter(sem); + + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. + */ + wake_q_add(wake_q, waiter->task); + lockevent_inc(rwsem_wake_writer); + } + + return; + } + + /* + * No reader wakeup if there are too many of them already. + */ + if (unlikely(atomic_long_read(&sem->count) < 0)) + return; + + /* + * Writers might steal the lock before we grant it to the next reader. + * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. + */ + if (wake_type != RWSEM_WAKE_READ_OWNED) { + struct task_struct *owner; + + adjustment = RWSEM_READER_BIAS; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); + if (unlikely(oldcount & RWSEM_WRITER_MASK)) { + /* + * When we've been waiting "too" long (for writers + * to give up the lock), request a HANDOFF to + * force the issue. + */ + if (time_after(jiffies, waiter->timeout)) { + if (!(oldcount & RWSEM_FLAG_HANDOFF)) { + adjustment -= RWSEM_FLAG_HANDOFF; + lockevent_inc(rwsem_rlock_handoff); + } + waiter->handoff_set = true; + } + + atomic_long_add(-adjustment, &sem->count); + return; + } + /* + * Set it to reader-owned to give spinners an early + * indication that readers now have the lock. + * The reader nonspinnable bit seen at slowpath entry of + * the reader is copied over. + */ + owner = waiter->task; + __rwsem_set_reader_owned(sem, owner); + } + + /* + * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the + * queue. We know that the woken will be at least 1 as we accounted + * for above. Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. + * + * This is an adaptation of the phase-fair R/W locks where at the + * reader phase (first waiter is a reader), all readers are eligible + * to acquire the lock at the same time irrespective of their order + * in the queue. The writers acquire the lock according to their + * order in the queue. + * + * We have to do wakeup in 2 passes to prevent the possibility that + * the reader count may be decremented before it is incremented. It + * is because the to-be-woken waiter may not have slept yet. So it + * may see waiter->task got cleared, finish its critical section and + * do an unlock before the reader count increment. + * + * 1) Collect the read-waiters in a separate list, count them and + * fully increment the reader count in rwsem. + * 2) For each waiters in the new list, clear waiter->task and + * put them into wake_q to be woken up later. + */ + INIT_LIST_HEAD(&wlist); + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + if (waiter->type == RWSEM_WAITING_FOR_WRITE) + continue; + + woken++; + list_move_tail(&waiter->list, &wlist); + + /* + * Limit # of readers that can be woken up per wakeup call. + */ + if (unlikely(woken >= MAX_READERS_WAKEUP)) + break; + } + + adjustment = woken * RWSEM_READER_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); + + oldcount = atomic_long_read(&sem->count); + if (list_empty(&sem->wait_list)) { + /* + * Combined with list_move_tail() above, this implies + * rwsem_del_waiter(). + */ + adjustment -= RWSEM_FLAG_WAITERS; + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } else if (woken) { + /* + * When we've woken a reader, we no longer need to force + * writers to give up the lock and we can clear HANDOFF. + */ + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } + + if (adjustment) + atomic_long_add(adjustment, &sem->count); + + /* 2nd pass */ + list_for_each_entry_safe(waiter, tmp, &wlist, list) { + struct task_struct *tsk; + + tsk = waiter->task; + get_task_struct(tsk); + + /* + * Ensure calling get_task_struct() before setting the reader + * waiter to nil such that rwsem_down_read_slowpath() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. + */ + smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add_safe(wake_q, tsk); + } +} + +/* + * Remove a waiter and try to wake up other waiters in the wait queue + * This function is called from the out_nolock path of both the reader and + * writer slowpaths with wait_lock held. It releases the wait_lock and + * optionally wake up waiters before it returns. + */ +static inline void +rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, + struct wake_q_head *wake_q) + __releases(&sem->wait_lock) +{ + bool first = rwsem_first_waiter(sem) == waiter; + + wake_q_init(wake_q); + + /* + * If the wait_list isn't empty and the waiter to be deleted is + * the first waiter, we wake up the remaining waiters as they may + * be eligible to acquire or spin on the lock. + */ + if (rwsem_del_waiter(sem, waiter) && first) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + if (!wake_q_empty(wake_q)) + wake_up_q(wake_q); +} + +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + * + * Implies rwsem_del_waiter() on success. + */ +static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, + struct rwsem_waiter *waiter) +{ + struct rwsem_waiter *first = rwsem_first_waiter(sem); + long count, new; + + lockdep_assert_held(&sem->wait_lock); + + count = atomic_long_read(&sem->count); + do { + bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); + + if (has_handoff) { + /* + * Honor handoff bit and yield only when the first + * waiter is the one that set it. Otherwisee, we + * still try to acquire the rwsem. + */ + if (first->handoff_set && (waiter != first)) + return false; + } + + new = count; + + if (count & RWSEM_LOCK_MASK) { + /* + * A waiter (first or not) can set the handoff bit + * if it is an RT task or wait in the wait queue + * for too long. + */ + if (has_handoff || (!rt_or_dl_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) + return false; + + new |= RWSEM_FLAG_HANDOFF; + } else { + new |= RWSEM_WRITER_LOCKED; + new &= ~RWSEM_FLAG_HANDOFF; + + if (list_is_singular(&sem->wait_list)) + new &= ~RWSEM_FLAG_WAITERS; + } + } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); + + /* + * We have either acquired the lock with handoff bit cleared or set + * the handoff bit. Only the first waiter can have its handoff_set + * set here to enable optimistic spinning in slowpath loop. + */ + if (new & RWSEM_FLAG_HANDOFF) { + first->handoff_set = true; + lockevent_inc(rwsem_wlock_handoff); + return false; + } + + /* + * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on + * success. + */ + list_del(&waiter->list); + rwsem_set_owner(sem); + return true; +} + +/* + * The rwsem_spin_on_owner() function returns the following 4 values + * depending on the lock owner state. + * OWNER_NULL : owner is currently NULL + * OWNER_WRITER: when owner changes and is a writer + * OWNER_READER: when owner changes and the new owner may be a reader. + * OWNER_NONSPINNABLE: + * when optimistic spinning has to stop because either the + * owner stops running, is unknown, or its timeslice has + * been used up. + */ +enum owner_state { + OWNER_NULL = 1 << 0, + OWNER_WRITER = 1 << 1, + OWNER_READER = 1 << 2, + OWNER_NONSPINNABLE = 1 << 3, +}; + +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + + while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, + count | RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + lockevent_inc(rwsem_opt_lock); + return true; + } + } + return false; +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + unsigned long flags; + bool ret = true; + + if (need_resched()) { + lockevent_inc(rwsem_opt_fail); + return false; + } + + /* + * Disable preemption is equal to the RCU read-side crital section, + * thus the task_strcut structure won't go away. + */ + owner = rwsem_owner_flags(sem, &flags); + /* + * Don't check the read-owner as the entry may be stale. + */ + if ((flags & RWSEM_NONSPINNABLE) || + (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) + ret = false; + + lockevent_cond_inc(rwsem_opt_fail, !ret); + return ret; +} + +static inline enum owner_state +rwsem_owner_state(struct task_struct *owner, unsigned long flags) +{ + if (flags & RWSEM_NONSPINNABLE) + return OWNER_NONSPINNABLE; + + if (flags & RWSEM_READER_OWNED) + return OWNER_READER; + + return owner ? OWNER_WRITER : OWNER_NULL; +} + +static noinline enum owner_state +rwsem_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; + + lockdep_assert_preemption_disabled(); + + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags); + if (state != OWNER_WRITER) + return state; + + for (;;) { + /* + * When a waiting writer set the handoff flag, it may spin + * on the owner as well. Once that writer acquires the lock, + * we can spin on it. So we don't need to quit even when the + * handoff bit is set. + */ + new = rwsem_owner_flags(sem, &new_flags); + if ((new != owner) || (new_flags != flags)) { + state = rwsem_owner_state(new, new_flags); + break; + } + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking sem->owner still matches owner, if that fails, + * owner might point to free()d memory, if it still matches, + * our spinning context already disabled preemption which is + * equal to RCU read-side crital section ensures the memory + * stays valid. + */ + barrier(); + + if (need_resched() || !owner_on_cpu(owner)) { + state = OWNER_NONSPINNABLE; + break; + } + + cpu_relax(); + } + + return state; +} + +/* + * Calculate reader-owned rwsem spinning threshold for writer + * + * The more readers own the rwsem, the longer it will take for them to + * wind down and free the rwsem. So the empirical formula used to + * determine the actual spinning time limit here is: + * + * Spinning threshold = (10 + nr_readers/2)us + * + * The limit is capped to a maximum of 25us (30 readers). This is just + * a heuristic and is subjected to change in the future. + */ +static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + int readers = count >> RWSEM_READER_SHIFT; + u64 delta; + + if (readers > 30) + readers = 30; + delta = (20 + readers) * NSEC_PER_USEC / 2; + + return sched_clock() + delta; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + bool taken = false; + int prev_owner_state = OWNER_NULL; + int loop = 0; + u64 rspin_threshold = 0; + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!osq_lock(&sem->osq)) + goto done; + + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock and spinning time has exceeded limit. + */ + for (;;) { + enum owner_state owner_state; + + owner_state = rwsem_spin_on_owner(sem); + if (owner_state == OWNER_NONSPINNABLE) + break; + + /* + * Try to acquire the lock + */ + taken = rwsem_try_write_lock_unqueued(sem); + + if (taken) + break; + + /* + * Time-based reader-owned rwsem optimistic spinning + */ + if (owner_state == OWNER_READER) { + /* + * Re-initialize rspin_threshold every time when + * the owner state changes from non-reader to reader. + * This allows a writer to steal the lock in between + * 2 reader phases and have the threshold reset at + * the beginning of the 2nd reader phase. + */ + if (prev_owner_state != OWNER_READER) { + if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) + break; + rspin_threshold = rwsem_rspin_threshold(sem); + loop = 0; + } + + /* + * Check time threshold once every 16 iterations to + * avoid calling sched_clock() too frequently so + * as to reduce the average latency between the times + * when the lock becomes free and when the spinner + * is ready to do a trylock. + */ + else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) { + rwsem_set_nonspinnable(sem); + lockevent_inc(rwsem_opt_nospin); + break; + } + } + + /* + * An RT task cannot do optimistic spinning if it cannot + * be sure the lock holder is running or live-lock may + * happen if the current task and the lock holder happen + * to run in the same CPU. However, aborting optimistic + * spinning while a NULL owner is detected may miss some + * opportunity where spinning can continue without causing + * problem. + * + * There are 2 possible cases where an RT task may be able + * to continue spinning. + * + * 1) The lock owner is in the process of releasing the + * lock, sem->owner is cleared but the lock has not + * been released yet. + * 2) The lock was free and owner cleared, but another + * task just comes in and acquire the lock before + * we try to get it. The new owner may be a spinnable + * writer. + * + * To take advantage of two scenarios listed above, the RT + * task is made to retry one more time to see if it can + * acquire the lock or continue spinning on the new owning + * writer. Of course, if the time lag is long enough or the + * new owner is not a writer or spinnable, the RT task will + * quit spinning. + * + * If the owner is a writer, the need_resched() check is + * done inside rwsem_spin_on_owner(). If the owner is not + * a writer, need_resched() check needs to be done here. + */ + if (owner_state != OWNER_WRITER) { + if (need_resched()) + break; + if (rt_or_dl_task(current) && + (prev_owner_state != OWNER_WRITER)) + break; + } + prev_owner_state = owner_state; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + osq_unlock(&sem->osq); +done: + lockevent_cond_inc(rwsem_opt_fail, !taken); + return taken; +} + +/* + * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should + * only be called when the reader count reaches 0. + */ +static inline void clear_nonspinnable(struct rw_semaphore *sem) +{ + if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))) + atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner); +} + +#else +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + return false; +} + +static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} + +static inline void clear_nonspinnable(struct rw_semaphore *sem) { } + +static inline enum owner_state +rwsem_spin_on_owner(struct rw_semaphore *sem) +{ + return OWNER_NONSPINNABLE; +} +#endif + +/* + * Prepare to wake up waiter(s) in the wait queue by putting them into the + * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely + * reader-owned, wake up read lock waiters in queue front or wake up any + * front waiter otherwise. + + * This is being called from both reader and writer slow paths. + */ +static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count, + struct wake_q_head *wake_q) +{ + enum rwsem_wake_type wake_type; + + if (count & RWSEM_WRITER_MASK) + return; + + if (count & RWSEM_READER_MASK) { + wake_type = RWSEM_WAKE_READERS; + } else { + wake_type = RWSEM_WAKE_ANY; + clear_nonspinnable(sem); + } + rwsem_mark_wake(sem, wake_type, wake_q); +} + +/* + * Wait for the read lock to be granted + */ +static struct rw_semaphore __sched * +rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state) +{ + long adjustment = -RWSEM_READER_BIAS; + long rcnt = (count >> RWSEM_READER_SHIFT); + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + + /* + * To prevent a constant stream of readers from starving a sleeping + * writer, don't attempt optimistic lock stealing if the lock is + * very likely owned by readers. + */ + if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) && + (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED)) + goto queue; + + /* + * Reader optimistic lock stealing. + */ + if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) { + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_steal); + + /* + * Wake up other readers in the wait queue if it is + * the first reader. + */ + if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) { + raw_spin_lock_irq(&sem->wait_lock); + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, + &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + } + return sem; + } + +queue: + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_READ; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_READER_BIAS has already been set + * in the count. + */ + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { + /* Provide lock ACQUIRE */ + smp_acquire__after_ctrl_dep(); + raw_spin_unlock_irq(&sem->wait_lock); + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_fast); + return sem; + } + adjustment += RWSEM_FLAG_WAITERS; + } + rwsem_add_waiter(sem, &waiter); + + /* we're now waiting on the lock, but no longer actively locking */ + count = atomic_long_add_return(adjustment, &sem->count); + + rwsem_cond_wake_waiter(sem, count, &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); + + trace_contention_begin(sem, LCB_F_READ); + set_current_state(state); + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER); + + /* wait to be given the lock */ + for (;;) { + if (!smp_load_acquire(&waiter.task)) { + /* Matches rwsem_mark_wake()'s smp_store_release(). */ + break; + } + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ + break; + } + schedule_preempt_disabled(); + lockevent_inc(rwsem_sleep_reader); + set_current_state(state); + } + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock); + trace_contention_end(sem, 0); + return sem; + +out_nolock: + rwsem_del_wake_waiter(sem, &waiter, &wake_q); + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); + trace_contention_end(sem, -EINTR); + return ERR_PTR(-EINTR); +} + +/* + * Wait until we successfully acquire the write lock + */ +static struct rw_semaphore __sched * +rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) +{ + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) { + /* rwsem_optimistic_spin() implies ACQUIRE on success */ + return sem; + } + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_WRITE; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; + + raw_spin_lock_irq(&sem->wait_lock); + rwsem_add_waiter(sem, &waiter); + + /* we're now waiting on the lock */ + if (rwsem_first_waiter(sem) != &waiter) { + rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), + &wake_q); + if (!wake_q_empty(&wake_q)) { + /* + * We want to minimize wait_lock hold time especially + * when a large number of readers are to be woken up. + */ + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + raw_spin_lock_irq(&sem->wait_lock); + } + } else { + atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); + } + + /* wait until we successfully acquire the lock */ + set_current_state(state); + trace_contention_begin(sem, LCB_F_WRITE); + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER); + + for (;;) { + if (rwsem_try_write_lock(sem, &waiter)) { + /* rwsem_try_write_lock() implies ACQUIRE on success */ + break; + } + + raw_spin_unlock_irq(&sem->wait_lock); + + if (signal_pending_state(state, current)) + goto out_nolock; + + /* + * After setting the handoff bit and failing to acquire + * the lock, attempt to spin on owner to accelerate lock + * transfer. If the previous owner is a on-cpu writer and it + * has just released the lock, OWNER_NULL will be returned. + * In this case, we attempt to acquire the lock again + * without sleeping. + */ + if (waiter.handoff_set) { + enum owner_state owner_state; + + owner_state = rwsem_spin_on_owner(sem); + if (owner_state == OWNER_NULL) + goto trylock_again; + } + + schedule_preempt_disabled(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); +trylock_again: + raw_spin_lock_irq(&sem->wait_lock); + } + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + + __set_current_state(TASK_RUNNING); + raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); + trace_contention_end(sem, 0); + return sem; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); + rwsem_del_wake_waiter(sem, &waiter, &wake_q); + lockevent_inc(rwsem_wlock_fail); + trace_contention_end(sem, -EINTR); + return ERR_PTR(-EINTR); +} + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} + +/* + * lock for reading + */ +static __always_inline int __down_read_common(struct rw_semaphore *sem, int state) +{ + int ret = 0; + long count; + + preempt_disable(); + if (!rwsem_read_trylock(sem, &count)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) { + ret = -EINTR; + goto out; + } + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } +out: + preempt_enable(); + return ret; +} + +static __always_inline void __down_read(struct rw_semaphore *sem) +{ + __down_read_common(sem, TASK_UNINTERRUPTIBLE); +} + +static __always_inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_INTERRUPTIBLE); +} + +static __always_inline int __down_read_killable(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_KILLABLE); +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + int ret = 0; + long tmp; + + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + + preempt_disable(); + tmp = atomic_long_read(&sem->count); + while (!(tmp & RWSEM_READ_FAILED_MASK)) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + tmp + RWSEM_READER_BIAS)) { + rwsem_set_reader_owned(sem); + ret = 1; + break; + } + } + preempt_enable(); + return ret; +} + +/* + * lock for writing + */ +static __always_inline int __down_write_common(struct rw_semaphore *sem, int state) +{ + int ret = 0; + + preempt_disable(); + if (unlikely(!rwsem_write_trylock(sem))) { + if (IS_ERR(rwsem_down_write_slowpath(sem, state))) + ret = -EINTR; + } + preempt_enable(); + return ret; +} + +static __always_inline void __down_write(struct rw_semaphore *sem) +{ + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +static __always_inline int __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + int ret; + + preempt_disable(); + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + ret = rwsem_write_trylock(sem); + preempt_enable(); + + return ret; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + + preempt_disable(); + rwsem_clear_reader_owned(sem); + tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); + DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); + if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == + RWSEM_FLAG_WAITERS)) { + clear_nonspinnable(sem); + rwsem_wake(sem); + } + preempt_enable(); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + long tmp; + + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + /* + * sem->owner may differ from current if the ownership is transferred + * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. + */ + DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && + !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + + preempt_disable(); + rwsem_clear_owner(sem); + tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); + if (unlikely(tmp & RWSEM_FLAG_WAITERS)) + rwsem_wake(sem); + preempt_enable(); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. + */ + DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); + preempt_disable(); + tmp = atomic_long_fetch_add_release( + -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); + rwsem_set_reader_owned(sem); + if (tmp & RWSEM_FLAG_WAITERS) + rwsem_downgrade_wake(sem); + preempt_enable(); +} + +#else /* !CONFIG_PREEMPT_RT */ + +#define RT_MUTEX_BUILD_MUTEX +#include "rtmutex.c" + +#define rwbase_set_and_save_current_state(state) \ + set_current_state(state) + +#define rwbase_restore_current_state() \ + __set_current_state(TASK_RUNNING) + +#define rwbase_rtmutex_lock_state(rtm, state) \ + __rt_mutex_lock(rtm, state) + +#define rwbase_rtmutex_slowlock_locked(rtm, state, wq) \ + __rt_mutex_slowlock_locked(rtm, NULL, state, wq) + +#define rwbase_rtmutex_unlock(rtm) \ + __rt_mutex_unlock(rtm) + +#define rwbase_rtmutex_trylock(rtm) \ + __rt_mutex_trylock(rtm) + +#define rwbase_signal_pending_state(state, current) \ + signal_pending_state(state, current) + +#define rwbase_pre_schedule() \ + rt_mutex_pre_schedule() + +#define rwbase_schedule() \ + rt_mutex_schedule() + +#define rwbase_post_schedule() \ + rt_mutex_post_schedule() + +#include "rwbase_rt.c" + +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ + init_rwbase_rt(&(sem)->rwbase); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); +#endif +} +EXPORT_SYMBOL(__init_rwsem); + +static inline void __down_read(struct rw_semaphore *sem) +{ + rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); +} + +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE); +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE); +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + return rwbase_read_trylock(&sem->rwbase); +} + +static inline void __up_read(struct rw_semaphore *sem) +{ + rwbase_read_unlock(&sem->rwbase, TASK_NORMAL); +} + +static inline void __sched __down_write(struct rw_semaphore *sem) +{ + rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); +} + +static inline int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE); +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + return rwbase_write_trylock(&sem->rwbase); +} + +static inline void __up_write(struct rw_semaphore *sem) +{ + rwbase_write_unlock(&sem->rwbase); +} + +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + rwbase_write_downgrade(&sem->rwbase); +} + +/* Debug stubs for the common API */ +#define DEBUG_RWSEMS_WARN_ON(c, sem) + +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ +} + +static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +{ + int count = atomic_read(&sem->rwbase.readers); + + return count < 0 && count != READER_BIAS; +} + +#endif /* CONFIG_PREEMPT_RT */ /* * lock for reading @@ -23,11 +1537,37 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } - EXPORT_SYMBOL(down_read); +int __sched down_read_interruptible(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_interruptible); + +int __sched down_read_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_killable); + /* * trylock for reading -- returns 1 if successful, 0 if contention */ @@ -35,13 +1575,10 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_reader_owned(sem); - } return ret; } - EXPORT_SYMBOL(down_read_trylock); /* @@ -51,11 +1588,8 @@ void __sched down_write(struct rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } - EXPORT_SYMBOL(down_write); /* @@ -66,15 +1600,14 @@ int __sched down_write_killable(struct rw_semaphore *sem) might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + __down_write_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); return -EINTR; } - rwsem_set_owner(sem); return 0; } - EXPORT_SYMBOL(down_write_killable); /* @@ -84,14 +1617,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_owner(sem); - } return ret; } - EXPORT_SYMBOL(down_write_trylock); /* @@ -99,11 +1629,9 @@ EXPORT_SYMBOL(down_write_trylock); */ void up_read(struct rw_semaphore *sem) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); - + rwsem_release(&sem->dep_map, _RET_IP_); __up_read(sem); } - EXPORT_SYMBOL(up_read); /* @@ -111,12 +1639,9 @@ EXPORT_SYMBOL(up_read); */ void up_write(struct rw_semaphore *sem) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); - - rwsem_clear_owner(sem); + rwsem_release(&sem->dep_map, _RET_IP_); __up_write(sem); } - EXPORT_SYMBOL(up_write); /* @@ -125,11 +1650,8 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); - - rwsem_set_reader_owned(sem); __downgrade_write(sem); } - EXPORT_SYMBOL(downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -138,42 +1660,52 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } - EXPORT_SYMBOL(down_read_nested); +int down_read_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_killable_nested); + void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } - EXPORT_SYMBOL(_down_write_nest_lock); void down_read_non_owner(struct rw_semaphore *sem) { might_sleep(); - __down_read(sem); + /* + * The owner value for a reader-owned lock is mostly for debugging + * purpose only and is not critical to the correct functioning of + * rwsem. So it is perfectly fine to set it in a preempt-enabled + * context here. + */ + __rwsem_set_reader_owned(sem, NULL); } - EXPORT_SYMBOL(down_read_non_owner); void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } - EXPORT_SYMBOL(down_write_nested); int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) @@ -181,24 +1713,21 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + __down_write_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); return -EINTR; } - rwsem_set_owner(sem); return 0; } - EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); } - EXPORT_SYMBOL(up_read_non_owner); #endif - - diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h deleted file mode 100644 index a699f4048ba1..000000000000 --- a/kernel/locking/rwsem.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * The owner field of the rw_semaphore structure will be set to - * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear - * the owner field when it unlocks. A reader, on the other hand, will - * not touch the owner field when it unlocks. - * - * In essence, the owner field now has the following 3 states: - * 1) 0 - * - lock is free or the owner hasn't set the field yet - * 2) RWSEM_READER_OWNED - * - lock is currently or previously owned by readers (lock is free - * or not set by owner yet) - * 3) Other non-zero value - * - a writer owns the lock - */ -#define RWSEM_READER_OWNED ((struct task_struct *)1UL) - -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -/* - * All writes to owner are protected by WRITE_ONCE() to make sure that - * store tearing can't happen as optimistic spinners may read and use - * the owner value concurrently without lock. Read from owner, however, - * may not need READ_ONCE() as long as the pointer value is only used - * for comparison and isn't being dereferenced. - */ -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, current); -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, NULL); -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ - /* - * We check the owner value first to make sure that we will only - * do a write to the rwsem cacheline when it is really necessary - * to minimize cacheline contention. - */ - if (sem->owner != RWSEM_READER_OWNED) - WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); -} - -static inline bool rwsem_owner_is_writer(struct task_struct *owner) -{ - return owner && owner != RWSEM_READER_OWNED; -} - -static inline bool rwsem_owner_is_reader(struct task_struct *owner) -{ - return owner == RWSEM_READER_OWNED; -} -#else -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ -} -#endif diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 561acdd39960..3ef032e22f7e 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008 Intel Corporation * Author: Matthew Wilcox <willy@linux.intel.com> * - * Distributed under the terms of the GNU GPL, version 2 - * * This file implements counting semaphores. * A counting semaphore may be acquired 'n' times before sleeping. * See mutex.c for single-acquisition sleeping locks which enforce @@ -30,15 +29,53 @@ #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> +#include <linux/sched/wake_q.h> #include <linux/semaphore.h> #include <linux/spinlock.h> #include <linux/ftrace.h> +#include <trace/events/lock.h> +#include <linux/hung_task.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); -static noinline void __up(struct semaphore *sem); +static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ + WRITE_ONCE((sem)->last_holder, (unsigned long)current); +} + +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ + if (READ_ONCE((sem)->last_holder) == (unsigned long)current) + WRITE_ONCE((sem)->last_holder, 0UL); +} + +unsigned long sem_last_holder(struct semaphore *sem) +{ + return READ_ONCE(sem->last_holder); +} +#else +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ +} +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ +} +unsigned long sem_last_holder(struct semaphore *sem) +{ + return 0UL; +} +#endif + +static inline void __sem_acquire(struct semaphore *sem) +{ + sem->count--; + hung_task_sem_set_holder(sem); +} /** * down - acquire the semaphore @@ -51,13 +88,14 @@ static noinline void __up(struct semaphore *sem); * Use of this function is deprecated, please use down_interruptible() or * down_killable() instead. */ -void down(struct semaphore *sem) +void __sched down(struct semaphore *sem) { unsigned long flags; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else __down(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -73,14 +111,15 @@ EXPORT_SYMBOL(down); * If the sleep is interrupted by a signal, this function will return -EINTR. * If the semaphore is successfully acquired, this function returns 0. */ -int down_interruptible(struct semaphore *sem) +int __sched down_interruptible(struct semaphore *sem) { unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_interruptible(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -99,14 +138,15 @@ EXPORT_SYMBOL(down_interruptible); * -EINTR. If the semaphore is successfully acquired, this function returns * 0. */ -int down_killable(struct semaphore *sem) +int __sched down_killable(struct semaphore *sem) { unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_killable(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -120,7 +160,7 @@ EXPORT_SYMBOL(down_killable); * @sem: the semaphore to be acquired * * Try to acquire the semaphore atomically. Returns 0 if the semaphore has - * been acquired successfully or 1 if it it cannot be acquired. + * been acquired successfully or 1 if it cannot be acquired. * * NOTE: This return value is inverted from both spin_trylock and * mutex_trylock! Be careful about this when converting code. @@ -128,7 +168,7 @@ EXPORT_SYMBOL(down_killable); * Unlike mutex_trylock, this function can be used from interrupt context, * and the semaphore can be released by any task or interrupt. */ -int down_trylock(struct semaphore *sem) +int __sched down_trylock(struct semaphore *sem) { unsigned long flags; int count; @@ -136,7 +176,7 @@ int down_trylock(struct semaphore *sem) raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) - sem->count = count; + __sem_acquire(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); @@ -153,14 +193,15 @@ EXPORT_SYMBOL(down_trylock); * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. */ -int down_timeout(struct semaphore *sem, long timeout) +int __sched down_timeout(struct semaphore *sem, long timeout) { unsigned long flags; int result = 0; + might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -176,16 +217,22 @@ EXPORT_SYMBOL(down_timeout); * Release the semaphore. Unlike mutexes, up() may be called from any * context and even by tasks which have never called down(). */ -void up(struct semaphore *sem) +void __sched up(struct semaphore *sem) { unsigned long flags; + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); + + hung_task_sem_clear_if_holder(sem); + if (likely(list_empty(&sem->wait_list))) sem->count++; else - __up(sem); + __up(sem, &wake_q); raw_spin_unlock_irqrestore(&sem->lock, flags); + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); } EXPORT_SYMBOL(up); @@ -202,7 +249,7 @@ struct semaphore_waiter { * constant, and thus optimised away by the compiler. Likewise the * 'timeout' parameter for the cases without timeouts. */ -static inline int __sched __down_common(struct semaphore *sem, long state, +static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { struct semaphore_waiter waiter; @@ -220,8 +267,10 @@ static inline int __sched __down_common(struct semaphore *sem, long state, raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); - if (waiter.up) + if (waiter.up) { + hung_task_sem_set_holder(sem); return 0; + } } timed_out: @@ -233,6 +282,22 @@ static inline int __sched __down_common(struct semaphore *sem, long state, return -EINTR; } +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + int ret; + + hung_task_set_blocker(sem, BLOCKER_TYPE_SEM); + + trace_contention_begin(sem, 0); + ret = ___down_common(sem, state, timeout); + trace_contention_end(sem, ret); + + hung_task_clear_blocker(); + + return ret; +} + static noinline void __sched __down(struct semaphore *sem) { __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); @@ -253,11 +318,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); } -static noinline void __sched __up(struct semaphore *sem) +static noinline void __sched __up(struct semaphore *sem, + struct wake_q_head *wake_q) { struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); list_del(&waiter->list); waiter->up = true; - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); } diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..7685defd7c52 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (2004) Linus Torvalds * @@ -21,6 +22,13 @@ #include <linux/debug_locks.h> #include <linux/export.h> +#ifdef CONFIG_MMIOWB +#ifndef arch_mmiowb_state +DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state); +EXPORT_PER_CPU_SYMBOL(__mmiowb_state); +#endif +#endif + /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are @@ -29,11 +37,10 @@ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) /* * The __lock_function inlines are taken from - * include/linux/spinlock_api_smp.h + * spinlock : include/linux/spinlock_api_smp.h + * rwlock : include/linux/rwlock_api_smp.h */ #else -#define raw_read_can_lock(l) read_can_lock(l) -#define raw_write_can_lock(l) write_can_lock(l) /* * Some architectures can relax in favour of the CPU owning the lock. @@ -51,14 +58,14 @@ /* * We build the __lock_function inlines here. They are too large for * inlining all over the place, but here is only one user per function - * which embedds them into the calling _lock_function below. + * which embeds them into the calling _lock_function below. * * This could be a long-held lock. We both prepare to spin for a long - * time (making _this_ CPU preemptable if possible), and we also signal + * time (making _this_ CPU preemptible if possible), and we also signal * towards that other CPU that it should break the lock ASAP. */ #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ +static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -66,15 +73,11 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ break; \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ - arch_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ } \ \ -unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ +static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -86,21 +89,18 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ local_irq_restore(flags); \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ - arch_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ + \ return flags; \ } \ \ -void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ +static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ { \ _raw_##op##_lock_irqsave(lock); \ } \ \ -void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ +static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -124,13 +124,16 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); + +#ifndef CONFIG_PREEMPT_RT BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); +#endif #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { return __raw_spin_trylock(lock); } @@ -138,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { return __raw_spin_trylock_bh(lock); } @@ -146,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh); #endif #ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { __raw_spin_lock(lock); } @@ -154,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) { return __raw_spin_lock_irqsave(lock); } @@ -162,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { __raw_spin_lock_irq(lock); } @@ -170,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { __raw_spin_lock_bh(lock); } @@ -178,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh); #endif #ifdef CONFIG_UNINLINE_SPIN_UNLOCK -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { __raw_spin_unlock(lock); } @@ -186,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { __raw_spin_unlock_irqrestore(lock, flags); } @@ -194,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { __raw_spin_unlock_irq(lock); } @@ -202,15 +205,17 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { __raw_spin_unlock_bh(lock); } EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif +#ifndef CONFIG_PREEMPT_RT + #ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _raw_read_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_read_trylock(rwlock_t *lock) { return __raw_read_trylock(lock); } @@ -218,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock); #endif #ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _raw_read_lock(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock(rwlock_t *lock) { __raw_read_lock(lock); } @@ -226,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) { return __raw_read_lock_irqsave(lock); } @@ -234,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock) { __raw_read_lock_irq(lock); } @@ -242,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq); #endif #ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock) { __raw_read_lock_bh(lock); } @@ -250,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh); #endif #ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _raw_read_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock(rwlock_t *lock) { __raw_read_unlock(lock); } @@ -258,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_read_unlock_irqrestore(lock, flags); } @@ -266,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) { __raw_read_unlock_irq(lock); } @@ -274,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) { __raw_read_unlock_bh(lock); } @@ -282,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh); #endif #ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _raw_write_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_write_trylock(rwlock_t *lock) { return __raw_write_trylock(lock); } @@ -290,15 +295,25 @@ EXPORT_SYMBOL(_raw_write_trylock); #endif #ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _raw_write_lock(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock(rwlock_t *lock) { __raw_write_lock(lock); } EXPORT_SYMBOL(_raw_write_lock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock))) +#endif + +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + __raw_write_lock_nested(lock, subclass); +} +EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) { return __raw_write_lock_irqsave(lock); } @@ -306,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock) { __raw_write_lock_irq(lock); } @@ -314,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock) { __raw_write_lock_bh(lock); } @@ -322,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _raw_write_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock(rwlock_t *lock) { __raw_write_unlock(lock); } @@ -330,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_write_unlock_irqrestore(lock, flags); } @@ -338,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) { __raw_write_unlock_irq(lock); } @@ -346,13 +361,15 @@ EXPORT_SYMBOL(_raw_write_unlock_irq); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) { __raw_write_unlock_bh(lock); } EXPORT_SYMBOL(_raw_write_unlock_bh); #endif +#endif /* !CONFIG_PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) @@ -371,8 +388,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, - do_raw_spin_lock_flags, &flags); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); return flags; } EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); @@ -397,3 +413,11 @@ notrace int in_lock_functions(unsigned long addr) && addr < (unsigned long)__lock_text_end; } EXPORT_SYMBOL(in_lock_functions); + +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_PREEMPT_RT) +void notrace lockdep_assert_in_softirq_func(void) +{ + lockdep_assert_in_softirq(); +} +EXPORT_SYMBOL(lockdep_assert_in_softirq_func); +#endif diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..2338b3adfb55 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -12,16 +12,17 @@ #include <linux/debug_locks.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/pid.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, short inner) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, inner); #endif lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; lock->magic = SPINLOCK_MAGIC; @@ -31,6 +32,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, EXPORT_SYMBOL(__raw_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -39,7 +41,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG); #endif lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; lock->magic = RWLOCK_MAGIC; @@ -48,22 +50,23 @@ void __rwlock_init(rwlock_t *lock, const char *name, } EXPORT_SYMBOL(__rwlock_init); +#endif static void spin_dump(raw_spinlock_t *lock, const char *msg) { - struct task_struct *owner = NULL; + struct task_struct *owner = READ_ONCE(lock->owner); - if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) - owner = lock->owner; + if (owner == SPINLOCK_OWNER_INIT) + owner = NULL; printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", msg, raw_smp_processor_id(), current->comm, task_pid_nr(current)); printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " ".owner_cpu: %d\n", - lock, lock->magic, + lock, READ_ONCE(lock->magic), owner ? owner->comm : "<none>", owner ? task_pid_nr(owner) : -1, - lock->owner_cpu); + READ_ONCE(lock->owner_cpu)); dump_stack(); } @@ -80,16 +83,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg) static inline void debug_spin_lock_before(raw_spinlock_t *lock) { - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(lock->owner == current, lock, "recursion"); - SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } static inline void debug_spin_lock_after(raw_spinlock_t *lock) { - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; + WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id()); + WRITE_ONCE(lock->owner, current); } static inline void debug_spin_unlock(raw_spinlock_t *lock) @@ -99,8 +102,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock) SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; + WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT); + WRITE_ONCE(lock->owner_cpu, -1); } /* @@ -111,6 +114,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); arch_spin_lock(&lock->raw_lock); + mmiowb_spin_lock(); debug_spin_lock_after(lock); } @@ -118,8 +122,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&lock->raw_lock); - if (ret) + if (ret) { + mmiowb_spin_lock(); debug_spin_lock_after(lock); + } #ifndef CONFIG_SMP /* * Must not happen on UP: @@ -131,10 +137,12 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) void do_raw_spin_unlock(raw_spinlock_t *lock) { + mmiowb_spin_unlock(); debug_spin_unlock(lock); arch_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT static void rwlock_bug(rwlock_t *lock, const char *msg) { if (!debug_locks_off()) @@ -176,15 +184,15 @@ void do_raw_read_unlock(rwlock_t *lock) static inline void debug_write_lock_before(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } static inline void debug_write_lock_after(rwlock_t *lock) { - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; + WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id()); + WRITE_ONCE(lock->owner, current); } static inline void debug_write_unlock(rwlock_t *lock) @@ -193,8 +201,8 @@ static inline void debug_write_unlock(rwlock_t *lock) RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; + WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT); + WRITE_ONCE(lock->owner_cpu, -1); } void do_raw_write_lock(rwlock_t *lock) @@ -224,3 +232,5 @@ void do_raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); arch_write_unlock(&lock->raw_lock); } + +#endif /* !CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c new file mode 100644 index 000000000000..db1e11b45de6 --- /dev/null +++ b/kernel/locking/spinlock_rt.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * PREEMPT_RT substitution for spin/rw_locks + * + * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to + * resemble the non RT semantics: + * + * - Contrary to plain rtmutexes, spinlocks and rwlocks are state + * preserving. The task state is saved before blocking on the underlying + * rtmutex, and restored when the lock has been acquired. Regular wakeups + * during that time are redirected to the saved state so no wake up is + * missed. + * + * - Non RT spin/rwlocks disable preemption and eventually interrupts. + * Disabling preemption has the side effect of disabling migration and + * preventing RCU grace periods. + * + * The RT substitutions explicitly disable migration and take + * rcu_read_lock() across the lock held section. + */ +#include <linux/spinlock.h> +#include <linux/export.h> + +#define RT_MUTEX_BUILD_SPINLOCKS +#include "rtmutex.c" + +/* + * __might_resched() skips the state check as rtlocks are state + * preserving. Take RCU nesting into account as spin/read/write_lock() can + * legitimately nest into an RCU read side critical section. + */ +#define RTLOCK_RESCHED_OFFSETS \ + (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT) + +#define rtlock_might_resched() \ + __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS) + +static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) +{ + lockdep_assert(!current->pi_blocked_on); + + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + rtlock_slowlock(rtm); +} + +static __always_inline void __rt_spin_lock(spinlock_t *lock) +{ + rtlock_might_resched(); + rtlock_lock(&lock->lock); + rcu_read_lock(); + migrate_disable(); +} + +void __sched rt_spin_lock(spinlock_t *lock) __acquires(RCU) +{ + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +void __sched rt_spin_lock_nest_lock(spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + __rt_spin_lock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_nest_lock); +#endif + +void __sched rt_spin_unlock(spinlock_t *lock) __releases(RCU) +{ + spin_release(&lock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + + if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL))) + rt_mutex_slowunlock(&lock->lock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __sched rt_spin_lock_unlock(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_unlock); + +static __always_inline int __rt_spin_trylock(spinlock_t *lock) +{ + int ret = 1; + + if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current))) + ret = rt_mutex_slowtrylock(&lock->lock); + + if (ret) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} + +int __sched rt_spin_trylock(spinlock_t *lock) +{ + return __rt_spin_trylock(lock); +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __sched rt_spin_trylock_bh(spinlock_t *lock) +{ + int ret; + + local_bh_disable(); + ret = __rt_spin_trylock(lock); + if (!ret) + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_bh); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rt_spin_lock_init(spinlock_t *lock, const char *name, + struct lock_class_key *key, bool percpu) +{ + u8 type = percpu ? LD_LOCK_PERCPU : LD_LOCK_NORMAL; + + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG, + LD_WAIT_INV, type); +} +EXPORT_SYMBOL(__rt_spin_lock_init); +#endif + +/* + * RT-specific reader/writer locks + */ +#define rwbase_set_and_save_current_state(state) \ + current_save_and_set_rtlock_wait_state() + +#define rwbase_restore_current_state() \ + current_restore_rtlock_saved_state() + +static __always_inline int +rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state) +{ + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + rtlock_slowlock(rtm); + return 0; +} + +static __always_inline int +rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state, + struct wake_q_head *wake_q) +{ + rtlock_slowlock_locked(rtm, wake_q); + return 0; +} + +static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm) +{ + if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL))) + return; + + rt_mutex_slowunlock(rtm); +} + +static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm) +{ + if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current))) + return 1; + + return rt_mutex_slowtrylock(rtm); +} + +#define rwbase_signal_pending_state(state, current) (0) + +#define rwbase_pre_schedule() + +#define rwbase_schedule() \ + schedule_rtlock() + +#define rwbase_post_schedule() + +#include "rwbase_rt.c" +/* + * The common functions which get wrapped into the rwlock API. + */ +int __sched rt_read_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = rwbase_read_trylock(&rwlock->rwbase); + if (ret) { + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +int __sched rt_write_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = rwbase_write_trylock(&rwlock->rwbase); + if (ret) { + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +void __sched rt_read_lock(rwlock_t *rwlock) __acquires(RCU) +{ + rtlock_might_resched(); + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_read_lock); + +void __sched rt_write_lock(rwlock_t *rwlock) __acquires(RCU) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) __acquires(RCU) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock_nested); +#endif + +void __sched rt_read_unlock(rwlock_t *rwlock) __releases(RCU) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT); +} +EXPORT_SYMBOL(rt_read_unlock); + +void __sched rt_write_unlock(rwlock_t *rwlock) __releases(RCU) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + rcu_read_unlock(); + migrate_enable(); + rwbase_write_unlock(&rwlock->rwbase); +} +EXPORT_SYMBOL(rt_write_unlock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __rt_rwlock_init(rwlock_t *rwlock, const char *name, + struct lock_class_key *key) +{ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG); +} +EXPORT_SYMBOL(__rt_rwlock_init); +#endif diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 39f56c870051..bcb1b9fea588 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Module-based API test facility for ww_mutexes - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. */ #include <linux/kernel.h> @@ -22,13 +9,22 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/module.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/slab.h> #include <linux/ww_mutex.h> -static DEFINE_WW_CLASS(ww_class); +static DEFINE_WD_CLASS(ww_class); struct workqueue_struct *wq; +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH +#define ww_acquire_init_noinject(a, b) do { \ + ww_acquire_init((a), (b)); \ + (a)->deadlock_inject_countdown = ~0U; \ + } while (0) +#else +#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b)) +#endif + struct test_mutex { struct work_struct work; struct ww_mutex mutex; @@ -49,7 +45,7 @@ static void test_mutex_work(struct work_struct *work) wait_for_completion(&mtx->go); if (mtx->flags & TEST_MTX_TRY) { - while (!ww_mutex_trylock(&mtx->mutex)) + while (!ww_mutex_trylock(&mtx->mutex, NULL)) cond_resched(); } else { ww_mutex_lock(&mtx->mutex, NULL); @@ -66,7 +62,8 @@ static int __test_mutex(unsigned int flags) int ret; ww_mutex_init(&mtx.mutex, &ww_class); - ww_acquire_init(&ctx, &ww_class); + if (flags & TEST_MTX_CTX) + ww_acquire_init(&ctx, &ww_class); INIT_WORK_ONSTACK(&mtx.work, test_mutex_work); init_completion(&mtx.ready); @@ -94,7 +91,8 @@ static int __test_mutex(unsigned int flags) ret = wait_for_completion_timeout(&mtx.done, TIMEOUT); } ww_mutex_unlock(&mtx.mutex); - ww_acquire_fini(&ctx); + if (flags & TEST_MTX_CTX) + ww_acquire_fini(&ctx); if (ret) { pr_err("%s(flags=%x): mutual exclusion failure\n", @@ -122,19 +120,39 @@ static int test_mutex(void) return 0; } -static int test_aa(void) +static int test_aa(bool trylock) { struct ww_mutex mutex; struct ww_acquire_ctx ctx; int ret; + const char *from = trylock ? "trylock" : "lock"; ww_mutex_init(&mutex, &ww_class); ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&mutex, &ctx); + if (!trylock) { + ret = ww_mutex_lock(&mutex, &ctx); + if (ret) { + pr_err("%s: initial lock failed!\n", __func__); + goto out; + } + } else { + ret = !ww_mutex_trylock(&mutex, &ctx); + if (ret) { + pr_err("%s: initial trylock failed!\n", __func__); + goto out; + } + } - if (ww_mutex_trylock(&mutex)) { - pr_err("%s: trylocked itself!\n", __func__); + if (ww_mutex_trylock(&mutex, NULL)) { + pr_err("%s: trylocked itself without context from %s!\n", __func__, from); + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + if (ww_mutex_trylock(&mutex, &ctx)) { + pr_err("%s: trylocked itself with context from %s!\n", __func__, from); ww_mutex_unlock(&mutex); ret = -EINVAL; goto out; @@ -142,17 +160,17 @@ static int test_aa(void) ret = ww_mutex_lock(&mutex, &ctx); if (ret != -EALREADY) { - pr_err("%s: missed deadlock for recursing, ret=%d\n", - __func__, ret); + pr_err("%s: missed deadlock for recursing, ret=%d from %s\n", + __func__, ret, from); if (!ret) ww_mutex_unlock(&mutex); ret = -EINVAL; goto out; } + ww_mutex_unlock(&mutex); ret = 0; out: - ww_mutex_unlock(&mutex); ww_acquire_fini(&ctx); return ret; } @@ -163,7 +181,7 @@ struct test_abba { struct ww_mutex b_mutex; struct completion a_ready; struct completion b_ready; - bool resolve; + bool resolve, trylock; int result; }; @@ -173,8 +191,13 @@ static void test_abba_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err; - ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&abba->b_mutex, &ctx); + ww_acquire_init_noinject(&ctx, &ww_class); + if (!abba->trylock) + ww_mutex_lock(&abba->b_mutex, &ctx); + else + WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx)); + + WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx); complete(&abba->b_ready); wait_for_completion(&abba->a_ready); @@ -194,7 +217,7 @@ static void test_abba_work(struct work_struct *work) abba->result = err; } -static int test_abba(bool resolve) +static int test_abba(bool trylock, bool resolve) { struct test_abba abba; struct ww_acquire_ctx ctx; @@ -205,12 +228,18 @@ static int test_abba(bool resolve) INIT_WORK_ONSTACK(&abba.work, test_abba_work); init_completion(&abba.a_ready); init_completion(&abba.b_ready); + abba.trylock = trylock; abba.resolve = resolve; schedule_work(&abba.work); - ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&abba.a_mutex, &ctx); + ww_acquire_init_noinject(&ctx, &ww_class); + if (!trylock) + ww_mutex_lock(&abba.a_mutex, &ctx); + else + WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx)); + + WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx); complete(&abba.a_ready); wait_for_completion(&abba.b_ready); @@ -260,9 +289,9 @@ static void test_cycle_work(struct work_struct *work) { struct test_cycle *cycle = container_of(work, typeof(*cycle), work); struct ww_acquire_ctx ctx; - int err; + int err, erra = 0; - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, &ww_class); ww_mutex_lock(&cycle->a_mutex, &ctx); complete(cycle->a_signal); @@ -270,17 +299,19 @@ static void test_cycle_work(struct work_struct *work) err = ww_mutex_lock(cycle->b_mutex, &ctx); if (err == -EDEADLK) { + err = 0; ww_mutex_unlock(&cycle->a_mutex); ww_mutex_lock_slow(cycle->b_mutex, &ctx); - err = ww_mutex_lock(&cycle->a_mutex, &ctx); + erra = ww_mutex_lock(&cycle->a_mutex, &ctx); } if (!err) ww_mutex_unlock(cycle->b_mutex); - ww_mutex_unlock(&cycle->a_mutex); + if (!erra) + ww_mutex_unlock(&cycle->a_mutex); ww_acquire_fini(&ctx); - cycle->result = err; + cycle->result = err ?: erra; } static int __test_cycle(unsigned int nthreads) @@ -324,7 +355,7 @@ static int __test_cycle(unsigned int nthreads) if (!cycle->result) continue; - pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n", + pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n", n, nthreads, cycle->result); ret = -EINVAL; break; @@ -357,12 +388,25 @@ struct stress { int nlocks; }; +struct rnd_state rng; +DEFINE_SPINLOCK(rng_lock); + +static inline u32 prandom_u32_below(u32 ceil) +{ + u32 ret; + + spin_lock(&rng_lock); + ret = prandom_u32_state(&rng) % ceil; + spin_unlock(&rng_lock); + return ret; +} + static int *get_random_order(int count) { int *order; - int n, r, tmp; + int n, r; - order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); if (!order) return order; @@ -370,12 +414,9 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = get_random_int() % (n + 1); - if (r != n) { - tmp = order[n]; - order[n] = order[r]; - order[r] = tmp; - } + r = prandom_u32_below(n + 1); + if (r != n) + swap(order[n], order[r]); } return order; @@ -423,21 +464,21 @@ retry: ww_mutex_unlock(&locks[order[n]]); if (err == -EDEADLK) { - ww_mutex_lock_slow(&locks[order[contended]], &ctx); - goto retry; + if (!time_after(jiffies, stress->timeout)) { + ww_mutex_lock_slow(&locks[order[contended]], &ctx); + goto retry; + } } + ww_acquire_fini(&ctx); if (err) { pr_err_once("stress (%s) failed with %d\n", __func__, err); break; } - - ww_acquire_fini(&ctx); } while (!time_after(jiffies, stress->timeout)); kfree(order); - kfree(stress); } struct reorder_lock { @@ -502,14 +543,13 @@ out: list_for_each_entry_safe(ll, ln, &locks, link) kfree(ll); kfree(order); - kfree(stress); } static void stress_one_work(struct work_struct *work) { struct stress *stress = container_of(work, typeof(*stress), work); const int nlocks = stress->nlocks; - struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks); int err; do { @@ -523,8 +563,6 @@ static void stress_one_work(struct work_struct *work) break; } } while (!time_after(jiffies, stress->timeout)); - - kfree(stress); } #define STRESS_INORDER BIT(0) @@ -535,15 +573,24 @@ static void stress_one_work(struct work_struct *work) static int stress(int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; - int n; + struct stress *stress_array; + int n, count; locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); if (!locks) return -ENOMEM; + stress_array = kmalloc_array(nthreads, sizeof(*stress_array), + GFP_KERNEL); + if (!stress_array) { + kfree(locks); + return -ENOMEM; + } + for (n = 0; n < nlocks; n++) ww_mutex_init(&locks[n], &ww_class); + count = 0; for (n = 0; nthreads; n++) { struct stress *stress; void (*fn)(struct work_struct *work); @@ -567,9 +614,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) if (!fn) continue; - stress = kmalloc(sizeof(*stress), GFP_KERNEL); - if (!stress) - break; + stress = &stress_array[count++]; INIT_WORK(&stress->work, fn); stress->locks = locks; @@ -584,6 +629,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) for (n = 0; n < nlocks; n++) ww_mutex_destroy(&locks[n]); + kfree(stress_array); kfree(locks); return 0; @@ -592,7 +638,11 @@ static int stress(int nlocks, int nthreads, unsigned int flags) static int __init test_ww_mutex_init(void) { int ncpus = num_online_cpus(); - int ret; + int ret, i; + + printk(KERN_INFO "Beginning ww mutex selftests\n"); + + prandom_seed_state(&rng, get_random_u64()); wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); if (!wq) @@ -602,17 +652,19 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = test_aa(); + ret = test_aa(false); if (ret) return ret; - ret = test_abba(false); + ret = test_aa(true); if (ret) return ret; - ret = test_abba(true); - if (ret) - return ret; + for (i = 0; i < 4; i++) { + ret = test_abba(i & 1, i & 2); + if (ret) + return ret; + } ret = test_cycle(ncpus); if (ret) @@ -626,10 +678,11 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; + printk(KERN_INFO "All ww mutex selftests passed\n"); return 0; } @@ -643,3 +696,4 @@ module_exit(test_ww_mutex_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("API test facility for ww_mutexes"); diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h new file mode 100644 index 000000000000..31a785afee6c --- /dev/null +++ b/kernel/locking/ww_mutex.h @@ -0,0 +1,594 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef WW_RT + +#define MUTEX mutex +#define MUTEX_WAITER mutex_waiter + +static inline struct mutex_waiter * +__ww_waiter_first(struct mutex *lock) +{ + struct mutex_waiter *w; + + w = list_first_entry(&lock->wait_list, struct mutex_waiter, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) +{ + w = list_next_entry(w, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) +{ + w = list_prev_entry(w, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline struct mutex_waiter * +__ww_waiter_last(struct mutex *lock) +{ + struct mutex_waiter *w; + + w = list_last_entry(&lock->wait_list, struct mutex_waiter, list); + if (list_entry_is_head(w, &lock->wait_list, list)) + return NULL; + + return w; +} + +static inline void +__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) +{ + struct list_head *p = &lock->wait_list; + if (pos) + p = &pos->list; + __mutex_add_waiter(lock, waiter, p); +} + +static inline struct task_struct * +__ww_mutex_owner(struct mutex *lock) +{ + return __mutex_owner(lock); +} + +static inline bool +__ww_mutex_has_waiters(struct mutex *lock) +{ + return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS; +} + +static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags) +{ + raw_spin_lock_irqsave(&lock->wait_lock, *flags); +} + +static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags) +{ + raw_spin_unlock_irqrestore(&lock->wait_lock, *flags); +} + +static inline void lockdep_assert_wait_lock_held(struct mutex *lock) +{ + lockdep_assert_held(&lock->wait_lock); +} + +#else /* WW_RT */ + +#define MUTEX rt_mutex +#define MUTEX_WAITER rt_mutex_waiter + +static inline struct rt_mutex_waiter * +__ww_waiter_first(struct rt_mutex *lock) +{ + struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree.entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w) +{ + struct rb_node *n = rb_next(&w->tree.entry); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree.entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) +{ + struct rb_node *n = rb_prev(&w->tree.entry); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree.entry); +} + +static inline struct rt_mutex_waiter * +__ww_waiter_last(struct rt_mutex *lock) +{ + struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); + if (!n) + return NULL; + return rb_entry(n, struct rt_mutex_waiter, tree.entry); +} + +static inline void +__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos) +{ + /* RT unconditionally adds the waiter first and then removes it on error */ +} + +static inline struct task_struct * +__ww_mutex_owner(struct rt_mutex *lock) +{ + return rt_mutex_owner(&lock->rtmutex); +} + +static inline bool +__ww_mutex_has_waiters(struct rt_mutex *lock) +{ + return rt_mutex_has_waiters(&lock->rtmutex); +} + +static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags) +{ + raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags); +} + +static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags) +{ + raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags); +} + +static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) +{ + lockdep_assert_held(&lock->rtmutex.wait_lock); +} + +#endif /* WW_RT */ + +/* + * Wait-Die: + * The newer transactions are killed when: + * It (the new transaction) makes a request for a lock being held + * by an older transaction. + * + * Wound-Wait: + * The newer transactions are wounded when: + * An older transaction makes a request for a lock being held by + * the newer transaction. + */ + +/* + * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired + * it. + */ +static __always_inline void +ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) +{ +#ifdef DEBUG_WW_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; + ww->ctx = ww_ctx; +} + +/* + * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task + * or, when of equal priority, a younger transaction than @b. + * + * Depending on the algorithm, @a will either need to wait for @b, or die. + */ +static inline bool +__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) +{ +/* + * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI, + * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and + * isn't affected by this. + */ +#ifdef WW_RT + /* kernel prio; less is more */ + int a_prio = a->task->prio; + int b_prio = b->task->prio; + + if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) { + + if (a_prio > b_prio) + return true; + + if (a_prio < b_prio) + return false; + + /* equal static prio */ + + if (dl_prio(a_prio)) { + if (dl_time_before(b->task->dl.deadline, + a->task->dl.deadline)) + return true; + + if (dl_time_before(a->task->dl.deadline, + b->task->dl.deadline)) + return false; + } + + /* equal prio */ + } +#endif + + /* FIFO order tie break -- bigger is younger */ + return (signed long)(a->stamp - b->stamp) > 0; +} + +/* + * Wait-Die; wake a lesser waiter context (when locks held) such that it can + * die. + * + * Among waiters with context, only the first one can have other locks acquired + * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and + * __ww_mutex_check_kill() wake any but the earliest context. + */ +static bool +__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, + struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) +{ + if (!ww_ctx->is_wait_die) + return false; + + if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) { +#ifndef WW_RT + debug_mutex_wake_waiter(lock, waiter); +#endif + /* + * When waking up the task to die, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. + */ + __clear_task_blocked_on(waiter->task, lock); + wake_q_add(wake_q, waiter->task); + } + + return true; +} + +/* + * Wound-Wait; wound a lesser @hold_ctx if it holds the lock. + * + * Wound the lock holder if there are waiters with more important transactions + * than the lock holders. Even if multiple waiters may wound the lock holder, + * it's sufficient that only one does. + */ +static bool __ww_mutex_wound(struct MUTEX *lock, + struct ww_acquire_ctx *ww_ctx, + struct ww_acquire_ctx *hold_ctx, + struct wake_q_head *wake_q) +{ + struct task_struct *owner = __ww_mutex_owner(lock); + + lockdep_assert_wait_lock_held(lock); + + /* + * Possible through __ww_mutex_add_waiter() when we race with + * ww_mutex_set_context_fastpath(). In that case we'll get here again + * through __ww_mutex_check_waiters(). + */ + if (!hold_ctx) + return false; + + /* + * Can have !owner because of __mutex_unlock_slowpath(), but if owner, + * it cannot go away because we'll have FLAG_WAITERS set and hold + * wait_lock. + */ + if (!owner) + return false; + + if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) { + hold_ctx->wounded = 1; + + /* + * wake_up_process() paired with set_current_state() + * inserts sufficient barriers to make sure @owner either sees + * it's wounded in __ww_mutex_check_kill() or has a + * wakeup pending to re-read the wounded state. + */ + if (owner != current) { + /* + * When waking up the task to wound, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. + * + * NOTE: We pass NULL here instead of lock, because we + * are waking the mutex owner, who may be currently + * blocked on a different mutex. + */ + __clear_task_blocked_on(owner, NULL); + wake_q_add(wake_q, owner); + } + return true; + } + + return false; +} + +/* + * We just acquired @lock under @ww_ctx, if there are more important contexts + * waiting behind us on the wait-list, check if they need to die, or wound us. + * + * See __ww_mutex_add_waiter() for the list-order construction; basically the + * list is ordered by stamp, smallest (oldest) first. + * + * This relies on never mixing wait-die/wound-wait on the same wait-list; + * which is currently ensured by that being a ww_class property. + * + * The current task must not be on the wait list. + */ +static void +__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) +{ + struct MUTEX_WAITER *cur; + + lockdep_assert_wait_lock_held(lock); + + for (cur = __ww_waiter_first(lock); cur; + cur = __ww_waiter_next(lock, cur)) { + + if (!cur->ww_ctx) + continue; + + if (__ww_mutex_die(lock, cur, ww_ctx, wake_q) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx, wake_q)) + break; + } +} + +/* + * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx + * and wake up any waiters so they can recheck. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + DEFINE_WAKE_Q(wake_q); + unsigned long flags; + + ww_mutex_lock_acquired(lock, ctx); + + /* + * The lock->ctx update should be visible on all cores before + * the WAITERS check is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* See comments above and below. */ + + /* + * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS + * MB MB + * [R] MUTEX_FLAG_WAITERS [R] ww->ctx + * + * The memory barrier above pairs with the memory barrier in + * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx + * and/or !empty list. + */ + if (likely(!__ww_mutex_has_waiters(&lock->base))) + return; + + /* + * Uh oh, we raced in fastpath, check if any of the waiters need to + * die or wound us. + */ + lock_wait_lock(&lock->base, &flags); + __ww_mutex_check_waiters(&lock->base, ctx, &wake_q); + preempt_disable(); + unlock_wait_lock(&lock->base, &flags); + wake_up_q(&wake_q); + preempt_enable(); +} + +static __always_inline int +__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) +{ + if (ww_ctx->acquired > 0) { +#ifdef DEBUG_WW_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + +/* + * Check the wound condition for the current lock acquire. + * + * Wound-Wait: If we're wounded, kill ourself. + * + * Wait-Die: If we're trying to acquire a lock already held by an older + * context, kill ourselves. + * + * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to + * look at waiters before us in the wait-list. + */ +static inline int +__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, + struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); + struct MUTEX_WAITER *cur; + + if (ctx->acquired == 0) + return 0; + + if (!ctx->is_wait_die) { + if (ctx->wounded) + return __ww_mutex_kill(lock, ctx); + + return 0; + } + + if (hold_ctx && __ww_ctx_less(ctx, hold_ctx)) + return __ww_mutex_kill(lock, ctx); + + /* + * If there is a waiter in front of us that has a context, then its + * stamp is earlier than ours and we must kill ourself. + */ + for (cur = __ww_waiter_prev(lock, waiter); cur; + cur = __ww_waiter_prev(lock, cur)) { + + if (!cur->ww_ctx) + continue; + + return __ww_mutex_kill(lock, ctx); + } + + return 0; +} + +/* + * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest + * first. Such that older contexts are preferred to acquire the lock over + * younger contexts. + * + * Waiters without context are interspersed in FIFO order. + * + * Furthermore, for Wait-Die kill ourself immediately when possible (there are + * older contexts already waiting) to avoid unnecessary waiting and for + * Wound-Wait ensure we wound the owning context when it is younger. + */ +static inline int +__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, + struct MUTEX *lock, + struct ww_acquire_ctx *ww_ctx, + struct wake_q_head *wake_q) +{ + struct MUTEX_WAITER *cur, *pos = NULL; + bool is_wait_die; + + if (!ww_ctx) { + __ww_waiter_add(lock, waiter, NULL); + return 0; + } + + is_wait_die = ww_ctx->is_wait_die; + + /* + * Add the waiter before the first waiter with a higher stamp. + * Waiters without a context are skipped to avoid starving + * them. Wait-Die waiters may die here. Wound-Wait waiters + * never die here, but they are sorted in stamp order and + * may wound the lock holder. + */ + for (cur = __ww_waiter_last(lock); cur; + cur = __ww_waiter_prev(lock, cur)) { + + if (!cur->ww_ctx) + continue; + + if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) { + /* + * Wait-Die: if we find an older context waiting, there + * is no point in queueing behind it, as we'd have to + * die the moment it would acquire the lock. + */ + if (is_wait_die) { + int ret = __ww_mutex_kill(lock, ww_ctx); + + if (ret) + return ret; + } + + break; + } + + pos = cur; + + /* Wait-Die: ensure younger waiters die. */ + __ww_mutex_die(lock, cur, ww_ctx, wake_q); + } + + __ww_waiter_add(lock, waiter, pos); + + /* + * Wound-Wait: if we're blocking on a mutex owned by a younger context, + * wound that such that we might proceed. + */ + if (!is_wait_die) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + + /* + * See ww_mutex_set_context_fastpath(). Orders setting + * MUTEX_FLAG_WAITERS vs the ww->ctx load, + * such that either we or the fastpath will wound @ww->ctx. + */ + smp_mb(); + __ww_mutex_wound(lock, ww_ctx, ww->ctx, wake_q); + } + + return 0; +} + +static inline void __ww_mutex_unlock(struct ww_mutex *lock) +{ + if (lock->ctx) { +#ifdef DEBUG_WW_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } +} diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c new file mode 100644 index 000000000000..c7196de838ed --- /dev/null +++ b/kernel/locking/ww_rt_mutex.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * rtmutex API + */ +#include <linux/spinlock.h> +#include <linux/export.h> + +#define RT_MUTEX_BUILD_MUTEX +#define WW_RT +#include "rtmutex.c" + +int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct rt_mutex *rtm = &lock->base; + + if (!ww_ctx) + return rt_mutex_trylock(rtm); + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + + if (__rt_mutex_trylock(&rtm->rtmutex)) { + ww_mutex_set_context_fastpath(lock, ww_ctx); + mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); + return 1; + } + + return 0; +} +EXPORT_SYMBOL(ww_mutex_trylock); + +static int __sched +__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, + unsigned int state, unsigned long ip) +{ + struct lockdep_map __maybe_unused *nest_lock = NULL; + struct rt_mutex *rtm = &lock->base; + int ret; + + might_sleep(); + + if (ww_ctx) { + if (unlikely(ww_ctx == READ_ONCE(lock->ctx))) + return -EALREADY; + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + nest_lock = &ww_ctx->dep_map; +#endif + } + mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip); + + if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) { + if (ww_ctx) + ww_mutex_set_context_fastpath(lock, ww_ctx); + return 0; + } + + ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state); + + if (ret) + mutex_release(&rtm->dep_map, ip); + return ret; +} + +int __sched +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_); +} +EXPORT_SYMBOL(ww_mutex_lock); + +int __sched +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_); +} +EXPORT_SYMBOL(ww_mutex_lock_interruptible); + +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + struct rt_mutex *rtm = &lock->base; + + __ww_mutex_unlock(lock); + + mutex_release(&rtm->dep_map, _RET_IP_); + __rt_mutex_unlock(&rtm->rtmutex); +} +EXPORT_SYMBOL(ww_mutex_unlock); |
