summaryrefslogtreecommitdiff
path: root/kernel/locking
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/locking')
-rw-r--r--kernel/locking/Makefile14
-rw-r--r--kernel/locking/irqflag-debug.c13
-rw-r--r--kernel/locking/lock_events.c179
-rw-r--r--kernel/locking/lock_events.h64
-rw-r--r--kernel/locking/lock_events_list.h102
-rw-r--r--kernel/locking/lockdep.c4868
-rw-r--r--kernel/locking/lockdep_internals.h151
-rw-r--r--kernel/locking/lockdep_proc.c213
-rw-r--r--kernel/locking/lockdep_states.h1
-rw-r--r--kernel/locking/locktorture.c946
-rw-r--r--kernel/locking/mcs_spinlock.h21
-rw-r--r--kernel/locking/mutex-debug.c44
-rw-r--r--kernel/locking/mutex-debug.h28
-rw-r--r--kernel/locking/mutex.c735
-rw-r--r--kernel/locking/mutex.h72
-rw-r--r--kernel/locking/osq_lock.c85
-rw-r--r--kernel/locking/percpu-rwsem.c220
-rw-r--r--kernel/locking/qrwlock.c120
-rw-r--r--kernel/locking/qspinlock.c515
-rw-r--r--kernel/locking/qspinlock.h201
-rw-r--r--kernel/locking/qspinlock_paravirt.h223
-rw-r--r--kernel/locking/qspinlock_stat.h234
-rw-r--r--kernel/locking/rtmutex-debug.c181
-rw-r--r--kernel/locking/rtmutex-debug.h36
-rw-r--r--kernel/locking/rtmutex.c1628
-rw-r--r--kernel/locking/rtmutex.h34
-rw-r--r--kernel/locking/rtmutex_api.c656
-rw-r--r--kernel/locking/rtmutex_common.h226
-rw-r--r--kernel/locking/rwbase_rt.c303
-rw-r--r--kernel/locking/rwsem-spinlock.c319
-rw-r--r--kernel/locking/rwsem-xadd.c652
-rw-r--r--kernel/locking/rwsem.c1621
-rw-r--r--kernel/locking/rwsem.h68
-rw-r--r--kernel/locking/semaphore.c106
-rw-r--r--kernel/locking/spinlock.c122
-rw-r--r--kernel/locking/spinlock_debug.c54
-rw-r--r--kernel/locking/spinlock_rt.c287
-rw-r--r--kernel/locking/test-ww_mutex.c186
-rw-r--r--kernel/locking/ww_mutex.h594
-rw-r--r--kernel/locking/ww_rt_mutex.c101
40 files changed, 10871 insertions, 5352 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 760158d9d98d..a114949eeed5 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,16 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
# Any varying coverage in these files is non-deterministic
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance.
+KASAN_SANITIZE_lockdep.o := n
+KCSAN_SANITIZE_lockdep.o := n
+
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
+obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
@@ -20,12 +25,11 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
-obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o
+obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
-obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
-obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
+obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c
new file mode 100644
index 000000000000..810b50344d35
--- /dev/null
+++ b/kernel/locking/irqflag-debug.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/irqflags.h>
+
+noinstr void warn_bogus_irq_restore(void)
+{
+ instrumentation_begin();
+ WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n");
+ instrumentation_end();
+}
+EXPORT_SYMBOL(warn_bogus_irq_restore);
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
new file mode 100644
index 000000000000..e68d82099558
--- /dev/null
+++ b/kernel/locking/lock_events.c
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * Collect locking event counts
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/fs.h>
+
+#include "lock_events.h"
+
+#undef LOCK_EVENT
+#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
+
+#define LOCK_EVENTS_DIR "lock_event_counts"
+
+/*
+ * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different
+ * types of locks will be reported under the <debugfs>/lock_event_counts/
+ * directory. See lock_events_list.h for the list of available locking
+ * events.
+ *
+ * Writing to the special ".reset_counts" file will reset all the above
+ * locking event counts. This is a very slow operation and so should not
+ * be done frequently.
+ *
+ * These event counts are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counts usable even in a production
+ * environment.
+ */
+static const char * const lockevent_names[lockevent_num + 1] = {
+
+#include "lock_events_list.h"
+
+ [LOCKEVENT_reset_cnts] = ".reset_counts",
+};
+
+/*
+ * Per-cpu counts
+ */
+DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]);
+
+/*
+ * The lockevent_read() function can be overridden.
+ */
+ssize_t __weak lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[64];
+ int cpu, id, len;
+ u64 sum = 0;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ id = (long)file_inode(file)->i_private;
+
+ if (id >= lockevent_num)
+ return -EBADF;
+
+ for_each_possible_cpu(cpu)
+ sum += per_cpu(lockevents[id], cpu);
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Function to handle write request
+ *
+ * When idx = reset_cnts, reset all the counts.
+ */
+static ssize_t lockevent_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int cpu;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts)
+ return count;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ unsigned long *ptr = per_cpu_ptr(lockevents, cpu);
+
+ for (i = 0 ; i < lockevent_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ }
+ return count;
+}
+
+/*
+ * Debugfs data structures
+ */
+static const struct file_operations fops_lockevent = {
+ .read = lockevent_read,
+ .write = lockevent_write,
+ .llseek = default_llseek,
+};
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#include <asm/paravirt.h>
+
+static bool __init skip_lockevent(const char *name)
+{
+ static int pv_on __initdata = -1;
+
+ if (pv_on < 0)
+ pv_on = !pv_is_native_spin_unlock();
+ /*
+ * Skip PV qspinlock events on bare metal.
+ */
+ if (!pv_on && !memcmp(name, "pv_", 3))
+ return true;
+ return false;
+}
+#else
+static inline bool skip_lockevent(const char *name)
+{
+ return false;
+}
+#endif
+
+/*
+ * Initialize debugfs for the locking event counts.
+ */
+static int __init init_lockevent_counts(void)
+{
+ struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
+ int i;
+
+ if (IS_ERR(d_counts))
+ goto out;
+
+ /*
+ * Create the debugfs files
+ *
+ * As reading from and writing to the stat files can be slow, only
+ * root is allowed to do the read/write to limit impact to system
+ * performance.
+ */
+ for (i = 0; i < lockevent_num; i++) {
+ if (skip_lockevent(lockevent_names[i]))
+ continue;
+ if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts,
+ (void *)(long)i, &fops_lockevent)))
+ goto fail_undo;
+ }
+
+ if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+ d_counts, (void *)(long)LOCKEVENT_reset_cnts,
+ &fops_lockevent)))
+ goto fail_undo;
+
+ return 0;
+fail_undo:
+ debugfs_remove_recursive(d_counts);
+out:
+ pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR);
+ return -ENOMEM;
+}
+fs_initcall(init_lockevent_counts);
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
new file mode 100644
index 000000000000..d2345e9c0190
--- /dev/null
+++ b/kernel/locking/lock_events.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+
+#ifndef __LOCKING_LOCK_EVENTS_H
+#define __LOCKING_LOCK_EVENTS_H
+
+enum lock_events {
+
+#include "lock_events_list.h"
+
+ lockevent_num, /* Total number of lock event counts */
+ LOCKEVENT_reset_cnts = lockevent_num,
+};
+
+#ifdef CONFIG_LOCK_EVENT_COUNTS
+/*
+ * Per-cpu counters
+ */
+DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
+
+/*
+ * Increment the statistical counters. use raw_cpu_inc() because of lower
+ * overhead and we don't care if we loose the occasional update.
+ */
+static inline void __lockevent_inc(enum lock_events event, bool cond)
+{
+ if (cond)
+ raw_cpu_inc(lockevents[event]);
+}
+
+#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
+#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c)
+
+static inline void __lockevent_add(enum lock_events event, int inc)
+{
+ raw_cpu_add(lockevents[event], inc);
+}
+
+#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
+
+#else /* CONFIG_LOCK_EVENT_COUNTS */
+
+#define lockevent_inc(ev)
+#define lockevent_add(ev, c) do { (void)(c); } while (0)
+#define lockevent_cond_inc(ev, c) do { (void)(c); } while (0)
+
+#endif /* CONFIG_LOCK_EVENT_COUNTS */
+
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos);
+
+#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
new file mode 100644
index 000000000000..4e36258cc34f
--- /dev/null
+++ b/kernel/locking/lock_events_list.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+
+#ifndef LOCK_EVENT
+#define LOCK_EVENT(name) LOCKEVENT_ ## name,
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * Locking events for PV qspinlock.
+ */
+LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */
+LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */
+LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */
+LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */
+LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */
+LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */
+LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */
+LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */
+LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */
+LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */
+LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+/*
+ * Locking events for qspinlock
+ *
+ * Subtracting lock_use_node[234] from lock_slowpath will give you
+ * lock_use_node1.
+ */
+LOCK_EVENT(lock_pending) /* # of locking ops via pending code */
+LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */
+LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */
+LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */
+LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */
+LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+/*
+ * Locking events for Resilient Queued Spin Lock
+ */
+LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */
+
+/*
+ * Locking events for rwsem
+ */
+LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
+LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
+LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
+LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
+LOCK_EVENT(rwsem_opt_lock) /* # of opt-acquired write locks */
+LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */
+LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */
+LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
+LOCK_EVENT(rwsem_rlock_steal) /* # of read locks by lock stealing */
+LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
+LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
+LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
+LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
+LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
+LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
+
+/*
+ * Locking events for rtlock_slowlock()
+ */
+LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */
+LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */
+LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */
+
+/*
+ * Locking events for rt_mutex_slowlock()
+ */
+LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */
+LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */
+LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */
+LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */
+LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */
+LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */
+
+/*
+ * Locking events for lockdep
+ */
+LOCK_EVENT(lockdep_acquire)
+LOCK_EVENT(lockdep_lock)
+LOCK_EVENT(lockdep_nocheck)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7d2499bec5fe..2d4c5bab5af8 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/lockdep.c
*
@@ -45,33 +46,87 @@
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
+#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/gfp.h>
-#include <linux/kmemcheck.h>
#include <linux/random.h>
#include <linux/jhash.h>
+#include <linux/nmi.h>
+#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
+#include <linux/lockdep.h>
+#include <linux/context_tracking.h>
+#include <linux/console.h>
+#include <linux/kasan.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
+#include "lock_events.h"
-#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
#ifdef CONFIG_PROVE_LOCKING
-int prove_locking = 1;
+static int prove_locking = 1;
module_param(prove_locking, int, 0644);
#else
#define prove_locking 0
#endif
#ifdef CONFIG_LOCK_STAT
-int lock_stat = 1;
+static int lock_stat = 1;
module_param(lock_stat, int, 0644);
#else
#define lock_stat 0
#endif
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table kern_lockdep_table[] = {
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_LOCK_STAT
+ {
+ .procname = "lock_stat",
+ .data = &lock_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_LOCK_STAT */
+};
+
+static __init int kernel_lockdep_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_lockdep_table);
+ return 0;
+}
+late_initcall(kernel_lockdep_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
+DEFINE_PER_CPU(unsigned int, lockdep_recursion);
+EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
+
+static __always_inline bool lockdep_enabled(void)
+{
+ if (!debug_locks)
+ return false;
+
+ if (this_cpu_read(lockdep_recursion))
+ return false;
+
+ if (current->lockdep_recursion)
+ return false;
+
+ return true;
+}
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -80,11 +135,44 @@ module_param(lock_stat, int, 0644);
* to use a raw spinlock - we really dont want the spinlock
* code to recurse back into the lockdep code...
*/
-static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t __lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static struct task_struct *__owner;
+
+static inline void lockdep_lock(void)
+{
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ __this_cpu_inc(lockdep_recursion);
+ arch_spin_lock(&__lock);
+ __owner = current;
+}
+
+static inline void lockdep_unlock(void)
+{
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current))
+ return;
+
+ __owner = NULL;
+ arch_spin_unlock(&__lock);
+ __this_cpu_dec(lockdep_recursion);
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+static inline bool lockdep_assert_locked(void)
+{
+ return DEBUG_LOCKS_WARN_ON(__owner != current);
+}
+#endif
+
+static struct task_struct *lockdep_selftest_task_struct;
+
static int graph_lock(void)
{
- arch_spin_lock(&lockdep_lock);
+ lockdep_lock();
+ lockevent_inc(lockdep_lock);
/*
* Make sure that if another CPU detected a bug while
* walking the graph we dont change it (while the other
@@ -92,27 +180,15 @@ static int graph_lock(void)
* dropped already)
*/
if (!debug_locks) {
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
return 0;
}
- /* prevent any recursions within lockdep from causing deadlocks */
- current->lockdep_recursion++;
return 1;
}
-static inline int graph_unlock(void)
+static inline void graph_unlock(void)
{
- if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
- /*
- * The lockdep graph lock isn't locked while we expect it to
- * be, we're confused now, bye!
- */
- return DEBUG_LOCKS_WARN_ON(1);
- }
-
- current->lockdep_recursion--;
- arch_spin_unlock(&lockdep_lock);
- return 0;
+ lockdep_unlock();
}
/*
@@ -123,33 +199,51 @@ static inline int debug_locks_off_graph_unlock(void)
{
int ret = debug_locks_off();
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
return ret;
}
unsigned long nr_list_entries;
static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
/*
* All data structures here are protected by the global debug_lock.
*
- * Mutex key structs only get allocated, once during bootup, and never
- * get freed - this significantly simplifies the debugging code.
+ * nr_lock_classes is the number of elements of lock_classes[] that is
+ * in use.
*/
+#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define KEYHASH_SIZE (1UL << KEYHASH_BITS)
+static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
-static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+unsigned long nr_zapped_classes;
+unsigned long nr_dynamic_keys;
+unsigned long max_lock_class_idx;
+struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
- if (!hlock->class_idx) {
+ unsigned int class_idx = hlock->class_idx;
+
+ /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */
+ barrier();
+
+ if (!test_bit(class_idx, lock_classes_in_use)) {
/*
* Someone passed in garbage, we give up.
*/
DEBUG_LOCKS_WARN_ON(1);
return NULL;
}
- return lock_classes + hlock->class_idx - 1;
+
+ /*
+ * At this point, if the passed hlock->class_idx is still garbage,
+ * we just have to live with it
+ */
+ return lock_classes + class_idx;
}
#ifdef CONFIG_LOCK_STAT
@@ -203,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
dst->nr += src->nr;
}
-struct lock_class_stats lock_stats(struct lock_class *class)
+void lock_stats(struct lock_class *class, struct lock_class_stats *stats)
{
- struct lock_class_stats stats;
int cpu, i;
- memset(&stats, 0, sizeof(struct lock_class_stats));
+ memset(stats, 0, sizeof(struct lock_class_stats));
for_each_possible_cpu(cpu) {
struct lock_class_stats *pcs =
&per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
- for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
- stats.contention_point[i] += pcs->contention_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++)
+ stats->contention_point[i] += pcs->contention_point[i];
- for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
- stats.contending_point[i] += pcs->contending_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++)
+ stats->contending_point[i] += pcs->contending_point[i];
- lock_time_add(&pcs->read_waittime, &stats.read_waittime);
- lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+ lock_time_add(&pcs->read_waittime, &stats->read_waittime);
+ lock_time_add(&pcs->write_waittime, &stats->write_waittime);
- lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
- lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+ lock_time_add(&pcs->read_holdtime, &stats->read_holdtime);
+ lock_time_add(&pcs->write_holdtime, &stats->write_holdtime);
- for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
- stats.bounces[i] += pcs->bounces[i];
+ for (i = 0; i < ARRAY_SIZE(stats->bounces); i++)
+ stats->bounces[i] += pcs->bounces[i];
}
-
- return stats;
}
void clear_lock_stats(struct lock_class *class)
@@ -248,12 +339,7 @@ void clear_lock_stats(struct lock_class *class)
static struct lock_class_stats *get_lock_stats(struct lock_class *class)
{
- return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
-}
-
-static void put_lock_stats(struct lock_class_stats *stats)
-{
- put_cpu_var(cpu_lock_stats);
+ return &this_cpu_ptr(cpu_lock_stats)[class - lock_classes];
}
static void lock_release_holdtime(struct held_lock *hlock)
@@ -271,7 +357,6 @@ static void lock_release_holdtime(struct held_lock *hlock)
lock_time_inc(&stats->read_holdtime, holdtime);
else
lock_time_inc(&stats->write_holdtime, holdtime);
- put_lock_stats(stats);
}
#else
static inline void lock_release_holdtime(struct held_lock *hlock)
@@ -280,11 +365,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
#endif
/*
- * We keep a global list of all lock classes. The list only grows,
- * never shrinks. The list is only accessed with the lockdep
- * spinlock lock held.
+ * We keep a global list of all lock classes. The list is only accessed with
+ * the lockdep spinlock lock held. free_lock_classes is a list with free
+ * elements. These elements are linked together by the lock_entry member in
+ * struct lock_class.
*/
-LIST_HEAD(all_lock_classes);
+static LIST_HEAD(all_lock_classes);
+static LIST_HEAD(free_lock_classes);
+
+/**
+ * struct pending_free - information about data structures about to be freed
+ * @zapped: Head of a list with struct lock_class elements.
+ * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements
+ * are about to be freed.
+ */
+struct pending_free {
+ struct list_head zapped;
+ DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS);
+};
+
+/**
+ * struct delayed_free - data structures used for delayed freeing
+ *
+ * A data structure for delayed freeing of data structures that may be
+ * accessed by RCU readers at the time these were freed.
+ *
+ * @rcu_head: Used to schedule an RCU callback for freeing data structures.
+ * @index: Index of @pf to which freed data structures are added.
+ * @scheduled: Whether or not an RCU callback has been scheduled.
+ * @pf: Array with information about data structures about to be freed.
+ */
+static struct delayed_free {
+ struct rcu_head rcu_head;
+ int index;
+ int scheduled;
+ struct pending_free pf[2];
+} delayed_free;
/*
* The lockdep classes are in a hash-table as well, for fast lookup:
@@ -308,6 +424,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE];
static struct hlist_head chainhash_table[CHAINHASH_SIZE];
/*
+ * the id of held_lock
+ */
+static inline u16 hlock_id(struct held_lock *hlock)
+{
+ BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16);
+
+ return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS));
+}
+
+static inline __maybe_unused unsigned int chain_hlock_class_idx(u16 hlock_id)
+{
+ return hlock_id & (MAX_LOCKDEP_KEYS - 1);
+}
+
+/*
* The hash key of the lock dependency chains is a hash itself too:
* it's a hash of all locks taken up to that lock, including that lock.
* It's a 64-bit hash, because it's important for the keys to be
@@ -322,17 +453,28 @@ static inline u64 iterate_chain_key(u64 key, u32 idx)
return k0 | (u64)k1 << 32;
}
-void lockdep_off(void)
+void lockdep_init_task(struct task_struct *task)
{
- current->lockdep_recursion++;
+ task->lockdep_depth = 0; /* no locks held yet */
+ task->curr_chain_key = INITIAL_CHAIN_KEY;
+ task->lockdep_recursion = 0;
}
-EXPORT_SYMBOL(lockdep_off);
-void lockdep_on(void)
+static __always_inline void lockdep_recursion_inc(void)
{
- current->lockdep_recursion--;
+ __this_cpu_inc(lockdep_recursion);
+}
+
+static __always_inline void lockdep_recursion_finish(void)
+{
+ if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion)))
+ __this_cpu_write(lockdep_recursion, 0);
+}
+
+void lockdep_set_selftest_task(struct task_struct *task)
+{
+ lockdep_selftest_task_struct = task;
}
-EXPORT_SYMBOL(lockdep_on);
/*
* Debugging switches:
@@ -344,14 +486,12 @@ EXPORT_SYMBOL(lockdep_on);
#if VERBOSE
# define HARDIRQ_VERBOSE 1
# define SOFTIRQ_VERBOSE 1
-# define RECLAIM_VERBOSE 1
#else
# define HARDIRQ_VERBOSE 0
# define SOFTIRQ_VERBOSE 0
-# define RECLAIM_VERBOSE 0
#endif
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
/*
* Quick filtering for interesting events:
*/
@@ -379,13 +519,6 @@ static int verbose(struct lock_class *class)
return 0;
}
-/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
- */
-unsigned long nr_stack_trace_entries;
-static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
-
static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
@@ -395,44 +528,108 @@ static void print_lockdep_off(const char *bug_msg)
#endif
}
-static int save_trace(struct stack_trace *trace)
-{
- trace->nr_entries = 0;
- trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- trace->entries = stack_trace + nr_stack_trace_entries;
+unsigned long nr_stack_trace_entries;
- trace->skip = 3;
+#ifdef CONFIG_PROVE_LOCKING
+/**
+ * struct lock_trace - single stack backtrace
+ * @hash_entry: Entry in a stack_trace_hash[] list.
+ * @hash: jhash() of @entries.
+ * @nr_entries: Number of entries in @entries.
+ * @entries: Actual stack backtrace.
+ */
+struct lock_trace {
+ struct hlist_node hash_entry;
+ u32 hash;
+ u32 nr_entries;
+ unsigned long entries[] __aligned(sizeof(unsigned long));
+};
+#define LOCK_TRACE_SIZE_IN_LONGS \
+ (sizeof(struct lock_trace) / sizeof(unsigned long))
+/*
+ * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock.
+ */
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE];
- save_stack_trace(trace);
+static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2)
+{
+ return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries &&
+ memcmp(t1->entries, t2->entries,
+ t1->nr_entries * sizeof(t1->entries[0])) == 0;
+}
- /*
- * Some daft arches put -1 at the end to indicate its a full trace.
- *
- * <rant> this is buggy anyway, since it takes a whole extra entry so a
- * complete trace that maxes out the entries provided will be reported
- * as incomplete, friggin useless </rant>
- */
- if (trace->nr_entries != 0 &&
- trace->entries[trace->nr_entries-1] == ULONG_MAX)
- trace->nr_entries--;
+static struct lock_trace *save_trace(void)
+{
+ struct lock_trace *trace, *t2;
+ struct hlist_head *hash_head;
+ u32 hash;
+ int max_entries;
- trace->max_entries = trace->nr_entries;
+ BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE);
+ BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES);
- nr_stack_trace_entries += trace->nr_entries;
+ trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries);
+ max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries -
+ LOCK_TRACE_SIZE_IN_LONGS;
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ if (max_entries <= 0) {
if (!debug_locks_off_graph_unlock())
- return 0;
+ return NULL;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
- return 0;
+ return NULL;
}
+ trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3);
+
+ hash = jhash(trace->entries, trace->nr_entries *
+ sizeof(trace->entries[0]), 0);
+ trace->hash = hash;
+ hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1));
+ hlist_for_each_entry(t2, hash_head, hash_entry) {
+ if (traces_identical(trace, t2))
+ return t2;
+ }
+ nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries;
+ hlist_add_head(&trace->hash_entry, hash_head);
- return 1;
+ return trace;
+}
+
+/* Return the number of stack traces in the stack_trace[] array. */
+u64 lockdep_stack_trace_count(void)
+{
+ struct lock_trace *trace;
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) {
+ hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) {
+ c++;
+ }
+ }
+
+ return c;
}
+/* Return the number of stack hash chains that have at least one stack trace. */
+u64 lockdep_stack_hash_count(void)
+{
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++)
+ if (!hlist_empty(&stack_trace_hash[i]))
+ c++;
+
+ return c;
+}
+#endif
+
unsigned int nr_hardirq_chains;
unsigned int nr_softirq_chains;
unsigned int nr_process_chains;
@@ -445,6 +642,7 @@ unsigned int max_lockdep_depth;
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
+#ifdef CONFIG_PROVE_LOCKING
/*
* Locking printouts:
*/
@@ -461,9 +659,13 @@ static const char *usage_str[] =
#include "lockdep_states.h"
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
+ [LOCK_USED_READ] = "INITIAL READ USE",
+ /* abused as string storage for verify_lock_unused() */
+ [LOCK_USAGE_STATES] = "IN-NMI",
};
+#endif
-const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+const char *__get_key_name(const struct lockdep_subclass_key *key, char *str)
{
return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
@@ -475,15 +677,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit)
static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
{
+ /*
+ * The usage character defaults to '.' (i.e., irqs disabled and not in
+ * irq context), which is the safest usage category.
+ */
char c = '.';
- if (class->usage_mask & lock_flag(bit + 2))
+ /*
+ * The order of the following usage checks matters, which will
+ * result in the outcome character as follows:
+ *
+ * - '+': irq is enabled and not in irq context
+ * - '-': in irq context and irq is disabled
+ * - '?': in irq context and irq is enabled
+ */
+ if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) {
c = '+';
- if (class->usage_mask & lock_flag(bit)) {
- c = '-';
- if (class->usage_mask & lock_flag(bit + 2))
+ if (class->usage_mask & lock_flag(bit))
c = '?';
- }
+ } else if (class->usage_mask & lock_flag(bit))
+ c = '-';
return c;
}
@@ -501,7 +714,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
usage[i] = '\0';
}
-static void __print_lock_name(struct lock_class *class)
+static void __print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char str[KSYM_NAME_LEN];
const char *name;
@@ -516,18 +729,22 @@ static void __print_lock_name(struct lock_class *class)
printk(KERN_CONT "#%d", class->name_version);
if (class->subclass)
printk(KERN_CONT "/%d", class->subclass);
+ if (hlock && class->print_fn)
+ class->print_fn(hlock->instance);
}
}
-static void print_lock_name(struct lock_class *class)
+static void print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char usage[LOCK_USAGE_CHARS];
get_usage_chars(class, usage);
printk(KERN_CONT " (");
- __print_lock_name(class);
- printk(KERN_CONT "){%s}", usage);
+ __print_lock_name(hlock, class);
+ printk(KERN_CONT "){%s}-{%d:%d}", usage,
+ class->wait_type_outer ?: class->wait_type_inner,
+ class->wait_type_inner);
}
static void print_lockdep_cache(struct lockdep_map *lock)
@@ -547,36 +764,43 @@ static void print_lock(struct held_lock *hlock)
/*
* We can be called locklessly through debug_show_all_locks() so be
* extra careful, the hlock might have been released and cleared.
+ *
+ * If this indeed happens, lets pretend it does not hurt to continue
+ * to print the lock unless the hlock class_idx does not point to a
+ * registered class. The rationale here is: since we don't attempt
+ * to distinguish whether we are in this situation, if it just
+ * happened we can't count on class_idx to tell either.
*/
- unsigned int class_idx = hlock->class_idx;
+ struct lock_class *lock = hlock_class(hlock);
- /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
- barrier();
-
- if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
+ if (!lock) {
printk(KERN_CONT "<RELEASED>\n");
return;
}
- print_lock_name(lock_classes + class_idx - 1);
- printk(KERN_CONT ", at: [<%p>] %pS\n",
- (void *)hlock->acquire_ip, (void *)hlock->acquire_ip);
+ printk(KERN_CONT "%px", hlock->instance);
+ print_lock_name(hlock, lock);
+ printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
-static void lockdep_print_held_locks(struct task_struct *curr)
+static void lockdep_print_held_locks(struct task_struct *p)
{
- int i, depth = curr->lockdep_depth;
+ int i, depth = READ_ONCE(p->lockdep_depth);
- if (!depth) {
- printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
+ if (!depth)
+ printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
+ else
+ printk("%d lock%s held by %s/%d:\n", depth,
+ str_plural(depth), p->comm, task_pid_nr(p));
+ /*
+ * It's not reliable to print a task's held locks if it's not sleeping
+ * and it's not the current task.
+ */
+ if (p != current && task_is_running(p))
return;
- }
- printk("%d lock%s held by %s/%d:\n",
- depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
-
for (i = 0; i < depth; i++) {
printk(" #%d: ", i);
- print_lock(curr->held_locks + i);
+ print_lock(p->held_locks + i);
}
}
@@ -600,19 +824,26 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
-static int static_obj(void *obj)
+static int static_obj(const void *obj)
{
- unsigned long start = (unsigned long) &_stext,
- end = (unsigned long) &_end,
- addr = (unsigned long) obj;
+ unsigned long addr = (unsigned long) obj;
+
+ if (is_kernel_core_data(addr))
+ return 1;
/*
- * static variable?
+ * keys are allowed in the __ro_after_init section.
*/
- if ((addr >= start) && (addr < end))
+ if (is_kernel_rodata(addr))
return 1;
- if (arch_is_kernel_data(addr))
+ /*
+ * in initdata section and used during bootup only?
+ * NOTE: On some platforms the initdata section is
+ * outside of the _stext ... _end range.
+ */
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ init_section_contains((void *)addr, 1))
return 1;
/*
@@ -630,7 +861,8 @@ static int static_obj(void *obj)
/*
* To make lock name printouts unique, we calculate a unique
- * class->name_version generation counter:
+ * class->name_version generation counter. The caller must hold the graph
+ * lock.
*/
static int count_matching_names(struct lock_class *new_class)
{
@@ -640,7 +872,7 @@ static int count_matching_names(struct lock_class *new_class)
if (!new_class->name)
return 0;
- list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
if (new_class->key - new_class->subclass == class->key)
return class->name_version;
if (class->name && !strcmp(class->name, new_class->name))
@@ -650,48 +882,34 @@ static int count_matching_names(struct lock_class *new_class)
return count + 1;
}
-/*
- * Register a lock's class in the hash-table, if the class is not present
- * yet. Otherwise we look it up. We cache the result in the lock object
- * itself, so actual lookup of the hash should be once per lock object.
- */
-static inline struct lock_class *
-look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
+/* used from NMI context -- must be lockless */
+static noinstr struct lock_class *
+look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
- bool is_static = false;
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ instrumentation_begin();
debug_locks_off();
+ nbcon_cpu_emergency_enter();
printk(KERN_ERR
"BUG: looking up invalid subclass: %u\n", subclass);
printk(KERN_ERR
"turning off the locking correctness validator.\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
+ instrumentation_end();
return NULL;
}
/*
- * Static locks do not have their class-keys yet - for them the key
- * is the lock object itself. If the lock is in the per cpu area,
- * the canonical address of the lock (per cpu offset removed) is
- * used.
+ * If it is not initialised then it has never been locked,
+ * so it won't be present in the hash table.
*/
- if (unlikely(!lock->key)) {
- unsigned long can_addr, addr = (unsigned long)lock;
-
- if (__is_kernel_percpu_address(addr, &can_addr))
- lock->key = (void *)can_addr;
- else if (__is_module_percpu_address(addr, &can_addr))
- lock->key = (void *)can_addr;
- else if (static_obj(lock))
- lock->key = (void *)lock;
- else
- return ERR_PTR(-EINVAL);
- is_static = true;
- }
+ if (unlikely(!lock->key))
+ return NULL;
/*
* NOTE: the class-key must be unique. For dynamic locks, a static
@@ -712,18 +930,350 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name);
+ WARN_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__,
+ "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n",
+ lock->name, lock->key, class->name);
return class;
}
}
- return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
+ return NULL;
+}
+
+/*
+ * Static locks do not have their class-keys yet - for them the key is
+ * the lock object itself. If the lock is in the per cpu area, the
+ * canonical address of the lock (per cpu offset removed) is used.
+ */
+static bool assign_lock_key(struct lockdep_map *lock)
+{
+ unsigned long can_addr, addr = (unsigned long)lock;
+
+#ifdef __KERNEL__
+ /*
+ * lockdep_free_key_range() assumes that struct lock_class_key
+ * objects do not overlap. Since we use the address of lock
+ * objects as class key for static objects, check whether the
+ * size of lock_class_key objects does not exceed the size of
+ * the smallest lock object.
+ */
+ BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t));
+#endif
+
+ if (__is_kernel_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (__is_module_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (static_obj(lock))
+ lock->key = (void *)lock;
+ else {
+ /* Debug-check: all keys must be persistent! */
+ debug_locks_off();
+ nbcon_cpu_emergency_enter();
+ pr_err("INFO: trying to register non-static key.\n");
+ pr_err("The code is fine but needs lockdep annotation, or maybe\n");
+ pr_err("you didn't initialize this object before use?\n");
+ pr_err("turning off the locking correctness validator.\n");
+ dump_stack();
+ nbcon_cpu_emergency_exit();
+ return false;
+ }
+
+ return true;
+}
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+
+/* Check whether element @e occurs in list @h */
+static bool in_list(struct list_head *e, struct list_head *h)
+{
+ struct list_head *f;
+
+ list_for_each(f, h) {
+ if (e == f)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check whether entry @e occurs in any of the locks_after or locks_before
+ * lists.
+ */
+static bool in_any_class_list(struct list_head *e)
+{
+ struct lock_class *class;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (in_list(e, &class->locks_after) ||
+ in_list(e, &class->locks_before))
+ return true;
+ }
+ return false;
+}
+
+static bool class_lock_list_valid(struct lock_class *c, struct list_head *h)
+{
+ struct lock_list *e;
+
+ list_for_each_entry(e, h, entry) {
+ if (e->links_to != c) {
+ printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s",
+ c->name ? : "(?)",
+ (unsigned long)(e - list_entries),
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)",
+ e->class && e->class->name ? e->class->name :
+ "(?)");
+ return false;
+ }
+ }
+ return true;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+#endif
+
+static bool check_lock_chain_key(struct lock_chain *chain)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ u64 chain_key = INITIAL_CHAIN_KEY;
+ int i;
+
+ for (i = chain->base; i < chain->base + chain->depth; i++)
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
+ /*
+ * The 'unsigned long long' casts avoid that a compiler warning
+ * is reported when building tools/lib/lockdep.
+ */
+ if (chain->chain_key != chain_key) {
+ printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n",
+ (unsigned long long)(chain - lock_chains),
+ (unsigned long long)chain->chain_key,
+ (unsigned long long)chain_key);
+ return false;
+ }
+#endif
+ return true;
+}
+
+static bool in_any_zapped_class_list(struct lock_class *class)
+{
+ struct pending_free *pf;
+ int i;
+
+ for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) {
+ if (in_list(&class->lock_entry, &pf->zapped))
+ return true;
+ }
+
+ return false;
+}
+
+static bool __check_data_structures(void)
+{
+ struct lock_class *class;
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ struct lock_list *e;
+ int i;
+
+ /* Check whether all classes occur in a lock list. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!in_list(&class->lock_entry, &all_lock_classes) &&
+ !in_list(&class->lock_entry, &free_lock_classes) &&
+ !in_any_zapped_class_list(class)) {
+ printk(KERN_INFO "class %px/%s is not in any class list\n",
+ class, class->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /* Check whether all classes have valid lock lists. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!class_lock_list_valid(class, &class->locks_before))
+ return false;
+ if (!class_lock_list_valid(class, &class->locks_after))
+ return false;
+ }
+
+ /* Check the chain_key of all lock chains. */
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ if (!check_lock_chain_key(chain))
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are in use occur in a class
+ * lock list.
+ */
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (!in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class->name ? : "(?)",
+ e->links_to->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are not in use do not occur in
+ * a class lock list.
+ */
+ for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class && e->class->name ? e->class->name :
+ "(?)",
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int check_consistency = 0;
+module_param(check_consistency, int, 0644);
+
+static void check_data_structures(void)
+{
+ static bool once = false;
+
+ if (check_consistency && !once) {
+ if (!__check_data_structures()) {
+ once = true;
+ WARN_ON(once);
+ }
+ }
+}
+
+#else /* CONFIG_DEBUG_LOCKDEP */
+
+static inline void check_data_structures(void) { }
+
+#endif /* CONFIG_DEBUG_LOCKDEP */
+
+static void init_chain_block_buckets(void);
+
+/*
+ * Initialize the lock_classes[] array elements, the free_lock_classes list
+ * and also the delayed_free structure.
+ */
+static void init_data_structures_once(void)
+{
+ static bool __read_mostly ds_initialized, rcu_head_initialized;
+ int i;
+
+ if (likely(rcu_head_initialized))
+ return;
+
+ if (system_state >= SYSTEM_SCHEDULING) {
+ init_rcu_head(&delayed_free.rcu_head);
+ rcu_head_initialized = true;
+ }
+
+ if (ds_initialized)
+ return;
+
+ ds_initialized = true;
+
+ INIT_LIST_HEAD(&delayed_free.pf[0].zapped);
+ INIT_LIST_HEAD(&delayed_free.pf[1].zapped);
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes);
+ INIT_LIST_HEAD(&lock_classes[i].locks_after);
+ INIT_LIST_HEAD(&lock_classes[i].locks_before);
+ }
+ init_chain_block_buckets();
+}
+
+static inline struct hlist_head *keyhashentry(const struct lock_class_key *key)
+{
+ unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS);
+
+ return lock_keys_hash + hash;
+}
+
+/* Register a dynamically allocated key. */
+void lockdep_register_key(struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+ hash_head = keyhashentry(key);
+
+ raw_local_irq_save(flags);
+ if (!graph_lock())
+ goto restore_irqs;
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (WARN_ON_ONCE(k == key))
+ goto out_unlock;
+ }
+ hlist_add_head_rcu(&key->hash_entry, hash_head);
+ nr_dynamic_keys++;
+out_unlock:
+ graph_unlock();
+restore_irqs:
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_register_key);
+
+/* Check whether a key has been registered as a dynamic key. */
+static bool is_dynamic_key(const struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ bool found = false;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return false;
+
+ /*
+ * If lock debugging is disabled lock_keys_hash[] may contain
+ * pointers to memory that has already been freed. Avoid triggering
+ * a use-after-free in that case by returning early.
+ */
+ if (!debug_locks)
+ return true;
+
+ hash_head = keyhashentry(key);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return found;
}
/*
@@ -737,22 +1287,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ int idx;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
class = look_up_lock_class(lock, subclass);
- if (likely(!IS_ERR_OR_NULL(class)))
+ if (likely(class))
goto out_set_class_cache;
- /*
- * Debug-check: all keys must be persistent!
- */
- if (IS_ERR(class)) {
- debug_locks_off();
- printk("INFO: trying to register non-static key.\n");
- printk("the code is fine but needs lockdep annotation.\n");
- printk("turning off the locking correctness validator.\n");
- dump_stack();
+ if (!lock->key) {
+ if (!assign_lock_key(lock))
+ return NULL;
+ } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) {
return NULL;
}
@@ -771,46 +1317,58 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
goto out_unlock_set;
}
- /*
- * Allocate a new key from the static array, and add it to
- * the hash:
- */
- if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ init_data_structures_once();
+
+ /* Allocate a new lock class and add it to the hash. */
+ class = list_first_entry_or_null(&free_lock_classes, typeof(*class),
+ lock_entry);
+ if (!class) {
if (!debug_locks_off_graph_unlock()) {
return NULL;
}
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
- class = lock_classes + nr_lock_classes++;
+ nr_lock_classes++;
+ __set_bit(class - lock_classes, lock_classes_in_use);
debug_atomic_inc(nr_unused_locks);
class->key = key;
class->name = lock->name;
class->subclass = subclass;
- INIT_LIST_HEAD(&class->lock_entry);
- INIT_LIST_HEAD(&class->locks_before);
- INIT_LIST_HEAD(&class->locks_after);
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
class->name_version = count_matching_names(class);
+ class->wait_type_inner = lock->wait_type_inner;
+ class->wait_type_outer = lock->wait_type_outer;
+ class->lock_type = lock->lock_type;
/*
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
*/
hlist_add_head_rcu(&class->hash_entry, hash_head);
/*
- * Add it to the global list of classes:
+ * Remove the class from the free list and add it to the global list
+ * of classes.
*/
- list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
+ list_move_tail(&class->lock_entry, &all_lock_classes);
+ idx = class - lock_classes;
+ if (idx > max_lock_class_idx)
+ max_lock_class_idx = idx;
if (verbose(class)) {
graph_unlock();
- printk("\nnew class %p: %s", class->key, class->name);
+ nbcon_cpu_emergency_enter();
+ printk("\nnew class %px: %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
if (!graph_lock()) {
return NULL;
@@ -842,23 +1400,31 @@ out_set_class_cache:
*/
static struct lock_list *alloc_list_entry(void)
{
- if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ int idx = find_first_zero_bit(list_entries_in_use,
+ ARRAY_SIZE(list_entries));
+
+ if (idx >= ARRAY_SIZE(list_entries)) {
if (!debug_locks_off_graph_unlock())
return NULL;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
- return list_entries + nr_list_entries++;
+ nr_list_entries++;
+ __set_bit(idx, list_entries_in_use);
+ return list_entries + idx;
}
/*
* Add a new dependency to the head of the list:
*/
-static int add_lock_to_list(struct lock_class *this, struct list_head *head,
- unsigned long ip, int distance,
- struct stack_trace *trace)
+static int add_lock_to_list(struct lock_class *this,
+ struct lock_class *links_to, struct list_head *head,
+ u16 distance, u8 dep,
+ const struct lock_trace *trace)
{
struct lock_list *entry;
/*
@@ -870,8 +1436,10 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head,
return 0;
entry->class = this;
+ entry->links_to = links_to;
+ entry->dep = dep;
entry->distance = distance;
- entry->trace = *trace;
+ entry->trace = trace;
/*
* Both allocation and removal are done under the graph lock; but
* iteration is under RCU-sched; see look_up_lock_class() and
@@ -885,17 +1453,21 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head,
/*
* For good efficiency of modular, we use power of 2
*/
-#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define MAX_CIRCULAR_QUEUE_SIZE (1UL << CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS)
#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
/*
- * The circular_queue and helpers is used to implement the
- * breadth-first search(BFS)algorithem, by which we can build
- * the shortest path from the next lock to be acquired to the
- * previous held lock if there is a circular between them.
+ * The circular_queue and helpers are used to implement graph
+ * breadth-first search (BFS) algorithm, by which we can determine
+ * whether there is a path from a lock to another. In deadlock checks,
+ * a path from the next lock to be acquired to a previous held lock
+ * indicates that adding the <prev> -> <next> lock dependency will
+ * produce a circle in the graph. Breadth-first search instead of
+ * depth-first search is used in order to find the shortest (circular)
+ * path.
*/
struct circular_queue {
- unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE];
unsigned int front, rear;
};
@@ -921,7 +1493,7 @@ static inline int __cq_full(struct circular_queue *cq)
return ((cq->rear + 1) & CQ_MASK) == cq->front;
}
-static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem)
{
if (__cq_full(cq))
return -1;
@@ -931,14 +1503,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
return 0;
}
-static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+/*
+ * Dequeue an element from the circular_queue, return a lock_list if
+ * the queue is not empty, or NULL if otherwise.
+ */
+static inline struct lock_list * __cq_dequeue(struct circular_queue *cq)
{
+ struct lock_list * lock;
+
if (__cq_empty(cq))
- return -1;
+ return NULL;
- *elem = cq->element[cq->front];
+ lock = cq->element[cq->front];
cq->front = (cq->front + 1) & CQ_MASK;
- return 0;
+
+ return lock;
}
static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
@@ -946,23 +1525,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
return (cq->rear - cq->front) & CQ_MASK;
}
-static inline void mark_lock_accessed(struct lock_list *lock,
- struct lock_list *parent)
+static inline void mark_lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
- nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+static inline void visit_lock_entry(struct lock_list *lock,
+ struct lock_list *parent)
+{
lock->parent = parent;
- lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
static inline unsigned long lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
-
- nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
@@ -983,114 +1558,329 @@ static inline int get_lock_depth(struct lock_list *child)
return depth;
}
-static int __bfs(struct lock_list *source_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry,
- int forward)
+/*
+ * Return the forward or backward dependency list.
+ *
+ * @lock: the lock_list to get its class's dependency list
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ */
+static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
+{
+ void *lock_class = lock->class;
+
+ return lock_class + offset;
+}
+/*
+ * Return values of a bfs search:
+ *
+ * BFS_E* indicates an error
+ * BFS_R* indicates a result (match or not)
+ *
+ * BFS_EINVALIDNODE: Find a invalid node in the graph.
+ *
+ * BFS_EQUEUEFULL: The queue is full while doing the bfs.
+ *
+ * BFS_RMATCH: Find the matched node in the graph, and put that node into
+ * *@target_entry.
+ *
+ * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry
+ * _unchanged_.
+ */
+enum bfs_result {
+ BFS_EINVALIDNODE = -2,
+ BFS_EQUEUEFULL = -1,
+ BFS_RMATCH = 0,
+ BFS_RNOMATCH = 1,
+};
+
+/*
+ * bfs_result < 0 means error
+ */
+static inline bool bfs_error(enum bfs_result res)
+{
+ return res < 0;
+}
+
+/*
+ * DEP_*_BIT in lock_list::dep
+ *
+ * For dependency @prev -> @next:
+ *
+ * SR: @prev is shared reader (->read != 0) and @next is recursive reader
+ * (->read == 2)
+ * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader
+ * SN: @prev is shared reader and @next is non-recursive locker (->read != 2)
+ * EN: @prev is exclusive locker and @next is non-recursive locker
+ *
+ * Note that we define the value of DEP_*_BITs so that:
+ * bit0 is prev->read == 0
+ * bit1 is next->read != 2
+ */
+#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */
+#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */
+#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */
+#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */
+
+#define DEP_SR_MASK (1U << (DEP_SR_BIT))
+#define DEP_ER_MASK (1U << (DEP_ER_BIT))
+#define DEP_SN_MASK (1U << (DEP_SN_BIT))
+#define DEP_EN_MASK (1U << (DEP_EN_BIT))
+
+static inline unsigned int
+__calc_dep_bit(struct held_lock *prev, struct held_lock *next)
+{
+ return (prev->read == 0) + ((next->read != 2) << 1);
+}
+
+static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next)
+{
+ return 1U << __calc_dep_bit(prev, next);
+}
+
+/*
+ * calculate the dep_bit for backwards edges. We care about whether @prev is
+ * shared and whether @next is recursive.
+ */
+static inline unsigned int
+__calc_dep_bitb(struct held_lock *prev, struct held_lock *next)
+{
+ return (next->read != 2) + ((prev->read == 0) << 1);
+}
+
+static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next)
+{
+ return 1U << __calc_dep_bitb(prev, next);
+}
+
+/*
+ * Initialize a lock_list entry @lock belonging to @class as the root for a BFS
+ * search.
+ */
+static inline void __bfs_init_root(struct lock_list *lock,
+ struct lock_class *class)
{
+ lock->class = class;
+ lock->parent = NULL;
+ lock->only_xr = 0;
+}
+
+/*
+ * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the
+ * root for a BFS search.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure
+ * that <prev> -> @hlock and @hlock -> <whatever __bfs() found> is not -(*R)->
+ * and -(S*)->.
+ */
+static inline void bfs_init_root(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read == 2);
+}
+
+/*
+ * Similar to bfs_init_root() but initialize the root for backwards BFS.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure
+ * that <next> -> @hlock and @hlock -> <whatever backwards BFS found> is not
+ * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->).
+ */
+static inline void bfs_init_rootb(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read != 0);
+}
+
+static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset)
+{
+ if (!lock || !lock->parent)
+ return NULL;
+
+ return list_next_or_null_rcu(get_dep_list(lock->parent, offset),
+ &lock->entry, struct lock_list, entry);
+}
+
+/*
+ * Breadth-First Search to find a strong path in the dependency graph.
+ *
+ * @source_entry: the source of the path we are searching for.
+ * @data: data used for the second parameter of @match function
+ * @match: match function for the search
+ * @target_entry: pointer to the target of a matched path
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ *
+ * We may have multiple edges (considering different kinds of dependencies,
+ * e.g. ER and SN) between two nodes in the dependency graph. But
+ * only the strong dependency path in the graph is relevant to deadlocks. A
+ * strong dependency path is a dependency path that doesn't have two adjacent
+ * dependencies as -(*R)-> -(S*)->, please see:
+ *
+ * Documentation/locking/lockdep-design.rst
+ *
+ * for more explanation of the definition of strong dependency paths
+ *
+ * In __bfs(), we only traverse in the strong dependency path:
+ *
+ * In lock_list::only_xr, we record whether the previous dependency only
+ * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we
+ * filter out any -(S*)-> in the current dependency and after that, the
+ * ->only_xr is set according to whether we only have -(*R)-> left.
+ */
+static enum bfs_result __bfs(struct lock_list *source_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int offset)
+{
+ struct circular_queue *cq = &lock_cq;
+ struct lock_list *lock = NULL;
struct lock_list *entry;
struct list_head *head;
- struct circular_queue *cq = &lock_cq;
- int ret = 1;
+ unsigned int cq_depth;
+ bool first;
- if (match(source_entry, data)) {
- *target_entry = source_entry;
- ret = 0;
- goto exit;
- }
+ lockdep_assert_locked();
- if (forward)
- head = &source_entry->class->locks_after;
- else
- head = &source_entry->class->locks_before;
+ __cq_init(cq);
+ __cq_enqueue(cq, source_entry);
- if (list_empty(head))
- goto exit;
+ while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) {
+ if (!lock->class)
+ return BFS_EINVALIDNODE;
- __cq_init(cq);
- __cq_enqueue(cq, (unsigned long)source_entry);
+ /*
+ * Step 1: check whether we already finish on this one.
+ *
+ * If we have visited all the dependencies from this @lock to
+ * others (iow, if we have visited all lock_list entries in
+ * @lock->class->locks_{after,before}) we skip, otherwise go
+ * and visit all the dependencies in the list and mark this
+ * list accessed.
+ */
+ if (lock_accessed(lock))
+ continue;
+ else
+ mark_lock_accessed(lock);
+
+ /*
+ * Step 2: check whether prev dependency and this form a strong
+ * dependency path.
+ */
+ if (lock->parent) { /* Parent exists, check prev dependency */
+ u8 dep = lock->dep;
+ bool prev_only_xr = lock->parent->only_xr;
- while (!__cq_empty(cq)) {
- struct lock_list *lock;
+ /*
+ * Mask out all -(S*)-> if we only have *R in previous
+ * step, because -(*R)-> -(S*)-> don't make up a strong
+ * dependency.
+ */
+ if (prev_only_xr)
+ dep &= ~(DEP_SR_MASK | DEP_SN_MASK);
- __cq_dequeue(cq, (unsigned long *)&lock);
+ /* If nothing left, we skip */
+ if (!dep)
+ continue;
- if (!lock->class) {
- ret = -2;
- goto exit;
+ /* If there are only -(*R)-> left, set that for the next step */
+ lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK));
}
- if (forward)
- head = &lock->class->locks_after;
- else
- head = &lock->class->locks_before;
+ /*
+ * Step 3: we haven't visited this and there is a strong
+ * dependency path to this, so check with @match.
+ * If @skip is provide and returns true, we skip this
+ * lock (and any path this lock is in).
+ */
+ if (skip && skip(lock, data))
+ continue;
- DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+ if (match(lock, data)) {
+ *target_entry = lock;
+ return BFS_RMATCH;
+ }
+ /*
+ * Step 4: if not match, expand the path by adding the
+ * forward or backwards dependencies in the search
+ *
+ */
+ first = true;
+ head = get_dep_list(lock, offset);
list_for_each_entry_rcu(entry, head, entry) {
- if (!lock_accessed(entry)) {
- unsigned int cq_depth;
- mark_lock_accessed(entry, lock);
- if (match(entry, data)) {
- *target_entry = entry;
- ret = 0;
- goto exit;
- }
+ visit_lock_entry(entry, lock);
- if (__cq_enqueue(cq, (unsigned long)entry)) {
- ret = -1;
- goto exit;
- }
- cq_depth = __cq_get_elem_count(cq);
- if (max_bfs_queue_depth < cq_depth)
- max_bfs_queue_depth = cq_depth;
- }
+ /*
+ * Note we only enqueue the first of the list into the
+ * queue, because we can always find a sibling
+ * dependency from one (see __bfs_next()), as a result
+ * the space of queue is saved.
+ */
+ if (!first)
+ continue;
+
+ first = false;
+
+ if (__cq_enqueue(cq, entry))
+ return BFS_EQUEUEFULL;
+
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
}
}
-exit:
- return ret;
+
+ return BFS_RNOMATCH;
}
-static inline int __bfs_forwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 1);
+ return __bfs(src_entry, data, match, skip, target_entry,
+ offsetof(struct lock_class, locks_after));
}
-static inline int __bfs_backwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 0);
+ return __bfs(src_entry, data, match, skip, target_entry,
+ offsetof(struct lock_class, locks_before));
}
-/*
- * Recursive, forwards-direction lock-dependency checking, used for
- * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
- * checking.
- */
+static void print_lock_trace(const struct lock_trace *trace,
+ unsigned int spaces)
+{
+ stack_trace_print(trace->entries, trace->nr_entries, spaces);
+}
/*
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
*/
-static noinline int
+static noinline void
print_circular_bug_entry(struct lock_list *target, int depth)
{
if (debug_locks_silent)
- return 0;
+ return;
printk("\n-> #%u", depth);
- print_lock_name(target->class);
+ print_lock_name(NULL, target->class);
printk(KERN_CONT ":\n");
- print_stack_trace(&target->trace, 6);
-
- return 0;
+ print_lock_trace(target->trace, 6);
}
static void
@@ -1101,6 +1891,8 @@ print_circular_lock_scenario(struct held_lock *src,
struct lock_class *source = hlock_class(src);
struct lock_class *target = hlock_class(tgt);
struct lock_class *parent = prt->class;
+ int src_read = src->read;
+ int tgt_read = tgt->read;
/*
* A direct locking problem where unsafe_class lock is taken
@@ -1117,28 +1909,36 @@ print_circular_lock_scenario(struct held_lock *src,
*/
if (parent != source) {
printk("Chain exists of:\n ");
- __print_lock_name(source);
+ __print_lock_name(src, source);
printk(KERN_CONT " --> ");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT " --> ");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT "\n\n");
}
printk(" Possible unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
+ if (tgt_read != 0)
+ printk(" rlock(");
+ else
+ printk(" lock(");
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
+ if (src_read != 0)
+ printk(" rlock(");
+ else if (src->sync)
+ printk(" sync(");
+ else
+ printk(" lock(");
+ __print_lock_name(src, source);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -1147,7 +1947,7 @@ print_circular_lock_scenario(struct held_lock *src,
* When a circular dependency is detected, print the
* header first:
*/
-static noinline int
+static noinline void
print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct held_lock *check_src,
struct held_lock *check_tgt)
@@ -1155,7 +1955,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct task_struct *curr = current;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("======================================================\n");
@@ -1165,22 +1965,44 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
+
pr_warn("\nbut task is already holding lock:\n");
+
print_lock(check_tgt);
pr_warn("\nwhich lock already depends on the new lock.\n\n");
pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
print_circular_bug_entry(entry, depth);
-
- return 0;
}
-static inline int class_equal(struct lock_list *entry, void *data)
+/*
+ * We are about to add B -> A into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong
+ * dependency cycle, that means:
+ *
+ * Either
+ *
+ * a) B -> A is -(E*)->
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B)
+ *
+ * as then we don't have -(*R)-> -(S*)-> in the cycle.
+ */
+static inline bool hlock_conflict(struct lock_list *entry, void *data)
{
- return entry->class == data;
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 0 || /* B -> A is -(E*)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
}
-static noinline int print_circular_bug(struct lock_list *this,
+static noinline void print_circular_bug(struct lock_list *this,
struct lock_list *target,
struct held_lock *check_src,
struct held_lock *check_tgt)
@@ -1191,13 +2013,16 @@ static noinline int print_circular_bug(struct lock_list *this,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
- if (!save_trace(&this->trace))
- return 0;
+ this->trace = save_trace();
+ if (!this->trace)
+ return;
depth = get_lock_depth(target);
+ nbcon_cpu_emergency_enter();
+
print_circular_bug_header(target, depth, check_src, check_tgt);
parent = get_lock_parent(target);
@@ -1217,34 +2042,35 @@ static noinline int print_circular_bug(struct lock_list *this,
printk("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
-static noinline int print_bfs_bug(int ret)
+static noinline void print_bfs_bug(int ret)
{
if (!debug_locks_off_graph_unlock())
- return 0;
+ return;
/*
* Breadth-first-search failed, graph got corrupted?
*/
- WARN(1, "lockdep bfs error:%d\n", ret);
+ if (ret == BFS_EQUEUEFULL)
+ pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n");
- return 0;
+ WARN(1, "lockdep bfs error:%d\n", ret);
}
-static int noop_count(struct lock_list *entry, void *data)
+static bool noop_count(struct lock_list *entry, void *data)
{
(*(unsigned long *)data)++;
- return 0;
+ return false;
}
static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
- __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -1253,14 +2079,13 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
- local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
+ raw_local_irq_save(flags);
+ lockdep_lock();
ret = __lockdep_count_forward_deps(&this);
- arch_spin_unlock(&lockdep_lock);
- local_irq_restore(flags);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
return ret;
}
@@ -1268,9 +2093,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
- __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -1280,68 +2105,208 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
- local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
+ raw_local_irq_save(flags);
+ lockdep_lock();
ret = __lockdep_count_backward_deps(&this);
- arch_spin_unlock(&lockdep_lock);
- local_irq_restore(flags);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
return ret;
}
/*
- * Prove that the dependency graph starting at <entry> can not
- * lead to <target>. Print an error and return 0 if it does.
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not.
*/
-static noinline int
-check_noncircular(struct lock_list *root, struct lock_class *target,
- struct lock_list **target_entry)
+static noinline enum bfs_result
+check_path(struct held_lock *target, struct lock_list *src_entry,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
- int result;
+ enum bfs_result ret;
+
+ ret = __bfs_forwards(src_entry, target, match, skip, target_entry);
+
+ if (unlikely(bfs_error(ret)))
+ print_bfs_bug(ret);
+
+ return ret;
+}
+
+static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *);
+
+/*
+ * Prove that the dependency graph starting at <src> can not
+ * lead to <target>. If it can, there is a circle when adding
+ * <target> -> <src> dependency.
+ *
+ * Print an error and return BFS_RMATCH if it does.
+ */
+static noinline enum bfs_result
+check_noncircular(struct held_lock *src, struct held_lock *target,
+ struct lock_trace **const trace)
+{
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
debug_atomic_inc(nr_cyclic_checks);
- result = __bfs_forwards(root, target, class_equal, target_entry);
+ ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry);
- return result;
+ if (unlikely(ret == BFS_RMATCH)) {
+ if (!*trace) {
+ /*
+ * If save_trace fails here, the printing might
+ * trigger a WARN but because of the !nr_entries it
+ * should not do bad things.
+ */
+ *trace = save_trace();
+ }
+
+ if (src->class_idx == target->class_idx)
+ print_deadlock_bug(current, src, target);
+ else
+ print_circular_bug(&src_entry, target_entry, src, target);
+ }
+
+ return ret;
}
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_TRACE_IRQFLAGS
+
/*
* Forwards and backwards subgraph searching, for the purposes of
* proving that two subgraphs can be connected by a new dependency
* without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ *
+ * A irq safe->unsafe deadlock happens with the following conditions:
+ *
+ * 1) We have a strong dependency path A -> ... -> B
+ *
+ * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore
+ * irq can create a new dependency B -> A (consider the case that a holder
+ * of B gets interrupted by an irq whose handler will try to acquire A).
+ *
+ * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a
+ * strong circle:
+ *
+ * For the usage bits of B:
+ * a) if A -> B is -(*N)->, then B -> A could be any type, so any
+ * ENABLED_IRQ usage suffices.
+ * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only
+ * ENABLED_IRQ_*_READ usage suffices.
+ *
+ * For the usage bits of A:
+ * c) if A -> B is -(E*)->, then B -> A could be any type, so any
+ * USED_IN_IRQ usage suffices.
+ * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only
+ * USED_IN_IRQ_*_READ usage suffices.
*/
-static inline int usage_match(struct lock_list *entry, void *bit)
+/*
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of A should be accumulated to detect
+ * safe->unsafe bugs.
+ *
+ * Note that usage_accumulate() is used in backwards search, so ->only_xr
+ * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true).
+ *
+ * As above, if only_xr is false, which means A -> B has -(E*)-> dependency
+ * path, any usage of A should be considered. Otherwise, we should only
+ * consider _READ usage.
+ */
+static inline bool usage_accumulate(struct lock_list *entry, void *mask)
{
- return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+ if (!entry->only_xr)
+ *(unsigned long *)mask |= entry->class->usage_mask;
+ else /* Mask out _READ usage bits */
+ *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ);
+
+ return false;
}
+/*
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of B conflicts with the usage bits of A,
+ * i.e. which usage bit of B may introduce safe->unsafe deadlocks.
+ *
+ * As above, if only_xr is false, which means A -> B has -(*N)-> dependency
+ * path, any usage of B should be considered. Otherwise, we should only
+ * consider _READ usage.
+ */
+static inline bool usage_match(struct lock_list *entry, void *mask)
+{
+ if (!entry->only_xr)
+ return !!(entry->class->usage_mask & *(unsigned long *)mask);
+ else /* Mask out _READ usage bits */
+ return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask);
+}
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ if (entry->class->lock_type == LD_LOCK_NORMAL)
+ return false;
+
+ /*
+ * Skip local_lock() for irq inversion detection.
+ *
+ * For !RT, local_lock() is not a real lock, so it won't carry any
+ * dependency.
+ *
+ * For RT, an irq inversion happens when we have lock A and B, and on
+ * some CPU we can have:
+ *
+ * lock(A);
+ * <interrupted>
+ * lock(B);
+ *
+ * where lock(B) cannot sleep, and we have a dependency B -> ... -> A.
+ *
+ * Now we prove local_lock() cannot exist in that dependency. First we
+ * have the observation for any lock chain L1 -> ... -> Ln, for any
+ * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise
+ * wait context check will complain. And since B is not a sleep lock,
+ * therefore B.inner_wait_type >= 2, and since the inner_wait_type of
+ * local_lock() is 3, which is greater than 2, therefore there is no
+ * way the local_lock() exists in the dependency B -> ... -> A.
+ *
+ * As a result, we will skip local_lock(), when we search for irq
+ * inversion bugs.
+ */
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
+
+ /*
+ * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+ * a lock and only used to override the wait_type.
+ */
+
+ return true;
+}
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
*
- * Return 0 if such a node exists in the subgraph, and put that node
+ * Return BFS_MATCH if such a node exists in the subgraph, and put that node
* into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
-find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+static enum bfs_result
+find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_forwards_checks);
- result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -1349,22 +2314,16 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
/*
* Find a node in the backwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
- *
- * Return 0 if such a node exists in the subgraph, and put that node
- * into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
-find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+static enum bfs_result
+find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_backwards_checks);
- result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -1374,31 +2333,82 @@ static void print_lock_class_header(struct lock_class *class, int depth)
int bit;
printk("%*s->", depth, "");
- print_lock_name(class);
- printk(KERN_CONT " ops: %lu", class->ops);
+ print_lock_name(NULL, class);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ printk(KERN_CONT " ops: %lu", debug_class_ops_read(class));
+#endif
printk(KERN_CONT " {\n");
- for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ for (bit = 0; bit < LOCK_TRACE_STATES; bit++) {
if (class->usage_mask & (1 << bit)) {
int len = depth;
len += printk("%*s %s", depth, "", usage_str[bit]);
len += printk(KERN_CONT " at:\n");
- print_stack_trace(class->usage_traces + bit, len);
+ print_lock_trace(class->usage_traces[bit], len);
}
}
printk("%*s }\n", depth, "");
- printk("%*s ... key at: [<%p>] %pS\n",
+ printk("%*s ... key at: [<%px>] %pS\n",
depth, "", class->key, class->key);
}
/*
- * printk the shortest lock dependencies from @start to @end in reverse order:
+ * Dependency path printing:
+ *
+ * After BFS we get a lock dependency path (linked via ->parent of lock_list),
+ * printing out each lock in the dependency path will help on understanding how
+ * the deadlock could happen. Here are some details about dependency path
+ * printing:
+ *
+ * 1) A lock_list can be either forwards or backwards for a lock dependency,
+ * for a lock dependency A -> B, there are two lock_lists:
+ *
+ * a) lock_list in the ->locks_after list of A, whose ->class is B and
+ * ->links_to is A. In this case, we can say the lock_list is
+ * "A -> B" (forwards case).
+ *
+ * b) lock_list in the ->locks_before list of B, whose ->class is A
+ * and ->links_to is B. In this case, we can say the lock_list is
+ * "B <- A" (bacwards case).
+ *
+ * The ->trace of both a) and b) point to the call trace where B was
+ * acquired with A held.
+ *
+ * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't
+ * represent a certain lock dependency, it only provides an initial entry
+ * for BFS. For example, BFS may introduce a "helper" lock_list whose
+ * ->class is A, as a result BFS will search all dependencies starting with
+ * A, e.g. A -> B or A -> C.
+ *
+ * The notation of a forwards helper lock_list is like "-> A", which means
+ * we should search the forwards dependencies starting with "A", e.g A -> B
+ * or A -> C.
+ *
+ * The notation of a bacwards helper lock_list is like "<- B", which means
+ * we should search the backwards dependencies ending with "B", e.g.
+ * B <- A or B <- C.
+ */
+
+/*
+ * printk the shortest lock dependencies from @root to @leaf in reverse order.
+ *
+ * We have a lock dependency path as follow:
+ *
+ * @root @leaf
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | <--------- | lock_list | ... | lock_list | <--------- | lock_list |
+ * | -> L1 | | L1 -> L2 | ... |Ln-2 -> Ln-1| | Ln-1 -> Ln|
+ *
+ * , so it's natural that we start from @leaf and print every ->class and
+ * ->trace until we reach the @root.
*/
static void __used
print_shortest_lock_dependencies(struct lock_list *leaf,
- struct lock_list *root)
+ struct lock_list *root)
{
struct lock_list *entry = leaf;
int depth;
@@ -1409,7 +2419,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
do {
print_lock_class_header(entry->class, depth);
printk("%*s ... acquired at:\n", depth, "");
- print_stack_trace(&entry->trace, 2);
+ print_lock_trace(entry->trace, 2);
printk("\n");
if (depth == 0 && (entry != root)) {
@@ -1420,8 +2430,61 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
entry = get_lock_parent(entry);
depth--;
} while (entry && (depth >= 0));
+}
- return;
+/*
+ * printk the shortest lock dependencies from @leaf to @root.
+ *
+ * We have a lock dependency path (from a backwards search) as follow:
+ *
+ * @leaf @root
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | ---------> | lock_list | ... | lock_list | ---------> | lock_list |
+ * | L2 <- L1 | | L3 <- L2 | ... | Ln <- Ln-1 | | <- Ln |
+ *
+ * , so when we iterate from @leaf to @root, we actually print the lock
+ * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order.
+ *
+ * Another thing to notice here is that ->class of L2 <- L1 is L1, while the
+ * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call
+ * trace of L1 in the dependency path, which is alright, because most of the
+ * time we can figure out where L1 is held from the call trace of L2.
+ */
+static void __used
+print_shortest_lock_dependencies_backwards(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ const struct lock_trace *trace = NULL;
+ int depth;
+
+ /*compute depth from generated tree by BFS*/
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ if (trace) {
+ printk("%*s ... acquired at:\n", depth, "");
+ print_lock_trace(trace, 2);
+ printk("\n");
+ }
+
+ /*
+ * Record the pointer to the trace for the next lock_list
+ * entry, see the comments for the function.
+ */
+ trace = entry->trace;
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
}
static void
@@ -1452,11 +2515,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
*/
if (middle_class != unsafe_class) {
printk("Chain exists of:\n ");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT " --> ");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT " --> ");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT "\n\n");
}
@@ -1464,23 +2527,23 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
printk(" lock(");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT ");\n");
printk(" local_irq_disable();\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_bad_irq_dependency(struct task_struct *curr,
struct lock_list *prev_root,
struct lock_list *next_root,
@@ -1493,7 +2556,9 @@ print_bad_irq_dependency(struct task_struct *curr,
const char *irqclass)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
+
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("=====================================================\n");
@@ -1503,33 +2568,33 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn("-----------------------------------------------------\n");
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
- curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
- curr->hardirqs_enabled,
+ lockdep_hardirqs_enabled(),
curr->softirqs_enabled);
print_lock(next);
pr_warn("\nand this task is already holding:\n");
print_lock(prev);
pr_warn("which would create a new lock dependency:\n");
- print_lock_name(hlock_class(prev));
+ print_lock_name(prev, hlock_class(prev));
pr_cont(" ->");
- print_lock_name(hlock_class(next));
+ print_lock_name(next, hlock_class(next));
pr_cont("\n");
pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
- print_lock_name(backwards_entry->class);
+ print_lock_name(NULL, backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
- print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
+ print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
- print_lock_name(forwards_entry->class);
+ print_lock_name(NULL, forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
- print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
+ print_lock_trace(forwards_entry->class->usage_traces[bit2], 1);
pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
@@ -1538,53 +2603,19 @@ print_bad_irq_dependency(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
- if (!save_trace(&prev_root->trace))
- return 0;
- print_shortest_lock_dependencies(backwards_entry, prev_root);
+ print_shortest_lock_dependencies_backwards(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
- if (!save_trace(&next_root->trace))
- return 0;
+ next_root->trace = save_trace();
+ if (!next_root->trace)
+ goto out;
print_shortest_lock_dependencies(forwards_entry, next_root);
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
-}
-
-static int
-check_usage(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, enum lock_usage_bit bit_backwards,
- enum lock_usage_bit bit_forwards, const char *irqclass)
-{
- int ret;
- struct lock_list this, that;
- struct lock_list *uninitialized_var(target_entry);
- struct lock_list *uninitialized_var(target_entry1);
-
- this.parent = NULL;
-
- this.class = hlock_class(prev);
- ret = find_usage_backwards(&this, bit_backwards, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
-
- that.parent = NULL;
- that.class = hlock_class(next);
- ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
-
- return print_bad_irq_dependency(curr, &this, &that,
- target_entry, target_entry1,
- prev, next,
- bit_backwards, bit_forwards, irqclass);
+out:
+ nbcon_cpu_emergency_exit();
}
static const char *state_names[] = {
@@ -1603,103 +2634,361 @@ static const char *state_rnames[] = {
static inline const char *state_name(enum lock_usage_bit bit)
{
- return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+ if (bit & LOCK_USAGE_READ_MASK)
+ return state_rnames[bit >> LOCK_USAGE_DIR_MASK];
+ else
+ return state_names[bit >> LOCK_USAGE_DIR_MASK];
}
+/*
+ * The bit number is encoded like:
+ *
+ * bit0: 0 exclusive, 1 read lock
+ * bit1: 0 used in irq, 1 irq enabled
+ * bit2-n: state
+ */
static int exclusive_bit(int new_bit)
{
- /*
- * USED_IN
- * USED_IN_READ
- * ENABLED
- * ENABLED_READ
- *
- * bit 0 - write/read
- * bit 1 - used_in/enabled
- * bit 2+ state
- */
-
- int state = new_bit & ~3;
- int dir = new_bit & 2;
+ int state = new_bit & LOCK_USAGE_STATE_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* keep state, bit flip the direction and strip read.
*/
- return state | (dir ^ 2);
+ return state | (dir ^ LOCK_USAGE_DIR_MASK);
+}
+
+/*
+ * Observe that when given a bitmask where each bitnr is encoded as above, a
+ * right shift of the mask transforms the individual bitnrs as -1 and
+ * conversely, a left shift transforms into +1 for the individual bitnrs.
+ *
+ * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can
+ * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0)
+ * instead by subtracting the bit number by 2, or shifting the mask right by 2.
+ *
+ * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2.
+ *
+ * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is
+ * all bits set) and recompose with bitnr1 flipped.
+ */
+static unsigned long invert_dir_mask(unsigned long mask)
+{
+ unsigned long excl = 0;
+
+ /* Invert dir */
+ excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK;
+ excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK;
+
+ return excl;
+}
+
+/*
+ * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ
+ * usage may cause deadlock too, for example:
+ *
+ * P1 P2
+ * <irq disabled>
+ * write_lock(l1); <irq enabled>
+ * read_lock(l2);
+ * write_lock(l2);
+ * <in irq>
+ * read_lock(l1);
+ *
+ * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2
+ * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible
+ * deadlock.
+ *
+ * In fact, all of the following cases may cause deadlocks:
+ *
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ
+ *
+ * As a result, to calculate the "exclusive mask", first we invert the
+ * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with
+ * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all
+ * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*).
+ */
+static unsigned long exclusive_mask(unsigned long mask)
+{
+ unsigned long excl = invert_dir_mask(mask);
+
+ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
+ excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
+
+ return excl;
+}
+
+/*
+ * Retrieve the _possible_ original mask to which @mask is
+ * exclusive. Ie: this is the opposite of exclusive_mask().
+ * Note that 2 possible original bits can match an exclusive
+ * bit: one has LOCK_USAGE_READ_MASK set, the other has it
+ * cleared. So both are returned for each exclusive bit.
+ */
+static unsigned long original_mask(unsigned long mask)
+{
+ unsigned long excl = invert_dir_mask(mask);
+
+ /* Include read in existing usages */
+ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
+ excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
+
+ return excl;
+}
+
+/*
+ * Find the first pair of bit match between an original
+ * usage mask and an exclusive usage mask.
+ */
+static int find_exclusive_match(unsigned long mask,
+ unsigned long excl_mask,
+ enum lock_usage_bit *bitp,
+ enum lock_usage_bit *excl_bitp)
+{
+ int bit, excl, excl_read;
+
+ for_each_set_bit(bit, &mask, LOCK_USED) {
+ /*
+ * exclusive_bit() strips the read bit, however,
+ * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need
+ * to search excl | LOCK_USAGE_READ_MASK as well.
+ */
+ excl = exclusive_bit(bit);
+ excl_read = excl | LOCK_USAGE_READ_MASK;
+ if (excl_mask & lock_flag(excl)) {
+ *bitp = bit;
+ *excl_bitp = excl;
+ return 0;
+ } else if (excl_mask & lock_flag(excl_read)) {
+ *bitp = bit;
+ *excl_bitp = excl_read;
+ return 0;
+ }
+ }
+ return -1;
}
+/*
+ * Prove that the new dependency does not connect a hardirq-safe(-read)
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, enum lock_usage_bit bit)
+ struct held_lock *next)
{
+ unsigned long usage_mask = 0, forward_mask, backward_mask;
+ enum lock_usage_bit forward_bit = 0, backward_bit = 0;
+ struct lock_list *target_entry1;
+ struct lock_list *target_entry;
+ struct lock_list this, that;
+ enum bfs_result ret;
+
/*
- * Prove that the new dependency does not connect a hardirq-safe
- * lock with a hardirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
+ * Step 1: gather all hard/soft IRQs usages backward in an
+ * accumulated usage mask.
*/
- if (!check_usage(curr, prev, next, bit,
- exclusive_bit(bit), state_name(bit)))
+ bfs_init_rootb(&this, prev);
+
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
return 0;
+ }
- bit++; /* _READ */
+ usage_mask &= LOCKF_USED_IN_IRQ_ALL;
+ if (!usage_mask)
+ return 1;
/*
- * Prove that the new dependency does not connect a hardirq-safe-read
- * lock with a hardirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
+ * Step 2: find exclusive uses forward that match the previous
+ * backward accumulated mask.
*/
- if (!check_usage(curr, prev, next, bit,
- exclusive_bit(bit), state_name(bit)))
+ forward_mask = exclusive_mask(usage_mask);
+
+ bfs_init_root(&that, next);
+
+ ret = find_usage_forwards(&that, forward_mask, &target_entry1);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
return 0;
+ }
+ if (ret == BFS_RNOMATCH)
+ return 1;
+
+ /*
+ * Step 3: we found a bad match! Now retrieve a lock from the backward
+ * list whose usage mask matches the exclusive usage mask from the
+ * lock found on the forward list.
+ *
+ * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering
+ * the follow case:
+ *
+ * When trying to add A -> B to the graph, we find that there is a
+ * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M,
+ * that B -> ... -> M. However M is **softirq-safe**, if we use exact
+ * invert bits of M's usage_mask, we will find another lock N that is
+ * **softirq-unsafe** and N -> ... -> A, however N -> .. -> M will not
+ * cause a inversion deadlock.
+ */
+ backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL);
+
+ ret = find_usage_backwards(&this, backward_mask, &target_entry);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
+ return 0;
+ }
+ if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH))
+ return 1;
+
+ /*
+ * Step 4: narrow down to a pair of incompatible usage bits
+ * and report it.
+ */
+ ret = find_exclusive_match(target_entry->class->usage_mask,
+ target_entry1->class->usage_mask,
+ &backward_bit, &forward_bit);
+ if (DEBUG_LOCKS_WARN_ON(ret == -1))
+ return 1;
+
+ print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
+ backward_bit, forward_bit,
+ state_name(backward_bit));
+
+ return 0;
+}
+
+#else
+static inline int check_irq_usage(struct task_struct *curr,
+ struct held_lock *prev, struct held_lock *next)
+{
return 1;
}
-static int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next)
+static inline bool usage_skip(struct lock_list *entry, void *mask)
{
-#define LOCKDEP_STATE(__STATE) \
- if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
- return 0;
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
+ return false;
+}
- return 1;
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * We are about to add A -> B into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * If A -> .. -> B can replace A -> B in any __bfs() search (means the former
+ * is _stronger_ than or equal to the latter), we consider A -> B as redundant.
+ * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
+ * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
+ * dependency graph, as any strong path ..-> A -> B ->.. we can get with
+ * having dependency A -> B, we could already get a equivalent path ..-> A ->
+ * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
+ *
+ * We need to make sure both the start and the end of A -> .. -> B is not
+ * weaker than A -> B. For the start part, please see the comment in
+ * check_redundant(). For the end part, we need:
+ *
+ * Either
+ *
+ * a) A -> B is -(*R)-> (everything is not weaker than that)
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
+ *
+ */
+static inline bool hlock_equal(struct lock_list *entry, void *data)
+{
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 2 || /* A -> B is -(*R)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
}
-static void inc_chains(void)
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if
+ * any error appears in the bfs search.
+ */
+static noinline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
{
- if (current->hardirq_context)
- nr_hardirq_chains++;
- else {
- if (current->softirq_context)
- nr_softirq_chains++;
- else
- nr_process_chains++;
- }
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
+ /*
+ * Special setup for check_redundant().
+ *
+ * To report redundant, we need to find a strong dependency path that
+ * is equal to or stronger than <src> -> <target>. So if <src> is E,
+ * we need to let __bfs() only search for a path starting at a -(E*)->,
+ * we achieve this by setting the initial node's ->only_xr to true in
+ * that case. And if <prev> is S, we set initial ->only_xr to false
+ * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
+ */
+ src_entry.only_xr = src->read == 0;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ /*
+ * Note: we skip local_lock() for redundant check, because as the
+ * comment in usage_skip(), A -> local_lock() -> B and A -> B are not
+ * the same.
+ */
+ ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry);
+
+ if (ret == BFS_RMATCH)
+ debug_atomic_inc(nr_redundant);
+
+ return ret;
}
#else
-static inline int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next)
+static inline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
{
- return 1;
+ return BFS_RNOMATCH;
}
-static inline void inc_chains(void)
+#endif
+
+static void inc_chains(int irq_context)
{
- nr_process_chains++;
+ if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
+ nr_hardirq_chains++;
+ else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT)
+ nr_softirq_chains++;
+ else
+ nr_process_chains++;
}
-#endif
+static void dec_chains(int irq_context)
+{
+ if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
+ nr_hardirq_chains--;
+ else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT)
+ nr_softirq_chains--;
+ else
+ nr_process_chains--;
+}
static void
-print_deadlock_scenario(struct held_lock *nxt,
- struct held_lock *prv)
+print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
{
struct lock_class *next = hlock_class(nxt);
struct lock_class *prev = hlock_class(prv);
@@ -1708,21 +2997,25 @@ print_deadlock_scenario(struct held_lock *nxt,
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(prev);
+ __print_lock_name(prv, prev);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(next);
+ __print_lock_name(nxt, next);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
printk(" May be due to missing lock nesting notation\n\n");
}
-static int
+static void
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
+ struct lock_class *class = hlock_class(prev);
+
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
+
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("============================================\n");
@@ -1735,6 +3028,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nbut task is already holding lock:\n");
print_lock(prev);
+ if (class->cmp_fn) {
+ pr_warn("and the lock comparison function returns %i:\n",
+ class->cmp_fn(prev->instance, next->instance));
+ }
+
pr_warn("\nother info that might help us debug this:\n");
print_deadlock_scenario(next, prev);
lockdep_print_held_locks(curr);
@@ -1742,7 +3040,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
/*
@@ -1751,12 +3049,14 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
* (Note that this has to be done separately, because the graph cannot
* detect such classes of deadlocks.)
*
- * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
+ * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same
+ * lock class is held but nest_lock is also held, i.e. we rely on the
+ * nest_lock to avoid the deadlock.
*/
static int
-check_deadlock(struct task_struct *curr, struct held_lock *next,
- struct lockdep_map *next_instance, int read)
+check_deadlock(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_class *class;
struct held_lock *prev;
struct held_lock *nest = NULL;
int i;
@@ -1774,8 +3074,14 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
* Allow read-after-read recursion of the same
* lock class (i.e. read_lock(lock)+read_lock(lock)):
*/
- if ((read == 2) && prev->read)
- return 2;
+ if ((next->read == 2) && prev->read)
+ continue;
+
+ class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ continue;
/*
* We're holding the nest_lock, which serializes this lock's
@@ -1784,14 +3090,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
- return print_deadlock_bug(curr, prev, next);
+ print_deadlock_bug(curr, prev, next);
+ return 0;
}
return 1;
}
/*
* There was a chain-cache miss, and we are about to add a new dependency
- * to a previous lock. We recursively validate the following rules:
+ * to a previous lock. We validate the following rules:
*
* - would the adding of the <prev> -> <next> dependency create a
* circular dependency in the graph? [== circular deadlock]
@@ -1813,51 +3120,55 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, int *stack_saved)
+ struct held_lock *next, u16 distance,
+ struct lock_trace **const trace)
{
struct lock_list *entry;
- int ret;
- struct lock_list this;
- struct lock_list *uninitialized_var(target_entry);
- /*
- * Static variable, serialized by the graph_lock().
- *
- * We use this static variable to save the stack trace in case
- * we call into this function multiple times due to encountering
- * trylocks in the held lock stack.
- */
- static struct stack_trace trace;
+ enum bfs_result ret;
+
+ if (!hlock_class(prev)->key || !hlock_class(next)->key) {
+ /*
+ * The warning statements below may trigger a use-after-free
+ * of the class name. It is better to trigger a use-after free
+ * and to have the class name most of the time instead of not
+ * having the class name available.
+ */
+ WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(prev),
+ hlock_class(prev)->name);
+ WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(next),
+ hlock_class(next)->name);
+ return 2;
+ }
+
+ if (prev->class_idx == next->class_idx) {
+ struct lock_class *class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ return 2;
+ }
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
- * forward-recursing into the graph starting at <next>, and
- * checking whether we can reach <prev>.)
+ * a breadth-first search into the graph starting at <next>,
+ * and check whether we can reach <prev>.)
*
- * We are using global variables to control the recursion, to
- * keep the stackframe size of the recursive functions low:
+ * The search is limited by the size of the circular queue (i.e.,
+ * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes
+ * in the graph whose neighbours are to be checked.
*/
- this.class = hlock_class(next);
- this.parent = NULL;
- ret = check_noncircular(&this, hlock_class(prev), &target_entry);
- if (unlikely(!ret))
- return print_circular_bug(&this, target_entry, next, prev);
- else if (unlikely(ret < 0))
- return print_bfs_bug(ret);
-
- if (!check_prev_add_irq(curr, prev, next))
+ ret = check_noncircular(next, prev, trace);
+ if (unlikely(bfs_error(ret) || ret == BFS_RMATCH))
+ return 0;
+
+ if (!check_irq_usage(curr, prev, next))
return 0;
- /*
- * For recursive read-locks we do all the dependency checks,
- * but we dont store read-triggered dependencies (only
- * write-triggered dependencies). This ensures that only the
- * write-side dependencies matter, and that if for example a
- * write-lock never takes any other locks, then the reads are
- * equivalent to a NOP.
- */
- if (next->read == 2 || prev->read == 2)
- return 1;
/*
* Is the <prev> -> <next> dependency already present?
*
@@ -1870,49 +3181,71 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
- return 2;
+ entry->dep |= calc_dep(prev, next);
+
+ /*
+ * Also, update the reverse dependency in @next's
+ * ->locks_before list.
+ *
+ * Here we reuse @entry as the cursor, which is fine
+ * because we won't go to the next iteration of the
+ * outer loop:
+ *
+ * For normal cases, we return in the inner loop.
+ *
+ * If we fail to return, we have inconsistency, i.e.
+ * <prev>::locks_after contains <next> while
+ * <next>::locks_before doesn't contain <prev>. In
+ * that case, we return after the inner and indicate
+ * something is wrong.
+ */
+ list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) {
+ if (entry->class == hlock_class(prev)) {
+ if (distance == 1)
+ entry->distance = 1;
+ entry->dep |= calc_depb(prev, next);
+ return 1;
+ }
+ }
+
+ /* <prev> is not found in <next>::locks_before */
+ return 0;
}
}
- if (!*stack_saved) {
- if (!save_trace(&trace))
+ /*
+ * Is the <prev> -> <next> link redundant?
+ */
+ ret = check_redundant(prev, next);
+ if (bfs_error(ret))
+ return 0;
+ else if (ret == BFS_RMATCH)
+ return 2;
+
+ if (!*trace) {
+ *trace = save_trace();
+ if (!*trace)
return 0;
- *stack_saved = 1;
}
/*
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(hlock_class(next),
- &hlock_class(prev)->locks_after,
- next->acquire_ip, distance, &trace);
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+ &hlock_class(prev)->locks_after, distance,
+ calc_dep(prev, next), *trace);
if (!ret)
return 0;
- ret = add_lock_to_list(hlock_class(prev),
- &hlock_class(next)->locks_before,
- next->acquire_ip, distance, &trace);
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+ &hlock_class(next)->locks_before, distance,
+ calc_depb(prev, next), *trace);
if (!ret)
return 0;
- /*
- * Debugging printouts:
- */
- if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
- /* We drop graph lock, so another thread can overwrite trace. */
- *stack_saved = 0;
- graph_unlock();
- printk("\n new dependency: ");
- print_lock_name(hlock_class(prev));
- printk(KERN_CONT " => ");
- print_lock_name(hlock_class(next));
- printk(KERN_CONT "\n");
- dump_stack();
- return graph_lock();
- }
- return 1;
+ return 2;
}
/*
@@ -1924,8 +3257,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_trace *trace = NULL;
int depth = curr->lockdep_depth;
- int stack_saved = 0;
struct held_lock *hlock;
/*
@@ -1944,16 +3277,14 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
goto out_bug;
for (;;) {
- int distance = curr->lockdep_depth - depth + 1;
+ u16 distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
- /*
- * Only non-recursive-read entries get new dependencies
- * added:
- */
- if (hlock->read != 2 && hlock->check) {
- if (!check_prev_add(curr, hlock, next,
- distance, &stack_saved))
+
+ if (hlock->check) {
+ int ret = check_prev_add(curr, hlock, next, distance, &trace);
+ if (!ret)
return 0;
+
/*
* Stop after the first non-trylock entry,
* as non-trylock entries have added their
@@ -1963,6 +3294,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
if (!hlock->trylock)
break;
}
+
depth--;
/*
* End of lock-stack?
@@ -1991,14 +3323,245 @@ out_bug:
return 0;
}
-unsigned long nr_lock_chains;
struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
-int nr_chain_hlocks;
+static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS);
static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+unsigned long nr_zapped_lock_chains;
+unsigned int nr_free_chain_hlocks; /* Free chain_hlocks in buckets */
+unsigned int nr_lost_chain_hlocks; /* Lost chain_hlocks */
+unsigned int nr_large_chain_blocks; /* size > MAX_CHAIN_BUCKETS */
+
+/*
+ * The first 2 chain_hlocks entries in the chain block in the bucket
+ * list contains the following meta data:
+ *
+ * entry[0]:
+ * Bit 15 - always set to 1 (it is not a class index)
+ * Bits 0-14 - upper 15 bits of the next block index
+ * entry[1] - lower 16 bits of next block index
+ *
+ * A next block index of all 1 bits means it is the end of the list.
+ *
+ * On the unsized bucket (bucket-0), the 3rd and 4th entries contain
+ * the chain block size:
+ *
+ * entry[2] - upper 16 bits of the chain block size
+ * entry[3] - lower 16 bits of the chain block size
+ */
+#define MAX_CHAIN_BUCKETS 16
+#define CHAIN_BLK_FLAG (1U << 15)
+#define CHAIN_BLK_LIST_END 0xFFFFU
+
+static int chain_block_buckets[MAX_CHAIN_BUCKETS];
+
+static inline int size_to_bucket(int size)
+{
+ if (size > MAX_CHAIN_BUCKETS)
+ return 0;
+
+ return size - 1;
+}
+
+/*
+ * Iterate all the chain blocks in a bucket.
+ */
+#define for_each_chain_block(bucket, prev, curr) \
+ for ((prev) = -1, (curr) = chain_block_buckets[bucket]; \
+ (curr) >= 0; \
+ (prev) = (curr), (curr) = chain_block_next(curr))
+
+/*
+ * next block or -1
+ */
+static inline int chain_block_next(int offset)
+{
+ int next = chain_hlocks[offset];
+
+ WARN_ON_ONCE(!(next & CHAIN_BLK_FLAG));
+
+ if (next == CHAIN_BLK_LIST_END)
+ return -1;
+
+ next &= ~CHAIN_BLK_FLAG;
+ next <<= 16;
+ next |= chain_hlocks[offset + 1];
+
+ return next;
+}
+
+/*
+ * bucket-0 only
+ */
+static inline int chain_block_size(int offset)
+{
+ return (chain_hlocks[offset + 2] << 16) | chain_hlocks[offset + 3];
+}
+
+static inline void init_chain_block(int offset, int next, int bucket, int size)
+{
+ chain_hlocks[offset] = (next >> 16) | CHAIN_BLK_FLAG;
+ chain_hlocks[offset + 1] = (u16)next;
+
+ if (size && !bucket) {
+ chain_hlocks[offset + 2] = size >> 16;
+ chain_hlocks[offset + 3] = (u16)size;
+ }
+}
+
+static inline void add_chain_block(int offset, int size)
+{
+ int bucket = size_to_bucket(size);
+ int next = chain_block_buckets[bucket];
+ int prev, curr;
+
+ if (unlikely(size < 2)) {
+ /*
+ * We can't store single entries on the freelist. Leak them.
+ *
+ * One possible way out would be to uniquely mark them, other
+ * than with CHAIN_BLK_FLAG, such that we can recover them when
+ * the block before it is re-added.
+ */
+ if (size)
+ nr_lost_chain_hlocks++;
+ return;
+ }
+
+ nr_free_chain_hlocks += size;
+ if (!bucket) {
+ nr_large_chain_blocks++;
+
+ /*
+ * Variable sized, sort large to small.
+ */
+ for_each_chain_block(0, prev, curr) {
+ if (size >= chain_block_size(curr))
+ break;
+ }
+ init_chain_block(offset, curr, 0, size);
+ if (prev < 0)
+ chain_block_buckets[0] = offset;
+ else
+ init_chain_block(prev, offset, 0, 0);
+ return;
+ }
+ /*
+ * Fixed size, add to head.
+ */
+ init_chain_block(offset, next, bucket, size);
+ chain_block_buckets[bucket] = offset;
+}
+
+/*
+ * Only the first block in the list can be deleted.
+ *
+ * For the variable size bucket[0], the first block (the largest one) is
+ * returned, broken up and put back into the pool. So if a chain block of
+ * length > MAX_CHAIN_BUCKETS is ever used and zapped, it will just be
+ * queued up after the primordial chain block and never be used until the
+ * hlock entries in the primordial chain block is almost used up. That
+ * causes fragmentation and reduce allocation efficiency. That can be
+ * monitored by looking at the "large chain blocks" number in lockdep_stats.
+ */
+static inline void del_chain_block(int bucket, int size, int next)
+{
+ nr_free_chain_hlocks -= size;
+ chain_block_buckets[bucket] = next;
+
+ if (!bucket)
+ nr_large_chain_blocks--;
+}
+
+static void init_chain_block_buckets(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_CHAIN_BUCKETS; i++)
+ chain_block_buckets[i] = -1;
+
+ add_chain_block(0, ARRAY_SIZE(chain_hlocks));
+}
+
+/*
+ * Return offset of a chain block of the right size or -1 if not found.
+ *
+ * Fairly simple worst-fit allocator with the addition of a number of size
+ * specific free lists.
+ */
+static int alloc_chain_hlocks(int req)
+{
+ int bucket, curr, size;
+
+ /*
+ * We rely on the MSB to act as an escape bit to denote freelist
+ * pointers. Make sure this bit isn't set in 'normal' class_idx usage.
+ */
+ BUILD_BUG_ON((MAX_LOCKDEP_KEYS-1) & CHAIN_BLK_FLAG);
+
+ init_data_structures_once();
+
+ if (nr_free_chain_hlocks < req)
+ return -1;
+
+ /*
+ * We require a minimum of 2 (u16) entries to encode a freelist
+ * 'pointer'.
+ */
+ req = max(req, 2);
+ bucket = size_to_bucket(req);
+ curr = chain_block_buckets[bucket];
+
+ if (bucket) {
+ if (curr >= 0) {
+ del_chain_block(bucket, req, chain_block_next(curr));
+ return curr;
+ }
+ /* Try bucket 0 */
+ curr = chain_block_buckets[0];
+ }
+
+ /*
+ * The variable sized freelist is sorted by size; the first entry is
+ * the largest. Use it if it fits.
+ */
+ if (curr >= 0) {
+ size = chain_block_size(curr);
+ if (likely(size >= req)) {
+ del_chain_block(0, size, chain_block_next(curr));
+ if (size > req)
+ add_chain_block(curr + req, size - req);
+ return curr;
+ }
+ }
+
+ /*
+ * Last resort, split a block in a larger sized bucket.
+ */
+ for (size = MAX_CHAIN_BUCKETS; size > req; size--) {
+ bucket = size_to_bucket(size);
+ curr = chain_block_buckets[bucket];
+ if (curr < 0)
+ continue;
+
+ del_chain_block(bucket, size, chain_block_next(curr));
+ add_chain_block(curr + req, size - req);
+ return curr;
+ }
+
+ return -1;
+}
+
+static inline void free_chain_hlocks(int base, int size)
+{
+ add_chain_block(base, max(size, 2));
+}
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
{
- return lock_classes + chain_hlocks[chain->base + i];
+ u16 chain_hlock = chain_hlocks[chain->base + i];
+ unsigned int class_idx = chain_hlock_class_idx(chain_hlock);
+
+ return lock_classes + class_idx;
}
/*
@@ -2024,12 +3587,12 @@ static inline int get_first_held_lock(struct task_struct *curr,
/*
* Returns the next chain_key iteration
*/
-static u64 print_chain_key_iteration(int class_idx, u64 chain_key)
+static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key)
{
- u64 new_chain_key = iterate_chain_key(chain_key, class_idx);
+ u64 new_chain_key = iterate_chain_key(chain_key, hlock_id);
- printk(" class_idx:%d -> chain_key:%016Lx",
- class_idx,
+ printk(" hlock_id:%d -> chain_key:%016Lx",
+ (unsigned int)hlock_id,
(unsigned long long)new_chain_key);
return new_chain_key;
}
@@ -2038,34 +3601,35 @@ static void
print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
{
struct held_lock *hlock;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
int depth = curr->lockdep_depth;
- int i;
+ int i = get_first_held_lock(curr, hlock_next);
- printk("depth: %u\n", depth + 1);
- for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) {
+ printk("depth: %u (irq_context %u)\n", depth - i + 1,
+ hlock_next->irq_context);
+ for (; i < depth; i++) {
hlock = curr->held_locks + i;
- chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
+ chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key);
print_lock(hlock);
}
- print_chain_key_iteration(hlock_next->class_idx, chain_key);
+ print_chain_key_iteration(hlock_id(hlock_next), chain_key);
print_lock(hlock_next);
}
static void print_chain_keys_chain(struct lock_chain *chain)
{
int i;
- u64 chain_key = 0;
- int class_id;
+ u64 chain_key = INITIAL_CHAIN_KEY;
+ u16 hlock_id;
printk("depth: %u\n", chain->depth);
for (i = 0; i < chain->depth; i++) {
- class_id = chain_hlocks[chain->base + i];
- chain_key = print_chain_key_iteration(class_id + 1, chain_key);
+ hlock_id = chain_hlocks[chain->base + i];
+ chain_key = print_chain_key_iteration(hlock_id, chain_key);
- print_lock_name(lock_classes + class_id);
+ print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id));
printk("\n");
}
}
@@ -2074,6 +3638,8 @@ static void print_collision(struct task_struct *curr,
struct held_lock *hlock_next,
struct lock_chain *chain)
{
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("============================\n");
pr_warn("WARNING: chain_key collision\n");
@@ -2090,6 +3656,8 @@ static void print_collision(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
#endif
@@ -2114,7 +3682,7 @@ static int check_no_collision(struct task_struct *curr,
}
for (j = 0; j < chain->depth - 1; j++, i++) {
- id = curr->held_locks[i].class_idx - 1;
+ id = hlock_id(&curr->held_locks[i]);
if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
print_collision(curr, hlock, chain);
@@ -2126,73 +3694,66 @@ static int check_no_collision(struct task_struct *curr,
}
/*
- * Look up a dependency chain. If the key is not present yet then
- * add it and return 1 - in this case the new dependency chain is
- * validated. If the key is already hashed, return 0.
- * (On return with 1 graph_lock is held.)
+ * Given an index that is >= -1, return the index of the next lock chain.
+ * Return -2 if there is no next lock chain.
*/
-static inline int lookup_chain_cache(struct task_struct *curr,
- struct held_lock *hlock,
- u64 chain_key)
+long lockdep_next_lockchain(long i)
+{
+ i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1);
+ return i < ARRAY_SIZE(lock_chains) ? i : -2;
+}
+
+unsigned long lock_chain_count(void)
+{
+ return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains));
+}
+
+/* Must be called with the graph lock held. */
+static struct lock_chain *alloc_lock_chain(void)
+{
+ int idx = find_first_zero_bit(lock_chains_in_use,
+ ARRAY_SIZE(lock_chains));
+
+ if (unlikely(idx >= ARRAY_SIZE(lock_chains)))
+ return NULL;
+ __set_bit(idx, lock_chains_in_use);
+ return lock_chains + idx;
+}
+
+/*
+ * Adds a dependency chain into chain hashtable. And must be called with
+ * graph_lock held.
+ *
+ * Return 0 if fail, and graph_lock is released.
+ * Return 1 if succeed, with graph_lock held.
+ */
+static inline int add_chain_cache(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
{
- struct lock_class *class = hlock_class(hlock);
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
int i, j;
/*
- * We might need to take the graph lock, ensure we've got IRQs
+ * The caller must hold the graph lock, ensure we've got IRQs
* disabled to make this an IRQ-safe lock.. for recursion reasons
* lockdep won't complain about its own locking errors.
*/
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ if (lockdep_assert_locked())
return 0;
- /*
- * We can walk it lock-free, because entries only get added
- * to the hash:
- */
- hlist_for_each_entry_rcu(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
-cache_hit:
- debug_atomic_inc(chain_lookup_hits);
- if (!check_no_collision(curr, hlock, chain))
- return 0;
- if (very_verbose(class))
- printk("\nhash chain already cached, key: "
- "%016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key,
- class->key, class->name);
- return 0;
- }
- }
- if (very_verbose(class))
- printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key, class->key, class->name);
- /*
- * Allocate a new chain entry from the static array, and add
- * it to the hash:
- */
- if (!graph_lock())
- return 0;
- /*
- * We have to walk the chain again locked - to avoid duplicates:
- */
- hlist_for_each_entry(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
- graph_unlock();
- goto cache_hit;
- }
- }
- if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ chain = alloc_lock_chain();
+ if (!chain) {
if (!debug_locks_off_graph_unlock())
return 0;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
- chain = lock_chains + nr_lock_chains++;
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
i = get_first_held_lock(curr, hlock);
@@ -2202,41 +3763,104 @@ cache_hit:
BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks));
BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes));
- if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
- chain->base = nr_chain_hlocks;
- for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class_idx - 1;
- chain_hlocks[chain->base + j] = lock_id;
- }
- chain_hlocks[chain->base + j] = class - lock_classes;
- }
-
- if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
- nr_chain_hlocks += chain->depth;
-
-#ifdef CONFIG_DEBUG_LOCKDEP
- /*
- * Important for check_no_collision().
- */
- if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ j = alloc_chain_hlocks(chain->depth);
+ if (j < 0) {
if (!debug_locks_off_graph_unlock())
return 0;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
-#endif
+ chain->base = j;
+ for (j = 0; j < chain->depth - 1; j++, i++) {
+ int lock_id = hlock_id(curr->held_locks + i);
+
+ chain_hlocks[chain->base + j] = lock_id;
+ }
+ chain_hlocks[chain->base + j] = hlock_id(hlock);
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
- inc_chains();
+ inc_chains(chain->irq_context);
+
+ return 1;
+}
+
+/*
+ * Look up a dependency chain. Must be called with either the graph lock or
+ * the RCU read lock held.
+ */
+static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
+{
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
+ if (READ_ONCE(chain->chain_key) == chain_key) {
+ debug_atomic_inc(chain_lookup_hits);
+ return chain;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * If the key is not present yet in dependency chain cache then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ */
+static inline int lookup_chain_cache_add(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct lock_chain *chain = lookup_chain_cache(chain_key);
+
+ if (chain) {
+cache_hit:
+ if (!check_no_collision(curr, hlock, chain))
+ return 0;
+
+ if (very_verbose(class)) {
+ printk("\nhash chain already cached, key: "
+ "%016Lx tail class: [%px] %s\n",
+ (unsigned long long)chain_key,
+ class->key, class->name);
+ }
+
+ return 0;
+ }
+
+ if (very_verbose(class)) {
+ printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n",
+ (unsigned long long)chain_key, class->key, class->name);
+ }
+
+ if (!graph_lock())
+ return 0;
+
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ chain = lookup_chain_cache(chain_key);
+ if (chain) {
+ graph_unlock();
+ goto cache_hit;
+ }
+
+ if (!add_chain_cache(curr, hlock, chain_key))
+ return 0;
return 1;
}
-static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
- struct held_lock *hlock, int chain_head, u64 chain_key)
+static int validate_chain(struct task_struct *curr,
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
/*
* Trylock needs to maintain the stack of held locks, but it
@@ -2245,11 +3869,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
*
* We look up the chain_key and do the O(N^2) check and update of
* the dependencies only if this is a new dependency chain.
- * (If lookup_chain_cache() returns with 1 it acquires
+ * (If lookup_chain_cache_add() return with 1 it acquires
* graph_lock for us)
*/
if (!hlock->trylock && hlock->check &&
- lookup_chain_cache(curr, hlock, chain_key)) {
+ lookup_chain_cache_add(curr, hlock, chain_key)) {
/*
* Check whether last held lock:
*
@@ -2257,45 +3881,53 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* - is softirq-safe, if this lock is hardirq-unsafe
*
* And check whether the new lock's dependency graph
- * could lead back to the previous lock.
+ * could lead back to the previous lock:
+ *
+ * - within the current held-lock stack
+ * - across our accumulated lock dependency records
*
- * any of these scenarios could lead to a deadlock. If
- * All validations
+ * any of these scenarios could lead to a deadlock.
*/
- int ret = check_deadlock(curr, hlock, lock, hlock->read);
+ /*
+ * The simple case: does the current hold the same lock
+ * already?
+ */
+ int ret = check_deadlock(curr, hlock);
if (!ret)
return 0;
/*
- * Mark recursive read, as we jump over it when
- * building dependencies (just like we jump over
- * trylock entries):
- */
- if (ret == 2)
- hlock->read = 2;
- /*
* Add dependency only if this lock is not the head
- * of the chain, and if it's not a secondary read-lock:
+ * of the chain, and if the new lock introduces no more
+ * lock dependency (because we already hold a lock with the
+ * same lock class) nor deadlock (because the nest_lock
+ * serializes nesting locks), see the comments for
+ * check_deadlock().
*/
- if (!chain_head && ret != 2)
+ if (!chain_head && ret != 2) {
if (!check_prevs_add(curr, hlock))
return 0;
+ }
+
graph_unlock();
- } else
- /* after lookup_chain_cache(): */
+ } else {
+ /* after lookup_chain_cache_add(): */
if (unlikely(!debug_locks))
return 0;
+ }
return 1;
}
#else
static inline int validate_chain(struct task_struct *curr,
- struct lockdep_map *lock, struct held_lock *hlock,
- int chain_head, u64 chain_key)
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
return 1;
}
-#endif
+
+static void init_chain_block_buckets(void) { }
+#endif /* CONFIG_PROVE_LOCKING */
/*
* We are building curr_chain_key incrementally, so double-check
@@ -2306,7 +3938,7 @@ static void check_chain_key(struct task_struct *curr)
#ifdef CONFIG_DEBUG_LOCKDEP
struct held_lock *hlock, *prev_hlock = NULL;
unsigned int i;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
for (i = 0; i < curr->lockdep_depth; i++) {
hlock = curr->held_locks + i;
@@ -2322,16 +3954,18 @@ static void check_chain_key(struct task_struct *curr)
(unsigned long long)hlock->prev_chain_key);
return;
}
+
/*
- * Whoops ran out of static storage again?
+ * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is
+ * it registered lock class index?
*/
- if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use)))
return;
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
- chain_key = 0;
- chain_key = iterate_chain_key(chain_key, hlock->class_idx);
+ chain_key = INITIAL_CHAIN_KEY;
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
prev_hlock = hlock;
}
if (chain_key != curr->curr_chain_key) {
@@ -2348,8 +3982,11 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
-static void
-print_usage_bug_scenario(struct held_lock *lock)
+#ifdef CONFIG_PROVE_LOCKING
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit);
+
+static void print_usage_bug_scenario(struct held_lock *lock)
{
struct lock_class *class = hlock_class(lock);
@@ -2357,21 +3994,23 @@ print_usage_bug_scenario(struct held_lock *lock)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
- if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ if (!debug_locks_off() || debug_locks_silent)
+ return;
+
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("================================\n");
@@ -2384,14 +4023,14 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
curr->comm, task_pid_nr(curr),
- trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
- trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
- trace_hardirqs_enabled(curr),
- trace_softirqs_enabled(curr));
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+ lockdep_hardirqs_enabled(),
+ lockdep_softirqs_enabled(curr));
print_lock(this);
pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+ print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1);
print_irqtrace_events(curr);
pr_warn("\nother info that might help us debug this:\n");
@@ -2402,7 +4041,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
/*
@@ -2412,20 +4051,19 @@ static inline int
valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
- if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
- return print_usage_bug(curr, this, bad_bit, new_bit);
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
+ graph_unlock();
+ print_usage_bug(curr, this, bad_bit, new_bit);
+ return 0;
+ }
return 1;
}
-static int mark_lock(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit);
-
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* print irq inversion bug:
*/
-static int
+static void
print_irq_inversion_bug(struct task_struct *curr,
struct lock_list *root, struct lock_list *other,
struct held_lock *this, int forwards,
@@ -2436,7 +4074,9 @@ print_irq_inversion_bug(struct task_struct *curr,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
+
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("========================================================\n");
@@ -2450,7 +4090,7 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
- print_lock_name(other->class);
+ print_lock_name(NULL, other->class);
pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
pr_warn("\nother info that might help us debug this:\n");
@@ -2476,14 +4116,15 @@ print_irq_inversion_bug(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
- if (!save_trace(&root->trace))
- return 0;
+ root->trace = save_trace();
+ if (!root->trace)
+ goto out;
print_shortest_lock_dependencies(other, root);
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
+out:
+ nbcon_cpu_emergency_exit();
}
/*
@@ -2492,22 +4133,33 @@ print_irq_inversion_bug(struct task_struct *curr,
*/
static int
check_usage_forwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
+
+ bfs_init_root(&root, this);
+ ret = find_usage_forwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
+ return 0;
+ }
+ if (ret == BFS_RNOMATCH)
+ return 1;
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_forwards(&root, bit, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(read_bit));
+ }
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 1, irqclass);
+ return 0;
}
/*
@@ -2516,39 +4168,56 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
*/
static int
check_usage_backwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
+
+ bfs_init_rootb(&root, this);
+ ret = find_usage_backwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
+ return 0;
+ }
+ if (ret == BFS_RNOMATCH)
+ return 1;
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_backwards(&root, bit, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(read_bit));
+ }
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 0, irqclass);
+ return 0;
}
void print_irqtrace_events(struct task_struct *curr)
{
- printk("irq event stamp: %u\n", curr->irq_events);
- printk("hardirqs last enabled at (%u): [<%p>] %pS\n",
- curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
- (void *)curr->hardirq_enable_ip);
- printk("hardirqs last disabled at (%u): [<%p>] %pS\n",
- curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
- (void *)curr->hardirq_disable_ip);
- printk("softirqs last enabled at (%u): [<%p>] %pS\n",
- curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
- (void *)curr->softirq_enable_ip);
- printk("softirqs last disabled at (%u): [<%p>] %pS\n",
- curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
- (void *)curr->softirq_disable_ip);
+ const struct irqtrace_events *trace = &curr->irqtrace;
+
+ nbcon_cpu_emergency_enter();
+
+ printk("irq event stamp: %u\n", trace->irq_events);
+ printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
+ trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip,
+ (void *)trace->hardirq_enable_ip);
+ printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
+ trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip,
+ (void *)trace->hardirq_disable_ip);
+ printk("softirqs last enabled at (%u): [<%px>] %pS\n",
+ trace->softirq_enable_event, (void *)trace->softirq_enable_ip,
+ (void *)trace->softirq_enable_ip);
+ printk("softirqs last disabled at (%u): [<%px>] %pS\n",
+ trace->softirq_disable_event, (void *)trace->softirq_disable_ip,
+ (void *)trace->softirq_disable_ip);
+
+ nbcon_cpu_emergency_exit();
}
static int HARDIRQ_verbose(struct lock_class *class)
@@ -2567,16 +4236,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
return 0;
}
-static int RECLAIM_FS_verbose(struct lock_class *class)
-{
-#if RECLAIM_VERBOSE
- return class_filter(class);
-#endif
- return 0;
-}
-
-#define STRICT_READ_CHECKS 1
-
static int (*state_verbose_f[])(struct lock_class *class) = {
#define LOCKDEP_STATE(__STATE) \
__STATE##_verbose,
@@ -2587,7 +4246,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {
static inline int state_verbose(enum lock_usage_bit bit,
struct lock_class *class)
{
- return state_verbose_f[bit >> 2](class);
+ return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);
}
typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
@@ -2598,18 +4257,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
int excl_bit = exclusive_bit(new_bit);
- int read = new_bit & 1;
- int dir = new_bit & 2;
-
- /*
- * mark USED_IN has to look forwards -- to ensure no dependency
- * has ENABLED state, which would allow recursion deadlocks.
- *
- * mark ENABLED has to look backwards -- to ensure no dependee
- * has USED_IN state, which, again, would allow recursion deadlocks.
- */
- check_usage_f usage = dir ?
- check_usage_backwards : check_usage_forwards;
+ int read = new_bit & LOCK_USAGE_READ_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* Validate that this particular lock does not have conflicting
@@ -2619,23 +4268,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 0;
/*
- * Validate that the lock dependencies don't have conflicting usage
- * states.
+ * Check for read in write conflicts
*/
- if ((!read || !dir || STRICT_READ_CHECKS) &&
- !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
+ if (!read && !valid_state(curr, this, new_bit,
+ excl_bit + LOCK_USAGE_READ_MASK))
return 0;
+
/*
- * Check for read in write conflicts
+ * Validate that the lock dependencies don't have conflicting usage
+ * states.
*/
- if (!read) {
- if (!valid_state(curr, this, new_bit, excl_bit + 1))
+ if (dir) {
+ /*
+ * mark ENABLED has to look backwards -- to ensure no dependee
+ * has USED_IN state, which, again, would allow recursion deadlocks.
+ */
+ if (!check_usage_backwards(curr, this, excl_bit))
return 0;
-
- if (STRICT_READ_CHECKS &&
- !usage(curr, this, excl_bit + 1,
- state_name(new_bit + 1)))
+ } else {
+ /*
+ * mark USED_IN has to look forwards -- to ensure no dependency
+ * has ENABLED state, which would allow recursion deadlocks.
+ */
+ if (!check_usage_forwards(curr, this, excl_bit))
return 0;
}
@@ -2645,35 +4301,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 1;
}
-enum mark_type {
-#define LOCKDEP_STATE(__STATE) __STATE,
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-};
-
/*
* Mark all held locks with a usage bit:
*/
static int
-mark_held_locks(struct task_struct *curr, enum mark_type mark)
+mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit)
{
- enum lock_usage_bit usage_bit;
struct held_lock *hlock;
int i;
for (i = 0; i < curr->lockdep_depth; i++) {
+ enum lock_usage_bit hlock_bit = base_bit;
hlock = curr->held_locks + i;
- usage_bit = 2 + (mark << 2); /* ENABLED */
if (hlock->read)
- usage_bit += 1; /* READ */
+ hlock_bit += LOCK_USAGE_READ_MASK;
- BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+ BUG_ON(hlock_bit >= LOCK_USAGE_STATES);
if (!hlock->check)
continue;
- if (!mark_lock(curr, hlock, usage_bit))
+ if (!mark_lock(curr, hlock, hlock_bit))
return 0;
}
@@ -2683,18 +4332,15 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
/*
* Hardirqs will be enabled:
*/
-static void __trace_hardirqs_on_caller(unsigned long ip)
+static void __trace_hardirqs_on_caller(void)
{
struct task_struct *curr = current;
- /* we'll do an OFF -> ON transition: */
- curr->hardirqs_enabled = 1;
-
/*
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
- if (!mark_held_locks(curr, HARDIRQ))
+ if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -2702,22 +4348,32 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, SOFTIRQ))
- return;
-
- curr->hardirq_enable_ip = ip;
- curr->hardirq_enable_event = ++curr->irq_events;
- debug_atomic_inc(hardirqs_on_events);
+ mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
}
-__visible void trace_hardirqs_on_caller(unsigned long ip)
+/**
+ * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
+ *
+ * Invoked before a possible transition to RCU idle from exit to user or
+ * guest mode. This ensures that all RCU operations are done before RCU
+ * stops watching. After the RCU transition lockdep_hardirqs_on() has to be
+ * invoked to set the final state.
+ */
+void lockdep_hardirqs_on_prepare(void)
{
- time_hardirqs_on(CALLER_ADDR0, ip);
+ if (unlikely(!debug_locks))
+ return;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ /*
+ * NMIs do not (and cannot) track lock dependencies, nothing to do.
+ */
+ if (unlikely(in_nmi()))
+ return;
+
+ if (unlikely(this_cpu_read(lockdep_recursion)))
return;
- if (unlikely(current->hardirqs_enabled)) {
+ if (unlikely(lockdep_hardirqs_enabled())) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -2738,38 +4394,105 @@ __visible void trace_hardirqs_on_caller(unsigned long ip)
/*
* See the fine text that goes along with this variable definition.
*/
- if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
+ if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))
return;
/*
* Can't allow enabling interrupts while in an interrupt handler,
* that's general bad form and such. Recursion, limited stack etc..
*/
- if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()))
return;
- current->lockdep_recursion = 1;
- __trace_hardirqs_on_caller(ip);
- current->lockdep_recursion = 0;
+ current->hardirq_chain_key = current->curr_chain_key;
+
+ lockdep_recursion_inc();
+ __trace_hardirqs_on_caller();
+ lockdep_recursion_finish();
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare);
-void trace_hardirqs_on(void)
+void noinstr lockdep_hardirqs_on(unsigned long ip)
{
- trace_hardirqs_on_caller(CALLER_ADDR0);
+ struct irqtrace_events *trace = &current->irqtrace;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ /*
+ * NMIs can happen in the middle of local_irq_{en,dis}able() where the
+ * tracking state and hardware state are out of sync.
+ *
+ * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from,
+ * and not rely on hardware state like normal interrupts.
+ */
+ if (unlikely(in_nmi())) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
+
+ /*
+ * Skip:
+ * - recursion check, because NMI can hit lockdep;
+ * - hardware state check, because above;
+ * - chain_key check, see lockdep_hardirqs_on_prepare().
+ */
+ goto skip_checks;
+ }
+
+ if (unlikely(this_cpu_read(lockdep_recursion)))
+ return;
+
+ if (lockdep_hardirqs_enabled()) {
+ /*
+ * Neither irq nor preemption are disabled here
+ * so this is racy by nature but losing one hit
+ * in a stat is not a big deal.
+ */
+ __debug_atomic_inc(redundant_hardirqs_on);
+ return;
+ }
+
+ /*
+ * We're enabling irqs and according to our state above irqs weren't
+ * already enabled, yet we find the hardware thinks they are in fact
+ * enabled.. someone messed up their IRQ state tracing.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ /*
+ * Ensure the lock stack remained unchanged between
+ * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on().
+ */
+ DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key !=
+ current->curr_chain_key);
+
+skip_checks:
+ /* we'll do an OFF -> ON transition: */
+ __this_cpu_write(hardirqs_enabled, 1);
+ trace->hardirq_enable_ip = ip;
+ trace->hardirq_enable_event = ++trace->irq_events;
+ debug_atomic_inc(hardirqs_on_events);
}
-EXPORT_SYMBOL(trace_hardirqs_on);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-__visible void trace_hardirqs_off_caller(unsigned long ip)
+void noinstr lockdep_hardirqs_off(unsigned long ip)
{
- struct task_struct *curr = current;
-
- time_hardirqs_off(CALLER_ADDR0, ip);
+ if (unlikely(!debug_locks))
+ return;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ /*
+ * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep;
+ * they will restore the software state. This ensures the software
+ * state is consistent inside NMIs as well.
+ */
+ if (in_nmi()) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
+ } else if (__this_cpu_read(lockdep_recursion))
return;
/*
@@ -2779,33 +4502,30 @@ __visible void trace_hardirqs_off_caller(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->hardirqs_enabled) {
+ if (lockdep_hardirqs_enabled()) {
+ struct irqtrace_events *trace = &current->irqtrace;
+
/*
* We have done an ON -> OFF transition:
*/
- curr->hardirqs_enabled = 0;
- curr->hardirq_disable_ip = ip;
- curr->hardirq_disable_event = ++curr->irq_events;
+ __this_cpu_write(hardirqs_enabled, 0);
+ trace->hardirq_disable_ip = ip;
+ trace->hardirq_disable_event = ++trace->irq_events;
debug_atomic_inc(hardirqs_off_events);
- } else
+ } else {
debug_atomic_inc(redundant_hardirqs_off);
+ }
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
-
-void trace_hardirqs_off(void)
-{
- trace_hardirqs_off_caller(CALLER_ADDR0);
-}
-EXPORT_SYMBOL(trace_hardirqs_off);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
/*
* Softirqs will be enabled:
*/
-void trace_softirqs_on(unsigned long ip)
+void lockdep_softirqs_on(unsigned long ip)
{
- struct task_struct *curr = current;
+ struct irqtrace_events *trace = &current->irqtrace;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -2815,37 +4535,35 @@ void trace_softirqs_on(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->softirqs_enabled) {
+ if (current->softirqs_enabled) {
debug_atomic_inc(redundant_softirqs_on);
return;
}
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
/*
* We'll do an OFF -> ON transition:
*/
- curr->softirqs_enabled = 1;
- curr->softirq_enable_ip = ip;
- curr->softirq_enable_event = ++curr->irq_events;
+ current->softirqs_enabled = 1;
+ trace->softirq_enable_ip = ip;
+ trace->softirq_enable_event = ++trace->irq_events;
debug_atomic_inc(softirqs_on_events);
/*
* We are going to turn softirqs on, so set the
* usage bit for all held locks, if hardirqs are
* enabled too:
*/
- if (curr->hardirqs_enabled)
- mark_held_locks(curr, SOFTIRQ);
- current->lockdep_recursion = 0;
+ if (lockdep_hardirqs_enabled())
+ mark_held_locks(current, LOCK_ENABLED_SOFTIRQ);
+ lockdep_recursion_finish();
}
/*
* Softirqs were disabled:
*/
-void trace_softirqs_off(unsigned long ip)
+void lockdep_softirqs_off(unsigned long ip)
{
- struct task_struct *curr = current;
-
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -2854,13 +4572,15 @@ void trace_softirqs_off(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->softirqs_enabled) {
+ if (current->softirqs_enabled) {
+ struct irqtrace_events *trace = &current->irqtrace;
+
/*
* We have done an ON -> OFF transition:
*/
- curr->softirqs_enabled = 0;
- curr->softirq_disable_ip = ip;
- curr->softirq_disable_event = ++curr->irq_events;
+ current->softirqs_enabled = 0;
+ trace->softirq_disable_ip = ip;
+ trace->softirq_disable_event = ++trace->irq_events;
debug_atomic_inc(softirqs_off_events);
/*
* Whoops, we wanted softirqs off, so why aren't they?
@@ -2870,66 +4590,43 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
+/**
+ * lockdep_cleanup_dead_cpu - Ensure CPU lockdep state is cleanly stopped
+ *
+ * @cpu: index of offlined CPU
+ * @idle: task pointer for offlined CPU's idle thread
+ *
+ * Invoked after the CPU is dead. Ensures that the tracing infrastructure
+ * is left in a suitable state for the CPU to be subsequently brought
+ * online again.
+ */
+void lockdep_cleanup_dead_cpu(unsigned int cpu, struct task_struct *idle)
{
- struct task_struct *curr = current;
-
if (unlikely(!debug_locks))
return;
- gfp_mask = current_gfp_context(gfp_mask);
-
- /* no reclaim without waiting on it */
- if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
- return;
-
- /* this guy won't enter reclaim */
- if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
- return;
-
- /* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
- return;
-
- /*
- * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
- */
- if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
- return;
-
- /* Disable lockdep if explicitly requested */
- if (gfp_mask & __GFP_NOLOCKDEP)
- return;
-
- mark_held_locks(curr, RECLAIM_FS);
+ if (unlikely(per_cpu(hardirqs_enabled, cpu))) {
+ pr_warn("CPU %u left hardirqs enabled!", cpu);
+ if (idle)
+ print_irqtrace_events(idle);
+ /* Clean it up for when the CPU comes online again. */
+ per_cpu(hardirqs_enabled, cpu) = 0;
+ }
}
-static void check_flags(unsigned long flags);
-
-void lockdep_trace_alloc(gfp_t gfp_mask)
+static int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
{
- unsigned long flags;
-
- if (unlikely(current->lockdep_recursion))
- return;
-
- raw_local_irq_save(flags);
- check_flags(flags);
- current->lockdep_recursion = 1;
- __lockdep_trace_alloc(gfp_mask, flags);
- current->lockdep_recursion = 0;
- raw_local_irq_restore(flags);
-}
+ if (!check)
+ goto lock_used;
-static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
-{
/*
* If non-trylock use in a hardirq or softirq context, then
* mark the lock as used in these contexts:
*/
if (!hlock->trylock) {
if (hlock->read) {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock,
LOCK_USED_IN_HARDIRQ_READ))
return 0;
@@ -2938,7 +4635,7 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
LOCK_USED_IN_SOFTIRQ_READ))
return 0;
} else {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
return 0;
if (curr->softirq_context)
@@ -2946,7 +4643,13 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
return 0;
}
}
- if (!hlock->hardirqs_off) {
+
+ /*
+ * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+ * creates no critical section and no extra dependency can be introduced
+ * by interrupts
+ */
+ if (!hlock->hardirqs_off && !hlock->sync) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQ_READ))
@@ -2966,28 +4669,18 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
- /*
- * We reuse the irq context infrastructure more broadly as a general
- * context checking code. This tests GFP_FS recursion (a lock taken
- * during reclaim for a GFP_FS allocation is held over a GFP_FS
- * allocation).
- */
- if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
- if (hlock->read) {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
- return 0;
- } else {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
- return 0;
- }
- }
+lock_used:
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
return 1;
}
static inline unsigned int task_irq_context(struct task_struct *task)
{
- return 2 * !!task->hardirq_context + !!task->softirq_context;
+ return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() +
+ LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
}
static int separate_irq_context(struct task_struct *curr,
@@ -3013,46 +4706,23 @@ static int separate_irq_context(struct task_struct *curr,
return 0;
}
-#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
-static inline
-int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit)
-{
- WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
- return 1;
-}
-
-static inline int mark_irqflags(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 1;
-}
-
-static inline unsigned int task_irq_context(struct task_struct *task)
-{
- return 0;
-}
-
-static inline int separate_irq_context(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 0;
-}
-
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
-}
-
-#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
/*
* Mark a lock with a usage bit, and validate the state transition:
*/
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
- unsigned int new_mask = 1 << new_bit, ret = 1;
+ unsigned int new_mask, ret = 1;
+
+ if (new_bit >= LOCK_USAGE_STATES) {
+ DEBUG_LOCKS_WARN_ON(1);
+ return 0;
+ }
+
+ if (new_bit == LOCK_USED && this->read)
+ new_bit = LOCK_USED_READ;
+
+ new_mask = 1 << new_bit;
/*
* If already set then do not dirty the cacheline,
@@ -3066,63 +4736,210 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Make sure we didn't race:
*/
- if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
- graph_unlock();
- return 1;
- }
+ if (unlikely(hlock_class(this)->usage_mask & new_mask))
+ goto unlock;
+
+ if (!hlock_class(this)->usage_mask)
+ debug_atomic_dec(nr_unused_locks);
hlock_class(this)->usage_mask |= new_mask;
- if (!save_trace(hlock_class(this)->usage_traces + new_bit))
- return 0;
+ if (new_bit < LOCK_TRACE_STATES) {
+ if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
+ return 0;
+ }
- switch (new_bit) {
-#define LOCKDEP_STATE(__STATE) \
- case LOCK_USED_IN_##__STATE: \
- case LOCK_USED_IN_##__STATE##_READ: \
- case LOCK_ENABLED_##__STATE: \
- case LOCK_ENABLED_##__STATE##_READ:
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
+ if (new_bit < LOCK_USED) {
ret = mark_lock_irq(curr, this, new_bit);
if (!ret)
return 0;
- break;
- case LOCK_USED:
- debug_atomic_dec(nr_unused_locks);
- break;
- default:
- if (!debug_locks_off_graph_unlock())
- return 0;
- WARN_ON(1);
- return 0;
}
+unlock:
graph_unlock();
/*
* We must printk outside of the graph_lock:
*/
if (ret == 2) {
+ nbcon_cpu_emergency_enter();
printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
print_lock(this);
print_irqtrace_events(curr);
dump_stack();
+ nbcon_cpu_emergency_exit();
}
return ret;
}
+static inline short task_wait_context(struct task_struct *curr)
+{
+ /*
+ * Set appropriate wait type for the context; for IRQs we have to take
+ * into account force_irqthread as that is implied by PREEMPT_RT.
+ */
+ if (lockdep_hardirq_context()) {
+ /*
+ * Check if force_irqthreads will run us threaded.
+ */
+ if (curr->hardirq_threaded || curr->irq_config)
+ return LD_WAIT_CONFIG;
+
+ return LD_WAIT_SPIN;
+ } else if (curr->softirq_context) {
+ /*
+ * Softirqs are always threaded.
+ */
+ return LD_WAIT_CONFIG;
+ }
+
+ return LD_WAIT_MAX;
+}
+
+static int
+print_lock_invalid_wait_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ short curr_inner;
+
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ nbcon_cpu_emergency_enter();
+
+ pr_warn("\n");
+ pr_warn("=============================\n");
+ pr_warn("[ BUG: Invalid wait context ]\n");
+ print_kernel_ident();
+ pr_warn("-----------------------------\n");
+
+ pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ print_lock(hlock);
+
+ pr_warn("other info that might help us debug this:\n");
+
+ curr_inner = task_wait_context(curr);
+ pr_warn("context-{%d:%d}\n", curr_inner, curr_inner);
+
+ lockdep_print_held_locks(curr);
+
+ pr_warn("stack backtrace:\n");
+ dump_stack();
+
+ nbcon_cpu_emergency_exit();
+
+ return 0;
+}
+
+/*
+ * Verify the wait_type context.
+ *
+ * This check validates we take locks in the right wait-type order; that is it
+ * ensures that we do not take mutexes inside spinlocks and do not attempt to
+ * acquire spinlocks inside raw_spinlocks and the sort.
+ *
+ * The entire thing is slightly more complex because of RCU, RCU is a lock that
+ * can be taken from (pretty much) any context but also has constraints.
+ * However when taken in a stricter environment the RCU lock does not loosen
+ * the constraints.
+ *
+ * Therefore we must look for the strictest environment in the lock stack and
+ * compare that to the lock we're trying to acquire.
+ */
+static int check_wait_context(struct task_struct *curr, struct held_lock *next)
+{
+ u8 next_inner = hlock_class(next)->wait_type_inner;
+ u8 next_outer = hlock_class(next)->wait_type_outer;
+ u8 curr_inner;
+ int depth;
+
+ if (!next_inner || next->trylock)
+ return 0;
+
+ if (!next_outer)
+ next_outer = next_inner;
+
+ /*
+ * Find start of current irq_context..
+ */
+ for (depth = curr->lockdep_depth - 1; depth >= 0; depth--) {
+ struct held_lock *prev = curr->held_locks + depth;
+ if (prev->irq_context != next->irq_context)
+ break;
+ }
+ depth++;
+
+ curr_inner = task_wait_context(curr);
+
+ for (; depth < curr->lockdep_depth; depth++) {
+ struct held_lock *prev = curr->held_locks + depth;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;
+
+ if (prev_inner) {
+ /*
+ * We can have a bigger inner than a previous one
+ * when outer is smaller than inner, as with RCU.
+ *
+ * Also due to trylocks.
+ */
+ curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations -- this is typically
+ * only valid/needed for code that only exists when
+ * CONFIG_PREEMPT_RT=n.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ curr_inner = prev_inner;
+ }
+ }
+
+ if (next_outer > curr_inner)
+ return print_lock_invalid_wait_context(curr, next);
+
+ return 0;
+}
+
+#else /* CONFIG_PROVE_LOCKING */
+
+static inline int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
+{
+ return 1;
+}
+
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+static inline int check_wait_context(struct task_struct *curr,
+ struct held_lock *next)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PROVE_LOCKING */
+
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key, int subclass)
+void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass,
+ u8 inner, u8 outer, u8 lock_type)
{
int i;
- kmemcheck_mark_initialized(lock, sizeof(*lock));
-
for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
lock->class_cache[i] = NULL;
@@ -3140,19 +4957,22 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
lock->name = name;
+ lock->wait_type_outer = outer;
+ lock->wait_type_inner = inner;
+ lock->lock_type = lock_type;
+
/*
* No key, no joy, we need to hash something.
*/
if (DEBUG_LOCKS_WARN_ON(!key))
return;
/*
- * Sanity check, the lock-class key must be persistent:
+ * Sanity check, the lock-class key must either have been allocated
+ * statically or must have been registered as a dynamic key.
*/
- if (!static_obj(key)) {
- printk("BUG: key %p not in .data!\n", key);
- /*
- * What it says above ^^^^^, I suggest you read it.
- */
+ if (!static_obj(key) && !is_dynamic_key(key)) {
+ if (debug_locks)
+ printk(KERN_ERR "BUG: key %px has not been registered!\n", key);
DEBUG_LOCKS_WARN_ON(1);
return;
}
@@ -3164,30 +4984,61 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
if (subclass) {
unsigned long flags;
- if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
register_lock_class(lock, subclass, 1);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
}
-EXPORT_SYMBOL_GPL(lockdep_init_map);
+EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
-static int
+struct lock_class_key __lockdep_no_track__;
+EXPORT_SYMBOL_GPL(__lockdep_no_track__);
+
+#ifdef CONFIG_PROVE_LOCKING
+void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn,
+ lock_print_fn print_fn)
+{
+ struct lock_class *class = lock->class_cache[0];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ lockdep_recursion_inc();
+
+ if (!class)
+ class = register_lock_class(lock, 0, 0);
+
+ if (class) {
+ WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn);
+ WARN_ON(class->print_fn && class->print_fn != print_fn);
+
+ class->cmp_fn = cmp_fn;
+ class->print_fn = print_fn;
+ }
+
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn);
+#endif
+
+static void
print_lock_nested_lock_not_held(struct task_struct *curr,
- struct held_lock *hlock,
- unsigned long ip)
+ struct held_lock *hlock)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
+
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("==================================\n");
@@ -3210,19 +5061,23 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
-static int __lock_is_held(struct lockdep_map *lock, int read);
+static int __lock_is_held(const struct lockdep_map *lock, int read);
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
* We maintain the dependency maps and validate the locking attempt:
+ *
+ * The callers must make sure that IRQs are disabled before calling it,
+ * otherwise we could get an interrupt which would want to take locks,
+ * which would end up in lockdep again.
*/
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references, int pin_count)
+ int references, int pin_count, int sync)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -3235,16 +5090,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (unlikely(!debug_locks))
return 0;
- /*
- * Lockdep should run with IRQs disabled, otherwise we could
- * get an interrupt which would want to take locks, which would
- * end up in lockdep and have you got a head-ache already?
- */
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ if (unlikely(lock->key == &__lockdep_no_track__))
return 0;
- if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ lockevent_inc(lockdep_acquire);
+
+ if (!prove_locking || lock->key == &__lockdep_no_validate__) {
check = 0;
+ lockevent_inc(lockdep_nocheck);
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES))
+ return 0;
if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass];
@@ -3256,13 +5113,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!class)
return 0;
}
- atomic_inc((atomic_t *)&class->ops);
+
+ debug_class_ops_inc(class);
+
if (very_verbose(class)) {
- printk("\nacquire class [%p] %s", class->key, class->name);
+ nbcon_cpu_emergency_enter();
+ printk("\nacquire class [%px] %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
}
/*
@@ -3277,24 +5138,25 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
return 0;
- class_idx = class - lock_classes + 1;
+ class_idx = class - lock_classes;
- if (depth) {
+ if (depth && !sync) {
+ /* we're holding locks and the new held lock is not a sync */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
- if (hlock->references) {
- /*
- * Check: unsigned int references:12, overflow.
- */
- if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1))
- return 0;
+ if (!references)
+ references++;
+ if (!hlock->references)
hlock->references++;
- } else {
- hlock->references = 2;
- }
- return 1;
+ hlock->references += references;
+
+ /* Overflow */
+ if (DEBUG_LOCKS_WARN_ON(hlock->references < references))
+ return 0;
+
+ return 2;
}
}
@@ -3313,6 +5175,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
+ hlock->sync = !!sync;
hlock->hardirqs_off = !!hardirqs_off;
hlock->references = references;
#ifdef CONFIG_LOCK_STAT
@@ -3321,11 +5184,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
hlock->pin_count = pin_count;
- if (check && !mark_irqflags(curr, hlock))
+ if (check_wait_context(curr, hlock))
return 0;
- /* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED))
+ /* Initialize the lock usage bit */
+ if (!mark_usage(curr, hlock, check))
return 0;
/*
@@ -3339,9 +5202,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* the hash, not class->key.
*/
/*
- * Whoops, we did it again.. ran straight out of our static allocation.
+ * Whoops, we did it again.. class_idx is invalid.
*/
- if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use)))
return 0;
chain_key = curr->curr_chain_key;
@@ -3349,24 +5212,35 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
/*
* How can we have a chain hash when we ain't got no keys?!
*/
- if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY))
return 0;
chain_head = 1;
}
hlock->prev_chain_key = chain_key;
if (separate_irq_context(curr, hlock)) {
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
chain_head = 1;
}
- chain_key = iterate_chain_key(chain_key, class_idx);
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
- if (nest_lock && !__lock_is_held(nest_lock, -1))
- return print_lock_nested_lock_not_held(curr, hlock, ip);
+ if (nest_lock && !__lock_is_held(nest_lock, -1)) {
+ print_lock_nested_lock_not_held(curr, hlock);
+ return 0;
+ }
- if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+ if (!debug_locks_silent) {
+ WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key);
+ WARN_ON_ONCE(!hlock_class(hlock)->key);
+ }
+
+ if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
+ /* For lock_sync(), we are done here since no actual critical section */
+ if (hlock->sync)
+ return 1;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -3376,6 +5250,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
printk(KERN_DEBUG "depth: %i max: %lu!\n",
curr->lockdep_depth, MAX_LOCK_DEPTH);
@@ -3383,6 +5258,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_print_held_locks(current);
debug_show_all_locks();
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
@@ -3393,14 +5269,16 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
return 1;
}
-static int
-print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_unlock_imbalance_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
+
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("=====================================\n");
@@ -3411,7 +5289,7 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
pr_cont(") at:\n");
- print_ip_sym(ip);
+ print_ip_sym(KERN_WARNING, ip);
pr_warn("but there are no more locks to release!\n");
pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
@@ -3419,16 +5297,17 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
-static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+static noinstr int match_held_lock(const struct held_lock *hlock,
+ const struct lockdep_map *lock)
{
if (hlock->instance == lock)
return 1;
if (hlock->references) {
- struct lock_class *class = lock->class_cache[0];
+ const struct lock_class *class = lock->class_cache[0];
if (!class)
class = look_up_lock_class(lock, 0);
@@ -3439,7 +5318,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
* Clearly if the lock hasn't been acquired _ever_, we're not
* holding it either, so report failure.
*/
- if (IS_ERR_OR_NULL(class))
+ if (!class)
return 0;
/*
@@ -3450,7 +5329,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
return 0;
- if (hlock->class_idx == class - lock_classes + 1)
+ if (hlock->class_idx == class - lock_classes)
return 1;
}
@@ -3494,19 +5373,33 @@ out:
}
static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
- int idx)
+ int idx, unsigned int *merged)
{
struct held_lock *hlock;
+ int first_idx = idx;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
- if (!__lock_acquire(hlock->instance,
+ switch (__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass,
hlock->trylock,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
+ hlock->references, hlock->pin_count, 0)) {
+ case 0:
return 1;
+ case 1:
+ break;
+ case 2:
+ *merged += (idx == first_idx);
+ break;
+ default:
+ WARN_ON(1);
+ return 0;
+ }
}
return 0;
}
@@ -3517,11 +5410,14 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 0;
struct held_lock *hlock;
struct lock_class *class;
- unsigned int depth;
int i;
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* This function is about (re)setting the class of a held lock,
@@ -3531,24 +5427,29 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
- lockdep_init_map(lock, name, key, 0);
+ lockdep_init_map_type(lock, name, key, 0,
+ lock->wait_type_inner,
+ lock->wait_type_outer,
+ lock->lock_type);
class = register_lock_class(lock, subclass, 0);
- hlock->class_idx = class - lock_classes + 1;
+ hlock->class_idx = class - lock_classes;
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- if (reacquire_held_locks(curr, depth, i))
+ if (reacquire_held_locks(curr, depth, i, &merged))
return 0;
/*
* I took it apart and put it back together again, except now I have
* these 'spare' parts.. where shall I put them.
*/
- if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged))
return 0;
return 1;
}
@@ -3556,10 +5457,13 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 0;
struct held_lock *hlock;
- unsigned int depth;
int i;
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* This function is about (re)setting the class of a held lock,
@@ -3569,8 +5473,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
return 0;
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
@@ -3579,7 +5485,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
hlock->read = 1;
hlock->acquire_ip = ip;
- if (reacquire_held_locks(curr, depth, i))
+ if (reacquire_held_locks(curr, depth, i, &merged))
+ return 0;
+
+ /* Merging can't happen with unchanged classes.. */
+ if (DEBUG_LOCKS_WARN_ON(merged))
return 0;
/*
@@ -3588,22 +5498,21 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
*/
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
return 0;
+
return 1;
}
/*
- * Remove the lock to the list of currently held locks - this gets
+ * Remove the lock from the list of currently held locks - this gets
* called on mutex_unlock()/spin_unlock*() (or on a failed
* mutex_lock_interruptible()).
- *
- * @nested is an hysterical artifact, needs a tree wide cleanup.
*/
static int
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+__lock_release(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 1;
struct held_lock *hlock;
- unsigned int depth;
int i;
if (unlikely(!debug_locks))
@@ -3614,16 +5523,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
* So we're all set to release this lock.. wait what lock? We don't
* own any locks, you've been drinking again?
*/
- if (DEBUG_LOCKS_WARN_ON(depth <= 0))
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (depth <= 0) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
/*
* Check whether the lock exists in the current stack
* of held locks:
*/
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
if (hlock->instance == lock)
lock_release_holdtime(hlock);
@@ -3651,20 +5564,33 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- if (reacquire_held_locks(curr, depth, i + 1))
+ /*
+ * The most likely case is when the unlock is on the innermost
+ * lock. In this case, we are done!
+ */
+ if (i == depth-1)
+ return 1;
+
+ if (reacquire_held_locks(curr, depth, i + 1, &merged))
return 0;
/*
* We had N bottles of beer on the wall, we drank one, but now
* there's not N-1 bottles of beer left on the wall...
+ * Pouring two of the bottles together is acceptable.
*/
- if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
- return 0;
+ DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged);
- return 1;
+ /*
+ * Since reacquire_held_locks() would have called check_chain_key()
+ * indirectly via __lock_acquire(), we don't need to do it again
+ * on return.
+ */
+ return 0;
}
-static int __lock_is_held(struct lockdep_map *lock, int read)
+static __always_inline
+int __lock_is_held(const struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
int i;
@@ -3673,14 +5599,14 @@ static int __lock_is_held(struct lockdep_map *lock, int read)
struct held_lock *hlock = curr->held_locks + i;
if (match_held_lock(hlock, lock)) {
- if (read == -1 || hlock->read == read)
- return 1;
+ if (read == -1 || !!hlock->read == read)
+ return LOCK_STATE_HELD;
- return 0;
+ return LOCK_STATE_NOT_HELD;
}
}
- return 0;
+ return LOCK_STATE_NOT_HELD;
}
static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
@@ -3701,7 +5627,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
* be guessable and still allows some pin nesting in
* our u32 pin_count.
*/
- cookie.val = 1 + (prandom_u32() >> 16);
+ cookie.val = 1 + (sched_clock() & 0xffff);
hlock->pin_count += cookie.val;
return cookie;
}
@@ -3761,23 +5687,26 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie
/*
* Check whether we follow the irq-flags state precisely:
*/
-static void check_flags(unsigned long flags)
+static noinstr void check_flags(unsigned long flags)
{
-#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
- defined(CONFIG_TRACE_IRQFLAGS)
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP)
if (!debug_locks)
return;
+ /* Get the warning out.. */
+ instrumentation_begin();
+
if (irqs_disabled_flags(flags)) {
- if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-off.\n");
}
} else {
- if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-on.\n");
}
}
+#ifndef CONFIG_PREEMPT_RT
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@@ -3792,9 +5721,12 @@ static void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
+#endif
if (!debug_locks)
print_irqtrace_events(current);
+
+ instrumentation_end();
#endif
}
@@ -3804,15 +5736,15 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
check_flags(flags);
if (__lock_set_class(lock, name, key, subclass, ip))
check_chain_key(current);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_set_class);
@@ -3821,19 +5753,71 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
check_flags(flags);
if (__lock_downgrade(lock, ip))
check_chain_key(current);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_downgrade);
+/* NMI context !!! */
+static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock, int subclass)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_class *class = look_up_lock_class(lock, subclass);
+ unsigned long mask = LOCKF_USED;
+
+ /* if it doesn't have a class (yet), it certainly hasn't been used yet */
+ if (!class)
+ return;
+
+ /*
+ * READ locks only conflict with USED, such that if we only ever use
+ * READ locks, there is no deadlock possible -- RCU.
+ */
+ if (!hlock->read)
+ mask |= LOCKF_USED_READ;
+
+ if (!(class->usage_mask & mask))
+ return;
+
+ hlock->class_idx = class - lock_classes;
+
+ print_usage_bug(current, hlock, LOCK_USED, LOCK_USAGE_STATES);
+#endif
+}
+
+static bool lockdep_nmi(void)
+{
+ if (raw_cpu_read(lockdep_recursion))
+ return false;
+
+ if (!in_nmi())
+ return false;
+
+ return true;
+}
+
+/*
+ * read_lock() is recursive if:
+ * 1. We force lockdep think this way in selftests or
+ * 2. The implementation is not queued read/write lock or
+ * 3. The locker is at an in_interrupt() context.
+ */
+bool read_lock_is_recursive(void)
+{
+ return force_read_lock_recursive ||
+ !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) ||
+ in_interrupt();
+}
+EXPORT_SYMBOL_GPL(read_lock_is_recursive);
+
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
@@ -3844,74 +5828,138 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+
+ if (!debug_locks)
return;
+ /*
+ * As KASAN instrumentation is disabled and lock_acquire() is usually
+ * the first lockdep call when a task tries to acquire a lock, add
+ * kasan_check_byte() here to check for use-after-free and other
+ * memory errors.
+ */
+ kasan_check_byte(lock);
+
+ if (unlikely(!lockdep_enabled())) {
+ /* XXX allow trylock from NMI ?!? */
+ if (lockdep_nmi() && !trylock) {
+ struct held_lock hlock;
+
+ hlock.acquire_ip = ip;
+ hlock.instance = lock;
+ hlock.nest_lock = nest_lock;
+ hlock.irq_context = 2; // XXX
+ hlock.trylock = trylock;
+ hlock.read = read;
+ hlock.check = check;
+ hlock.hardirqs_off = true;
+ hlock.references = 0;
+
+ verify_lock_unused(lock, &hlock, subclass);
+ }
+ return;
+ }
+
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
- trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+ lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
- current->lockdep_recursion = 0;
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_acquire);
-void lock_release(struct lockdep_map *lock, int nested,
- unsigned long ip)
+void lock_release(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ trace_lock_release(lock, ip);
+
+ if (unlikely(!lockdep_enabled() ||
+ lock->key == &__lockdep_no_track__))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
- trace_lock_release(lock, ip);
- if (__lock_release(lock, nested, ip))
+
+ lockdep_recursion_inc();
+ if (__lock_release(lock, ip))
check_chain_key(current);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_release);
-int lock_is_held_type(struct lockdep_map *lock, int read)
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is potential deadlock.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+ int check, struct lockdep_map *nest_lock, unsigned long ip)
{
unsigned long flags;
- int ret = 0;
- if (unlikely(current->lockdep_recursion))
- return 1; /* avoid false negative lockdep_assert_held() */
+ if (unlikely(!lockdep_enabled()))
+ return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
+ __lock_acquire(lock, subclass, 0, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
+noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
+{
+ unsigned long flags;
+ int ret = LOCK_STATE_NOT_HELD;
+
+ /*
+ * Avoid false negative lockdep_assert_held() and
+ * lockdep_assert_not_held().
+ */
+ if (unlikely(!lockdep_enabled()))
+ return LOCK_STATE_UNKNOWN;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
ret = __lock_is_held(lock, read);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
return ret;
}
EXPORT_SYMBOL_GPL(lock_is_held_type);
+NOKPROBE_SYMBOL(lock_is_held_type);
struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
{
struct pin_cookie cookie = NIL_COOKIE;
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return cookie;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
cookie = __lock_pin_lock(lock);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
return cookie;
@@ -3922,15 +5970,15 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
__lock_repin_lock(lock, cookie);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_repin_lock);
@@ -3939,40 +5987,30 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
__lock_unpin_lock(lock, cookie);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_unpin_lock);
-void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
-{
- current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
-}
-EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
-
-void lockdep_clear_current_reclaim_state(void)
-{
- current->lockdep_reclaim_gfp = 0;
-}
-EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
-
#ifdef CONFIG_LOCK_STAT
-static int
-print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_lock_contention_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
+
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("=================================\n");
@@ -3983,7 +6021,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
pr_cont(") at:\n");
- print_ip_sym(ip);
+ print_ip_sym(KERN_WARNING, ip);
pr_warn("but there are no locks held!\n");
pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
@@ -3991,7 +6029,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
static void
@@ -4011,6 +6049,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return;
+
hlock = find_held_lock(curr, lock, depth, &i);
if (!hlock) {
print_lock_contention_bug(curr, lock, ip);
@@ -4033,7 +6074,6 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
stats->contending_point[contending_point]++;
if (lock->cpu != smp_processor_id())
stats->bounces[bounce_contended + !!hlock->read]++;
- put_lock_stats(stats);
}
static void
@@ -4054,6 +6094,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return;
+
hlock = find_held_lock(curr, lock, depth, &i);
if (!hlock) {
print_lock_contention_bug(curr, lock, _RET_IP_);
@@ -4070,8 +6113,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
hlock->holdtime_stamp = now;
}
- trace_lock_acquired(lock, ip);
-
stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
if (hlock->read)
@@ -4081,7 +6122,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
}
if (lock->cpu != cpu)
stats->bounces[bounce_acquired + !!hlock->read]++;
- put_lock_stats(stats);
lock->cpu = cpu;
lock->ip = ip;
@@ -4091,18 +6131,16 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(!lock_stat))
- return;
+ trace_lock_contended(lock, ip);
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
- trace_lock_contended(lock, ip);
+ lockdep_recursion_inc();
__lock_contended(lock, ip);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_contended);
@@ -4111,17 +6149,16 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(!lock_stat))
- return;
+ trace_lock_acquired(lock, ip);
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
__lock_acquired(lock, ip);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_acquired);
@@ -4138,9 +6175,7 @@ void lockdep_reset(void)
int i;
raw_local_irq_save(flags);
- current->curr_chain_key = 0;
- current->lockdep_depth = 0;
- current->lockdep_recursion = 0;
+ lockdep_init_task(current);
memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
nr_hardirq_chains = 0;
nr_softirq_chains = 0;
@@ -4151,26 +6186,111 @@ void lockdep_reset(void)
raw_local_irq_restore(flags);
}
-static void zap_class(struct lock_class *class)
+/* Remove a class from a lock chain. Must be called with the graph lock held. */
+static void remove_class_from_lock_chain(struct pending_free *pf,
+ struct lock_chain *chain,
+ struct lock_class *class)
{
+#ifdef CONFIG_PROVE_LOCKING
int i;
+ for (i = chain->base; i < chain->base + chain->depth; i++) {
+ if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes)
+ continue;
+ /*
+ * Each lock class occurs at most once in a lock chain so once
+ * we found a match we can break out of this loop.
+ */
+ goto free_lock_chain;
+ }
+ /* Since the chain has not been modified, return. */
+ return;
+
+free_lock_chain:
+ free_chain_hlocks(chain->base, chain->depth);
+ /* Overwrite the chain key for concurrent RCU readers. */
+ WRITE_ONCE(chain->chain_key, INITIAL_CHAIN_KEY);
+ dec_chains(chain->irq_context);
+
/*
- * Remove all dependencies this lock is
- * involved in:
+ * Note: calling hlist_del_rcu() from inside a
+ * hlist_for_each_entry_rcu() loop is safe.
*/
- for (i = 0; i < nr_list_entries; i++) {
- if (list_entries[i].class == class)
- list_del_rcu(&list_entries[i].entry);
+ hlist_del_rcu(&chain->entry);
+ __set_bit(chain - lock_chains, pf->lock_chains_being_freed);
+ nr_zapped_lock_chains++;
+#endif
+}
+
+/* Must be called with the graph lock held. */
+static void remove_class_from_lock_chains(struct pending_free *pf,
+ struct lock_class *class)
+{
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ remove_class_from_lock_chain(pf, chain, class);
+ }
}
+}
+
+/*
+ * Remove all references to a lock class. The caller must hold the graph lock.
+ */
+static void zap_class(struct pending_free *pf, struct lock_class *class)
+{
+ struct lock_list *entry;
+ int i;
+
+ WARN_ON_ONCE(!class->key);
+
/*
- * Unhash the class and remove it from the all_lock_classes list:
+ * Remove all dependencies this lock is
+ * involved in:
*/
- hlist_del_rcu(&class->hash_entry);
- list_del_rcu(&class->lock_entry);
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ entry = list_entries + i;
+ if (entry->class != class && entry->links_to != class)
+ continue;
+ __clear_bit(i, list_entries_in_use);
+ nr_list_entries--;
+ list_del_rcu(&entry->entry);
+ }
+ if (list_empty(&class->locks_after) &&
+ list_empty(&class->locks_before)) {
+ list_move_tail(&class->lock_entry, &pf->zapped);
+ hlist_del_rcu(&class->hash_entry);
+ WRITE_ONCE(class->key, NULL);
+ WRITE_ONCE(class->name, NULL);
+ /* Class allocated but not used, -1 in nr_unused_locks */
+ if (class->usage_mask == 0)
+ debug_atomic_dec(nr_unused_locks);
+ nr_lock_classes--;
+ __clear_bit(class - lock_classes, lock_classes_in_use);
+ if (class - lock_classes == max_lock_class_idx)
+ max_lock_class_idx--;
+ } else {
+ WARN_ONCE(true, "%s() failed for class %s\n", __func__,
+ class->name);
+ }
- RCU_INIT_POINTER(class->key, NULL);
- RCU_INIT_POINTER(class->name, NULL);
+ remove_class_from_lock_chains(pf, class);
+ nr_zapped_classes++;
+}
+
+static void reinit_class(struct lock_class *class)
+{
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ memset_startat(class, 0, key);
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
}
static inline int within(const void *addr, void *start, unsigned long size)
@@ -4178,66 +6298,206 @@ static inline int within(const void *addr, void *start, unsigned long size)
return addr >= start && addr < start + size;
}
+static bool inside_selftest(void)
+{
+ return current == lockdep_selftest_task_struct;
+}
+
+/* The caller must hold the graph lock. */
+static struct pending_free *get_pending_free(void)
+{
+ return delayed_free.pf + delayed_free.index;
+}
+
+static void free_zapped_rcu(struct rcu_head *cb);
+
/*
- * Used in module.c to remove lock classes from memory that is going to be
- * freed; and possibly re-used by other modules.
- *
- * We will have had one sync_sched() before getting here, so we're guaranteed
- * nobody will look up these exact classes -- they're properly dead but still
- * allocated.
- */
-void lockdep_free_key_range(void *start, unsigned long size)
+* See if we need to queue an RCU callback, must called with
+* the lockdep lock held, returns false if either we don't have
+* any pending free or the callback is already scheduled.
+* Otherwise, a call_rcu() must follow this function call.
+*/
+static bool prepare_call_rcu_zapped(struct pending_free *pf)
+{
+ WARN_ON_ONCE(inside_selftest());
+
+ if (list_empty(&pf->zapped))
+ return false;
+
+ if (delayed_free.scheduled)
+ return false;
+
+ delayed_free.scheduled = true;
+
+ WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf);
+ delayed_free.index ^= 1;
+
+ return true;
+}
+
+/* The caller must hold the graph lock. May be called from RCU context. */
+static void __free_zapped_classes(struct pending_free *pf)
{
struct lock_class *class;
- struct hlist_head *head;
+
+ check_data_structures();
+
+ list_for_each_entry(class, &pf->zapped, lock_entry)
+ reinit_class(class);
+
+ list_splice_init(&pf->zapped, &free_lock_classes);
+
+#ifdef CONFIG_PROVE_LOCKING
+ bitmap_andnot(lock_chains_in_use, lock_chains_in_use,
+ pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains));
+ bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains));
+#endif
+}
+
+static void free_zapped_rcu(struct rcu_head *ch)
+{
+ struct pending_free *pf;
unsigned long flags;
- int i;
- int locked;
+ bool need_callback;
+
+ if (WARN_ON_ONCE(ch != &delayed_free.rcu_head))
+ return;
raw_local_irq_save(flags);
- locked = graph_lock();
+ lockdep_lock();
+
+ /* closed head */
+ pf = delayed_free.pf + (delayed_free.index ^ 1);
+ __free_zapped_classes(pf);
+ delayed_free.scheduled = false;
+ need_callback =
+ prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
/*
- * Unhash all classes that were created by this module:
- */
+ * If there's pending free and its callback has not been scheduled,
+ * queue an RCU callback.
+ */
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+
+}
+
+/*
+ * Remove all lock classes from the class hash table and from the
+ * all_lock_classes list whose key or name is in the address range [start,
+ * start + size). Move these lock classes to the zapped_classes list. Must
+ * be called with the graph lock held.
+ */
+static void __lockdep_free_key_range(struct pending_free *pf, void *start,
+ unsigned long size)
+{
+ struct lock_class *class;
+ struct hlist_head *head;
+ int i;
+
+ /* Unhash all classes that were created by a module. */
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
hlist_for_each_entry_rcu(class, head, hash_entry) {
- if (within(class->key, start, size))
- zap_class(class);
- else if (within(class->name, start, size))
- zap_class(class);
+ if (!within(class->key, start, size) &&
+ !within(class->name, start, size))
+ continue;
+ zap_class(pf, class);
}
}
+}
- if (locked)
- graph_unlock();
- raw_local_irq_restore(flags);
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one synchronize_rcu() before getting here, so we're
+ * guaranteed nobody will look up these exact classes -- they're properly dead
+ * but still allocated.
+ */
+static void lockdep_free_key_range_reg(void *start, unsigned long size)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+ bool need_callback;
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ lockdep_lock();
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, start, size);
+ need_callback = prepare_call_rcu_zapped(pf);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
/*
* Wait for any possible iterators from look_up_lock_class() to pass
* before continuing to free the memory they refer to.
- *
- * sync_sched() is sufficient because the read-side is IRQ disable.
*/
- synchronize_sched();
+ synchronize_rcu();
+}
- /*
- * XXX at this point we could return the resources to the pool;
- * instead we leak them. We would need to change to bitmap allocators
- * instead of the linear allocators we have now.
- */
+/*
+ * Free all lockdep keys in the range [start, start+size). Does not sleep.
+ * Ignores debug_locks. Must only be used by the lockdep selftests.
+ */
+static void lockdep_free_key_range_imm(void *start, unsigned long size)
+{
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
+
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ lockdep_lock();
+ __lockdep_free_key_range(pf, start, size);
+ __free_zapped_classes(pf);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
}
-void lockdep_reset_lock(struct lockdep_map *lock)
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ init_data_structures_once();
+
+ if (inside_selftest())
+ lockdep_free_key_range_imm(start, size);
+ else
+ lockdep_free_key_range_reg(start, size);
+}
+
+/*
+ * Check whether any element of the @lock->class_cache[] array refers to a
+ * registered lock class. The caller must hold either the graph lock or the
+ * RCU read lock.
+ */
+static bool lock_class_cache_is_registered(struct lockdep_map *lock)
{
struct lock_class *class;
struct hlist_head *head;
- unsigned long flags;
int i, j;
- int locked;
- raw_local_irq_save(flags);
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
+ for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+ if (lock->class_cache[j] == class)
+ return true;
+ }
+ }
+ return false;
+}
+
+/* The caller must hold the graph lock. Does not sleep. */
+static void __lockdep_reset_lock(struct pending_free *pf,
+ struct lockdep_map *lock)
+{
+ struct lock_class *class;
+ int j;
/*
* Remove all classes this lock might have:
@@ -4247,66 +6507,165 @@ void lockdep_reset_lock(struct lockdep_map *lock)
* If the class exists we look it up and zap it:
*/
class = look_up_lock_class(lock, j);
- if (!IS_ERR_OR_NULL(class))
- zap_class(class);
+ if (class)
+ zap_class(pf, class);
}
/*
* Debug check: in the end all mapped classes should
* be gone.
*/
+ if (WARN_ON_ONCE(lock_class_cache_is_registered(lock)))
+ debug_locks_off();
+}
+
+/*
+ * Remove all information lockdep has about a lock if debug_locks == 1. Free
+ * released data structures from RCU context.
+ */
+static void lockdep_reset_lock_reg(struct lockdep_map *lock)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+ int locked;
+ bool need_callback = false;
+
+ raw_local_irq_save(flags);
locked = graph_lock();
- for (i = 0; i < CLASSHASH_SIZE; i++) {
- head = classhash_table + i;
- hlist_for_each_entry_rcu(class, head, hash_entry) {
- int match = 0;
+ if (!locked)
+ goto out_irq;
- for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
- match |= class == lock->class_cache[j];
-
- if (unlikely(match)) {
- if (debug_locks_off_graph_unlock()) {
- /*
- * We all just reset everything, how did it match?
- */
- WARN_ON(1);
- }
- goto out_restore;
- }
- }
- }
- if (locked)
- graph_unlock();
+ pf = get_pending_free();
+ __lockdep_reset_lock(pf, lock);
+ need_callback = prepare_call_rcu_zapped(pf);
+
+ graph_unlock();
+out_irq:
+ raw_local_irq_restore(flags);
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+}
-out_restore:
+/*
+ * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the
+ * lockdep selftests.
+ */
+static void lockdep_reset_lock_imm(struct lockdep_map *lock)
+{
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ lockdep_lock();
+ __lockdep_reset_lock(pf, lock);
+ __free_zapped_classes(pf);
+ lockdep_unlock();
raw_local_irq_restore(flags);
}
-void __init lockdep_info(void)
+void lockdep_reset_lock(struct lockdep_map *lock)
{
- printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+ init_data_structures_once();
+
+ if (inside_selftest())
+ lockdep_reset_lock_imm(lock);
+ else
+ lockdep_reset_lock_reg(lock);
+}
- printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
- printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
- printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
- printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
- printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
- printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
- printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+/*
+ * Unregister a dynamically allocated key.
+ *
+ * Unlike lockdep_register_key(), a search is always done to find a matching
+ * key irrespective of debug_locks to avoid potential invalid access to freed
+ * memory in lock_class entry.
+ */
+void lockdep_unregister_key(struct lock_class_key *key)
+{
+ struct hlist_head *hash_head = keyhashentry(key);
+ struct lock_class_key *k;
+ struct pending_free *pf;
+ unsigned long flags;
+ bool found = false;
+ bool need_callback = false;
+
+ might_sleep();
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+
+ raw_local_irq_save(flags);
+ lockdep_lock();
+
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ hlist_del_rcu(&k->hash_entry);
+ found = true;
+ break;
+ }
+ }
+ WARN_ON_ONCE(!found && debug_locks);
+ if (found) {
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, key, 1);
+ need_callback = prepare_call_rcu_zapped(pf);
+ nr_dynamic_keys--;
+ }
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
- printk(" memory used by lock dependency info: %lu kB\n",
- (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
- sizeof(struct list_head) * CLASSHASH_SIZE +
- sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
- sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
- sizeof(struct list_head) * CHAINHASH_SIZE
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+
+ /*
+ * Wait until is_dynamic_key() has finished accessing k->hash_entry.
+ *
+ * Some operations like __qdisc_destroy() will call this in a debug
+ * kernel, and the network traffic is disabled while waiting, hence
+ * the delay of the wait matters in debugging cases. Currently use a
+ * synchronize_rcu_expedited() to speed up the wait at the cost of
+ * system IPIs. TODO: Replace RCU with hazptr for this.
+ */
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(lockdep_unregister_key);
+
+void __init lockdep_init(void)
+{
+ pr_info("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+
+ pr_info("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ pr_info("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
+ pr_info("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
+ pr_info("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ pr_info("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
+ pr_info("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
+ pr_info("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+
+ pr_info(" memory used by lock dependency info: %zu kB\n",
+ (sizeof(lock_classes) +
+ sizeof(lock_classes_in_use) +
+ sizeof(classhash_table) +
+ sizeof(list_entries) +
+ sizeof(list_entries_in_use) +
+ sizeof(chainhash_table) +
+ sizeof(delayed_free)
#ifdef CONFIG_PROVE_LOCKING
- + sizeof(struct circular_queue)
+ + sizeof(lock_cq)
+ + sizeof(lock_chains)
+ + sizeof(lock_chains_in_use)
+ + sizeof(chain_hlocks)
#endif
) / 1024
);
- printk(" per task-struct memory footprint: %lu bytes\n",
- sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ pr_info(" memory used for stack traces: %zu kB\n",
+ (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024
+ );
+#endif
+
+ pr_info(" per task-struct memory footprint: %zu bytes\n",
+ sizeof(((struct task_struct *)NULL)->held_locks));
}
static void
@@ -4318,18 +6677,22 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("=========================\n");
pr_warn("WARNING: held lock freed!\n");
print_kernel_ident();
pr_warn("-------------------------\n");
- pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
print_lock(hlock);
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static inline int not_in_range(const void* mem_from, unsigned long mem_len,
@@ -4354,7 +6717,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
if (unlikely(!debug_locks))
return;
- local_irq_save(flags);
+ raw_local_irq_save(flags);
for (i = 0; i < curr->lockdep_depth; i++) {
hlock = curr->held_locks + i;
@@ -4365,7 +6728,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock);
break;
}
- local_irq_restore(flags);
+ raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
@@ -4376,6 +6739,8 @@ static void print_held_locks_bug(void)
if (debug_locks_silent)
return;
+ nbcon_cpu_emergency_enter();
+
pr_warn("\n");
pr_warn("====================================\n");
pr_warn("WARNING: %s/%d still has locks held!\n",
@@ -4385,6 +6750,8 @@ static void print_held_locks_bug(void)
lockdep_print_held_locks(current);
pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
void debug_check_no_locks_held(void)
@@ -4398,8 +6765,6 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
void debug_show_all_locks(void)
{
struct task_struct *g, *p;
- int count = 10;
- int unlock = 1;
if (unlikely(!debug_locks)) {
pr_warn("INFO: lockdep is turned off.\n");
@@ -4407,49 +6772,18 @@ void debug_show_all_locks(void)
}
pr_warn("\nShowing all locks held in the system:\n");
- /*
- * Here we try to get the tasklist_lock as hard as possible,
- * if not successful after 2 seconds we ignore it (but keep
- * trying). This is to enable a debug printout even if a
- * tasklist_lock-holding task deadlocks or crashes.
- */
-retry:
- if (!read_trylock(&tasklist_lock)) {
- if (count == 10)
- pr_warn("hm, tasklist_lock locked, retrying... ");
- if (count) {
- count--;
- pr_cont(" #%d", 10-count);
- mdelay(200);
- goto retry;
- }
- pr_cont(" ignoring it.\n");
- unlock = 0;
- } else {
- if (count != 10)
- pr_cont(" locked it.\n");
- }
-
- do_each_thread(g, p) {
- /*
- * It's not reliable to print a task's held locks
- * if it's not sleeping (or if it's not the current
- * task):
- */
- if (p->state == TASK_RUNNING && p != current)
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ if (!p->lockdep_depth)
continue;
- if (p->lockdep_depth)
- lockdep_print_held_locks(p);
- if (!unlock)
- if (read_trylock(&tasklist_lock))
- unlock = 1;
- } while_each_thread(g, p);
+ lockdep_print_held_locks(p);
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
+ }
+ rcu_read_unlock();
pr_warn("\n");
pr_warn("=============================================\n\n");
-
- if (unlock)
- read_unlock(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(debug_show_all_locks);
#endif
@@ -4475,6 +6809,7 @@ asmlinkage __visible void lockdep_sys_exit(void)
if (unlikely(curr->lockdep_depth)) {
if (!debug_locks_off())
return;
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("================================================\n");
pr_warn("WARNING: lock held when returning to user space!\n");
@@ -4483,14 +6818,24 @@ asmlinkage __visible void lockdep_sys_exit(void)
pr_warn("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
+ nbcon_cpu_emergency_exit();
}
+
+ /*
+ * The lock history for each syscall should be independent. So wipe the
+ * slate clean on return to userspace.
+ */
+ lockdep_invariant_state(false);
}
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
+ int dl = READ_ONCE(debug_locks);
+ bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */
+ nbcon_cpu_emergency_enter();
pr_warn("\n");
pr_warn("=============================\n");
pr_warn("WARNING: suspicious RCU usage\n");
@@ -4498,17 +6843,16 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
pr_warn("-----------------------------\n");
pr_warn("%s:%d %s!\n", file, line, s);
pr_warn("\nother info that might help us debug this:\n\n");
- pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
- : !rcu_is_watching()
- ? "RCU used illegally from idle CPU!\n"
- : "",
- rcu_scheduler_active, debug_locks);
+ : "",
+ rcu_scheduler_active, dl,
+ dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
- * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * between ct_idle_enter() and ct_idle_exit(), then RCU
* considers that CPU to be in an "extended quiescent state",
* which means that RCU will be completely ignoring that CPU.
* Therefore, rcu_read_lock() and friends have absolutely no
@@ -4530,5 +6874,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c08fbd2f5ba9..0e5e6ffe91a3 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* kernel/lockdep_internals.h
*
@@ -18,9 +19,17 @@ enum lock_usage_bit {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
LOCK_USED,
- LOCK_USAGE_STATES
+ LOCK_USED_READ,
+ LOCK_USAGE_STATES,
};
+/* states after LOCK_USED_READ are not traced and printed */
+static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES);
+
+#define LOCK_USAGE_READ_MASK 1
+#define LOCK_USAGE_DIR_MASK 2
+#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))
+
/*
* Usage-state bitmasks:
*/
@@ -35,15 +44,40 @@ enum {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
__LOCKF(USED)
+ __LOCKF(USED_READ)
};
-#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
-#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+enum {
+#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
+ LOCKF_ENABLED_IRQ =
+#include "lockdep_states.h"
+ 0,
+#undef LOCKDEP_STATE
-#define LOCKF_ENABLED_IRQ_READ \
- (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
-#define LOCKF_USED_IN_IRQ_READ \
- (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
+ LOCKF_USED_IN_IRQ =
+#include "lockdep_states.h"
+ 0,
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
+ LOCKF_ENABLED_IRQ_READ =
+#include "lockdep_states.h"
+ 0,
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
+ LOCKF_USED_IN_IRQ_READ =
+#include "lockdep_states.h"
+ 0,
+#undef LOCKDEP_STATE
+};
+
+#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
+#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
+
+#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ)
+#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)
/*
* CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
@@ -65,51 +99,73 @@ enum {
#define MAX_LOCKDEP_ENTRIES 16384UL
#define MAX_LOCKDEP_CHAINS_BITS 15
#define MAX_STACK_TRACE_ENTRIES 262144UL
+#define STACK_TRACE_HASH_SIZE 8192
#else
-#define MAX_LOCKDEP_ENTRIES 32768UL
+#define MAX_LOCKDEP_ENTRIES (1UL << CONFIG_LOCKDEP_BITS)
-#define MAX_LOCKDEP_CHAINS_BITS 16
+#define MAX_LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
-#define MAX_STACK_TRACE_ENTRIES 524288UL
+#define MAX_STACK_TRACE_ENTRIES (1UL << CONFIG_LOCKDEP_STACK_TRACE_BITS)
+#define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS)
#endif
+/*
+ * Bit definitions for lock_chain.irq_context
+ */
+#define LOCK_CHAIN_SOFTIRQ_CONTEXT (1 << 0)
+#define LOCK_CHAIN_HARDIRQ_CONTEXT (1 << 1)
+
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
+#define AVG_LOCKDEP_CHAIN_DEPTH 5
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS * AVG_LOCKDEP_CHAIN_DEPTH)
-extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
-#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
-extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+extern const char *__get_key_name(const struct lockdep_subclass_key *key,
+ char *str);
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
extern unsigned long nr_lock_classes;
+extern unsigned long nr_zapped_classes;
+extern unsigned long nr_zapped_lock_chains;
extern unsigned long nr_list_entries;
-extern unsigned long nr_lock_chains;
-extern int nr_chain_hlocks;
+extern unsigned long nr_dynamic_keys;
+long lockdep_next_lockchain(long i);
+unsigned long lock_chain_count(void);
extern unsigned long nr_stack_trace_entries;
extern unsigned int nr_hardirq_chains;
extern unsigned int nr_softirq_chains;
extern unsigned int nr_process_chains;
-extern unsigned int max_lockdep_depth;
-extern unsigned int max_recursion_depth;
+extern unsigned int nr_free_chain_hlocks;
+extern unsigned int nr_lost_chain_hlocks;
+extern unsigned int nr_large_chain_blocks;
+extern unsigned int max_lockdep_depth;
extern unsigned int max_bfs_queue_depth;
+extern unsigned long max_lock_class_idx;
+
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+extern unsigned long lock_classes_in_use[];
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#ifdef CONFIG_TRACE_IRQFLAGS
+u64 lockdep_stack_trace_count(void);
+u64 lockdep_stack_hash_count(void);
+#endif
#else
static inline unsigned long
lockdep_count_forward_deps(struct lock_class *class)
@@ -132,23 +188,27 @@ lockdep_count_backward_deps(struct lock_class *class)
* and we want to avoid too much cache bouncing.
*/
struct lockdep_stats {
- int chain_lookup_hits;
- int chain_lookup_misses;
- int hardirqs_on_events;
- int hardirqs_off_events;
- int redundant_hardirqs_on;
- int redundant_hardirqs_off;
- int softirqs_on_events;
- int softirqs_off_events;
- int redundant_softirqs_on;
- int redundant_softirqs_off;
- int nr_unused_locks;
- int nr_cyclic_checks;
- int nr_cyclic_check_recursions;
- int nr_find_usage_forwards_checks;
- int nr_find_usage_forwards_recursions;
- int nr_find_usage_backwards_checks;
- int nr_find_usage_backwards_recursions;
+ unsigned long chain_lookup_hits;
+ unsigned int chain_lookup_misses;
+ unsigned long hardirqs_on_events;
+ unsigned long hardirqs_off_events;
+ unsigned long redundant_hardirqs_on;
+ unsigned long redundant_hardirqs_off;
+ unsigned long softirqs_on_events;
+ unsigned long softirqs_off_events;
+ unsigned long redundant_softirqs_on;
+ unsigned long redundant_softirqs_off;
+ int nr_unused_locks;
+ unsigned int nr_redundant_checks;
+ unsigned int nr_redundant;
+ unsigned int nr_cyclic_checks;
+ unsigned int nr_find_usage_forwards_checks;
+ unsigned int nr_find_usage_backwards_checks;
+
+ /*
+ * Per lock class locking operation stat counts
+ */
+ unsigned long lock_class_ops[MAX_LOCKDEP_KEYS];
};
DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
@@ -176,9 +236,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
} \
__total; \
})
+
+static inline void debug_class_ops_inc(struct lock_class *class)
+{
+ int idx;
+
+ idx = class - lock_classes;
+ __debug_atomic_inc(lock_class_ops[idx]);
+}
+
+static inline unsigned long debug_class_ops_read(struct lock_class *class)
+{
+ int idx, cpu;
+ unsigned long ops = 0;
+
+ idx = class - lock_classes;
+ for_each_possible_cpu(cpu)
+ ops += per_cpu(lockdep_stats.lock_class_ops[idx], cpu);
+ return ops;
+}
+
#else
# define __debug_atomic_inc(ptr) do { } while (0)
# define debug_atomic_inc(ptr) do { } while (0)
# define debug_atomic_dec(ptr) do { } while (0)
# define debug_atomic_read(ptr) 0
+# define debug_class_ops_inc(ptr) do { } while (0)
#endif
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 6d1fcc786081..1916db9aa46b 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/lockdep_proc.c
*
@@ -23,14 +24,33 @@
#include "lockdep_internals.h"
+/*
+ * Since iteration of lock_classes is done without holding the lockdep lock,
+ * it is not safe to iterate all_lock_classes list directly as the iteration
+ * may branch off to free_lock_classes or the zapped list. Iteration is done
+ * directly on the lock_classes array by checking the lock_classes_in_use
+ * bitmap and max_lock_class_idx.
+ */
+#define iterate_lock_classes(idx, class) \
+ for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \
+ idx++, class++)
+
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- return seq_list_next(v, &all_lock_classes, pos);
+ struct lock_class *class = v;
+
+ ++class;
+ *pos = class - lock_classes;
+ return (*pos > max_lock_class_idx) ? NULL : class;
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- return seq_list_start_head(&all_lock_classes, *pos);
+ unsigned long idx = *pos;
+
+ if (idx > max_lock_class_idx)
+ return NULL;
+ return lock_classes + idx;
}
static void l_stop(struct seq_file *m, void *v)
@@ -56,39 +76,43 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_class *class = v;
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
+ int idx = class - lock_classes;
- if (v == &all_lock_classes) {
+ if (v == lock_classes)
seq_printf(m, "all lock classes:\n");
+
+ if (!test_bit(idx, lock_classes_in_use))
return 0;
- }
seq_printf(m, "%p", class->key);
#ifdef CONFIG_DEBUG_LOCKDEP
- seq_printf(m, " OPS:%8ld", class->ops);
-#endif
-#ifdef CONFIG_PROVE_LOCKING
- seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
- seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
+ seq_printf(m, " OPS:%8ld", debug_class_ops_read(class));
#endif
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+ seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
- get_usage_chars(class, usage);
- seq_printf(m, " %s", usage);
+ get_usage_chars(class, usage);
+ seq_printf(m, " %s", usage);
+ }
seq_printf(m, ": ");
print_name(m, class);
seq_puts(m, "\n");
- list_for_each_entry(entry, &class->locks_after, entry) {
- if (entry->distance == 1) {
- seq_printf(m, " -> [%p] ", entry->class->key);
- print_name(m, entry->class);
- seq_puts(m, "\n");
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ if (entry->distance == 1) {
+ seq_printf(m, " -> [%p] ", entry->class->key);
+ print_name(m, entry->class);
+ seq_puts(m, "\n");
+ }
}
+ seq_puts(m, "\n");
}
- seq_puts(m, "\n");
return 0;
}
@@ -100,33 +124,21 @@ static const struct seq_operations lockdep_ops = {
.show = l_show,
};
-static int lockdep_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &lockdep_ops);
-}
-
-static const struct file_operations proc_lockdep_operations = {
- .open = lockdep_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
#ifdef CONFIG_PROVE_LOCKING
static void *lc_start(struct seq_file *m, loff_t *pos)
{
+ if (*pos < 0)
+ return NULL;
+
if (*pos == 0)
return SEQ_START_TOKEN;
- if (*pos - 1 < nr_lock_chains)
- return lock_chains + (*pos - 1);
-
- return NULL;
+ return lock_chains + (*pos - 1);
}
static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
{
- (*pos)++;
+ *pos = lockdep_next_lockchain(*pos - 1) + 1;
return lc_start(m, pos);
}
@@ -139,15 +151,22 @@ static int lc_show(struct seq_file *m, void *v)
struct lock_chain *chain = v;
struct lock_class *class;
int i;
+ static const char * const irq_strs[] = {
+ [0] = "0",
+ [LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq",
+ [LOCK_CHAIN_SOFTIRQ_CONTEXT] = "softirq",
+ [LOCK_CHAIN_SOFTIRQ_CONTEXT|
+ LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq|softirq",
+ };
if (v == SEQ_START_TOKEN) {
- if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)
+ if (!nr_free_chain_hlocks)
seq_printf(m, "(buggered) ");
seq_printf(m, "all lock chains:\n");
return 0;
}
- seq_printf(m, "irq_context: %d\n", chain->irq_context);
+ seq_printf(m, "irq_context: %s\n", irq_strs[chain->irq_context]);
for (i = 0; i < chain->depth; i++) {
class = lock_chain_get_class(chain, i);
@@ -169,18 +188,6 @@ static const struct seq_operations lockdep_chains_ops = {
.stop = lc_stop,
.show = lc_show,
};
-
-static int lockdep_chains_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &lockdep_chains_ops);
-}
-
-static const struct file_operations proc_lockdep_chains_operations = {
- .open = lockdep_chains_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
#endif /* CONFIG_PROVE_LOCKING */
static void lockdep_stats_debug_show(struct seq_file *m)
@@ -201,6 +208,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
debug_atomic_read(chain_lookup_hits));
seq_printf(m, " cyclic checks: %11llu\n",
debug_atomic_read(nr_cyclic_checks));
+ seq_printf(m, " redundant checks: %11llu\n",
+ debug_atomic_read(nr_redundant_checks));
+ seq_printf(m, " redundant links: %11llu\n",
+ debug_atomic_read(nr_redundant));
seq_printf(m, " find-mask forwards checks: %11llu\n",
debug_atomic_read(nr_find_usage_forwards_checks));
seq_printf(m, " find-mask backwards checks: %11llu\n",
@@ -219,7 +230,6 @@ static void lockdep_stats_debug_show(struct seq_file *m)
static int lockdep_stats_show(struct seq_file *m, void *v)
{
- struct lock_class *class;
unsigned long nr_unused = 0, nr_uncategorized = 0,
nr_irq_safe = 0, nr_irq_unsafe = 0,
nr_softirq_safe = 0, nr_softirq_unsafe = 0,
@@ -229,7 +239,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
sum_forward_deps = 0;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_class *class;
+ unsigned long idx;
+
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
if (class->usage_mask == 0)
nr_unused++;
@@ -260,15 +276,18 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
nr_hardirq_read_unsafe++;
-#ifdef CONFIG_PROVE_LOCKING
sum_forward_deps += lockdep_count_forward_deps(class);
-#endif
}
+
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
+
+#endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS);
+ seq_printf(m, " dynamic-keys: %11lu\n",
+ nr_dynamic_keys);
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
nr_list_entries, MAX_LOCKDEP_ENTRIES);
seq_printf(m, " indirect dependencies: %11lu\n",
@@ -287,9 +306,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
- nr_lock_chains, MAX_LOCKDEP_CHAINS);
- seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
- nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
+ lock_chain_count(), MAX_LOCKDEP_CHAINS);
+ seq_printf(m, " dependency chain hlocks used: %11lu [max: %lu]\n",
+ MAX_LOCKDEP_CHAIN_HLOCKS -
+ (nr_free_chain_hlocks + nr_lost_chain_hlocks),
+ MAX_LOCKDEP_CHAIN_HLOCKS);
+ seq_printf(m, " dependency chain hlocks lost: %11u\n",
+ nr_lost_chain_hlocks);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -302,6 +325,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_process_chains);
seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ seq_printf(m, " number of stack traces: %11llu\n",
+ lockdep_stack_trace_count());
+ seq_printf(m, " number of stack hash chains: %11llu\n",
+ lockdep_stack_hash_count());
+#endif
seq_printf(m, " combined max dependencies: %11u\n",
(nr_hardirq_chains + 1) *
(nr_softirq_chains + 1) *
@@ -343,25 +372,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " max bfs queue depth: %11u\n",
max_bfs_queue_depth);
#endif
+ seq_printf(m, " max lock class index: %11lu\n",
+ max_lock_class_idx);
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
+ /*
+ * Zapped classes and lockdep data buffers reuse statistics.
+ */
+ seq_puts(m, "\n");
+ seq_printf(m, " zapped classes: %11lu\n",
+ nr_zapped_classes);
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " zapped lock chains: %11lu\n",
+ nr_zapped_lock_chains);
+ seq_printf(m, " large chain blocks: %11u\n",
+ nr_large_chain_blocks);
+#endif
return 0;
}
-static int lockdep_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, lockdep_stats_show, NULL);
-}
-
-static const struct file_operations proc_lockdep_stats_operations = {
- .open = lockdep_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
#ifdef CONFIG_LOCK_STAT
struct lock_stat_data {
@@ -395,7 +426,7 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
for (i = 0; i < offset; i++)
seq_puts(m, " ");
for (i = 0; i < length; i++)
- seq_printf(m, "%c", c);
+ seq_putc(m, c);
seq_puts(m, "\n");
}
@@ -411,7 +442,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
static void seq_time(struct seq_file *m, s64 time)
{
- char num[15];
+ char num[22];
snprint_time(num, sizeof(num), time);
seq_printf(m, " %14s", num);
@@ -423,12 +454,12 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
- seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
+ seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
{
- struct lockdep_subclass_key *ckey;
+ const struct lockdep_subclass_key *ckey;
struct lock_class_stats *stats;
struct lock_class *class;
const char *cname;
@@ -620,12 +651,16 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!res) {
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
iter->class = class;
- iter->stats = lock_stats(class);
+ lock_stats(class, &iter->stats);
iter++;
}
+
data->iter_end = iter;
sort(data->stats, data->iter_end - data->stats,
@@ -643,6 +678,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct lock_class *class;
+ unsigned long idx;
char c;
if (count) {
@@ -652,8 +688,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
if (c != '0')
return count;
- list_for_each_entry(class, &all_lock_classes, lock_entry)
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
clear_lock_stats(class);
+ }
}
return count;
}
@@ -666,28 +705,24 @@ static int lock_stat_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static const struct file_operations proc_lock_stat_operations = {
- .open = lock_stat_open,
- .write = lock_stat_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = lock_stat_release,
+static const struct proc_ops lock_stat_proc_ops = {
+ .proc_open = lock_stat_open,
+ .proc_write = lock_stat_write,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = lock_stat_release,
};
#endif /* CONFIG_LOCK_STAT */
static int __init lockdep_proc_init(void)
{
- proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+ proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops);
#ifdef CONFIG_PROVE_LOCKING
- proc_create("lockdep_chains", S_IRUSR, NULL,
- &proc_lockdep_chains_operations);
+ proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops);
#endif
- proc_create("lockdep_stats", S_IRUSR, NULL,
- &proc_lockdep_stats_operations);
-
+ proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show);
#ifdef CONFIG_LOCK_STAT
- proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
- &proc_lock_stat_operations);
+ proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &lock_stat_proc_ops);
#endif
return 0;
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..35ca09f2ed0b 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
@@ -6,4 +6,3 @@
*/
LOCKDEP_STATE(HARDIRQ)
LOCKDEP_STATE(SOFTIRQ)
-LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f24582d4dad3..6567e5eeacc0 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -1,32 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Module-based torture test facility for locking
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2014
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Davidlohr Bueso <dave@stgolabs.net>
* Based on kernel/rcu/torture.c.
*/
+
+#define pr_fmt(fmt) fmt
+
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched/rt.h>
#include <linux/spinlock.h>
-#include <linux/rwlock.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/smp.h>
@@ -38,48 +27,105 @@
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/slab.h>
-#include <linux/percpu-rwsem.h>
#include <linux/torture.h>
+#include <linux/reboot.h>
+MODULE_DESCRIPTION("torture test facility for locking");
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
-
-torture_param(int, nwriters_stress, -1,
- "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1,
- "Number of read-locking stress-test threads");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
+
+torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies).");
+torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable).");
+torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (s), 0=disable");
-torture_param(int, shuffle_interval, 3,
- "Number of jiffies between shuffles, 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, rt_boost, 2,
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
+torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(bool, verbose, true,
- "Enable verbose debugging printk()s");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
+/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
+#define MAX_NESTED_LOCKS 8
-static char *torture_type = "spin_lock";
+static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
+static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs.
+static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs.
+
+// Parse a cpumask kernel parameter. If there are more users later on,
+// this might need to got to a more central location.
+static int param_set_cpumask(const char *val, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+ int ret;
+ char *s;
+
+ if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) {
+ s = "Out of memory";
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ ret = cpulist_parse(val, *cm_bind);
+ if (!ret)
+ return ret;
+ s = "Bad CPU range";
+out_err:
+ pr_warn("%s: %s, all CPUs set\n", kp->name, s);
+ cpumask_setall(*cm_bind);
+ return ret;
+}
+
+// Output a cpumask kernel parameter.
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+
+ return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind));
+}
+
+static bool cpumask_nonempty(cpumask_var_t mask)
+{
+ return cpumask_available(mask) && !cpumask_empty(mask);
+}
+
+static const struct kernel_param_ops lt_bind_ops = {
+ .set = param_set_cpumask,
+ .get = param_get_cpumask,
+};
+
+module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0444);
+module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0444);
+
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn);
+
static struct task_struct *stats_task;
static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
static bool lock_is_write_held;
-static bool lock_is_read_held;
+static atomic_t lock_is_read_held;
+static unsigned long last_lock_release;
struct lock_stress_stats {
long n_lock_fail;
long n_lock_acquired;
};
-int torture_runnable = IS_ENABLED(MODULE);
-module_param(torture_runnable, int, 0444);
-MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
+struct call_rcu_chain {
+ struct rcu_head crc_rh;
+ bool crc_stop;
+};
+struct call_rcu_chain *call_rcu_chain_list;
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -89,13 +135,16 @@ static void lock_torture_cleanup(void);
*/
struct lock_torture_ops {
void (*init)(void);
- int (*writelock)(void);
+ void (*exit)(void);
+ int (*nested_lock)(int tid, u32 lockset);
+ int (*writelock)(int tid);
void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
- void (*writeunlock)(void);
- int (*readlock)(void);
+ void (*writeunlock)(int tid);
+ void (*nested_unlock)(int tid, u32 lockset);
+ int (*readlock)(int tid);
void (*read_delay)(struct torture_random_state *trsp);
- void (*readunlock)(void);
+ void (*readunlock)(int tid);
unsigned long flags; /* for irq spinlocks */
const char *name;
@@ -105,51 +154,82 @@ struct lock_torture_cxt {
int nrealwriters_stress;
int nrealreaders_stress;
bool debug_lock;
+ bool init_called;
atomic_t n_lock_torture_errors;
struct lock_torture_ops *cur_ops;
struct lock_stress_stats *lwsa; /* writer statistics */
struct lock_stress_stats *lrsa; /* reader statistics */
};
-static struct lock_torture_cxt cxt = { 0, 0, false,
+static struct lock_torture_cxt cxt = { 0, 0, false, false,
ATOMIC_INIT(0),
NULL, NULL};
/*
* Definitions for lock torture testing.
*/
-static int torture_lock_busted_write_lock(void)
+static int torture_lock_busted_write_lock(int tid __maybe_unused)
{
return 0; /* BUGGY, do not use in real life!!! */
}
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
-#ifdef CONFIG_PREEMPT
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_lock_busted_write_unlock(void)
+static void torture_lock_busted_write_unlock(int tid __maybe_unused)
{
/* BUGGY, do not use in real life!!! */
}
-static void torture_boost_dummy(struct torture_random_state *trsp)
+static void __torture_rt_boost(struct torture_random_state *trsp)
{
- /* Only rtmutexes care about priority */
+ const unsigned int factor = rt_boost_factor;
+
+ if (!rt_task(current)) {
+ /*
+ * Boost priority once every rt_boost_factor operations. When
+ * the task tries to take the lock, the rtmutex it will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ sched_set_fifo(current);
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another 10 * rt_boost_factor
+ * operations, then restored back to its original prio, and so
+ * forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ sched_set_normal(current, 0);
+ } else /* common case, do nothing */
+ return;
+ }
+}
+
+static void torture_rt_boost(struct torture_random_state *trsp)
+{
+ if (rt_boost != 2)
+ return;
+
+ __torture_rt_boost(trsp);
}
static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -159,7 +239,8 @@ static struct lock_torture_ops lock_busted_ops = {
static DEFINE_SPINLOCK(torture_spinlock);
-static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
+static int torture_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_spinlock)
{
spin_lock(&torture_spinlock);
return 0;
@@ -168,24 +249,24 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ unsigned long j;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) {
+ j = jiffies;
+ mdelay(long_hold);
+ pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
+ }
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
+static void torture_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_spinlock)
{
spin_unlock(&torture_spinlock);
}
@@ -193,7 +274,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -201,7 +282,7 @@ static struct lock_torture_ops spin_lock_ops = {
.name = "spin_lock"
};
-static int torture_spin_lock_write_lock_irq(void)
+static int torture_spin_lock_write_lock_irq(int tid __maybe_unused)
__acquires(torture_spinlock)
{
unsigned long flags;
@@ -211,7 +292,7 @@ __acquires(torture_spinlock)
return 0;
}
-static void torture_lock_spin_write_unlock_irq(void)
+static void torture_lock_spin_write_unlock_irq(int tid __maybe_unused)
__releases(torture_spinlock)
{
spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
@@ -220,7 +301,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL,
.read_delay = NULL,
@@ -228,9 +309,117 @@ static struct lock_torture_ops spin_lock_irq_ops = {
.name = "spin_lock_irq"
};
+static DEFINE_RAW_SPINLOCK(torture_raw_spinlock);
+
+static int torture_raw_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ raw_spin_lock(&torture_raw_spinlock);
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock(&torture_raw_spinlock);
+}
+
+static struct lock_torture_ops raw_spin_lock_ops = {
+ .writelock = torture_raw_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock"
+};
+
+static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&torture_raw_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_spin_lock_irq_ops = {
+ .writelock = torture_raw_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock_irq"
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+
+#include <asm/rqspinlock.h>
+static rqspinlock_t rqspinlock;
+
+static int torture_raw_res_spin_write_lock(int tid __maybe_unused)
+{
+ raw_res_spin_lock(&rqspinlock);
+ return 0;
+}
+
+static void torture_raw_res_spin_write_unlock(int tid __maybe_unused)
+{
+ raw_res_spin_unlock(&rqspinlock);
+}
+
+static struct lock_torture_ops raw_res_spin_lock_ops = {
+ .writelock = torture_raw_res_spin_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_res_spin_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_res_spin_lock"
+};
+
+static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused)
+{
+ unsigned long flags;
+
+ raw_res_spin_lock_irqsave(&rqspinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused)
+{
+ raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_res_spin_lock_irq_ops = {
+ .writelock = torture_raw_res_spin_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_res_spin_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_res_spin_lock_irq"
+};
+
+#endif
+
static DEFINE_RWLOCK(torture_rwlock);
-static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
+static int torture_rwlock_write_lock(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
write_lock(&torture_rwlock);
return 0;
@@ -239,24 +428,24 @@ static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
static void torture_rwlock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
-static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
+static void torture_rwlock_write_unlock(int tid __maybe_unused)
+__releases(torture_rwlock)
{
write_unlock(&torture_rwlock);
}
-static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
+static int torture_rwlock_read_lock(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
read_lock(&torture_rwlock);
return 0;
@@ -265,19 +454,18 @@ static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
static void torture_rwlock_read_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 10;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
-static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
+static void torture_rwlock_read_unlock(int tid __maybe_unused)
+__releases(torture_rwlock)
{
read_unlock(&torture_rwlock);
}
@@ -285,7 +473,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay,
@@ -293,7 +481,8 @@ static struct lock_torture_ops rw_lock_ops = {
.name = "rw_lock"
};
-static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
+static int torture_rwlock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
unsigned long flags;
@@ -302,13 +491,14 @@ static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
return 0;
}
-static void torture_rwlock_write_unlock_irq(void)
+static void torture_rwlock_write_unlock_irq(int tid __maybe_unused)
__releases(torture_rwlock)
{
write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
}
-static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
+static int torture_rwlock_read_lock_irq(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
unsigned long flags;
@@ -317,7 +507,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
return 0;
}
-static void torture_rwlock_read_unlock_irq(void)
+static void torture_rwlock_read_unlock_irq(int tid __maybe_unused)
__releases(torture_rwlock)
{
read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
@@ -326,7 +516,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay,
@@ -335,8 +525,31 @@ static struct lock_torture_ops rw_lock_irq_ops = {
};
static DEFINE_MUTEX(torture_mutex);
+static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_mutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __mutex_init(&torture_nested_mutexes[i], __func__,
+ &nested_mutex_keys[i]);
+}
+
+static int torture_mutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ mutex_lock(&torture_nested_mutexes[i]);
+ return 0;
+}
-static int torture_mutex_lock(void) __acquires(torture_mutex)
+static int torture_mutex_lock(int tid __maybe_unused)
+__acquires(torture_mutex)
{
mutex_lock(&torture_mutex);
return 0;
@@ -344,30 +557,37 @@ static int torture_mutex_lock(void) __acquires(torture_mutex)
static void torture_mutex_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 5);
- else
- mdelay(longdelay_ms / 5);
-#ifdef CONFIG_PREEMPT
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 5);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_mutex_unlock(void) __releases(torture_mutex)
+static void torture_mutex_unlock(int tid __maybe_unused)
+__releases(torture_mutex)
{
mutex_unlock(&torture_mutex);
}
+static void torture_mutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ mutex_unlock(&torture_nested_mutexes[i]);
+}
+
static struct lock_torture_ops mutex_lock_ops = {
+ .init = torture_mutex_init,
+ .nested_lock = torture_mutex_nested_lock,
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
+ .nested_unlock = torture_mutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -375,12 +595,34 @@ static struct lock_torture_ops mutex_lock_ops = {
};
#include <linux/ww_mutex.h>
-static DEFINE_WW_CLASS(torture_ww_class);
-static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class);
-static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class);
-static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class);
+/*
+ * The torture ww_mutexes should belong to the same lock class as
+ * torture_ww_class to avoid lockdep problem. The ww_mutex_init()
+ * function is called for initialization to ensure that.
+ */
+static DEFINE_WD_CLASS(torture_ww_class);
+static struct ww_mutex torture_ww_mutex_0, torture_ww_mutex_1, torture_ww_mutex_2;
+static struct ww_acquire_ctx *ww_acquire_ctxs;
+
+static void torture_ww_mutex_init(void)
+{
+ ww_mutex_init(&torture_ww_mutex_0, &torture_ww_class);
+ ww_mutex_init(&torture_ww_mutex_1, &torture_ww_class);
+ ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class);
+
+ ww_acquire_ctxs = kmalloc_array(cxt.nrealwriters_stress,
+ sizeof(*ww_acquire_ctxs),
+ GFP_KERNEL);
+ if (!ww_acquire_ctxs)
+ VERBOSE_TOROUT_STRING("ww_acquire_ctx: Out of memory");
+}
+
+static void torture_ww_mutex_exit(void)
+{
+ kfree(ww_acquire_ctxs);
+}
-static int torture_ww_mutex_lock(void)
+static int torture_ww_mutex_lock(int tid)
__acquires(torture_ww_mutex_0)
__acquires(torture_ww_mutex_1)
__acquires(torture_ww_mutex_2)
@@ -390,7 +632,7 @@ __acquires(torture_ww_mutex_2)
struct list_head link;
struct ww_mutex *lock;
} locks[3], *ll, *ln;
- struct ww_acquire_ctx ctx;
+ struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid];
locks[0].lock = &torture_ww_mutex_0;
list_add(&locks[0].link, &list);
@@ -401,12 +643,12 @@ __acquires(torture_ww_mutex_2)
locks[2].lock = &torture_ww_mutex_2;
list_add(&locks[2].link, &list);
- ww_acquire_init(&ctx, &torture_ww_class);
+ ww_acquire_init(ctx, &torture_ww_class);
list_for_each_entry(ll, &list, link) {
int err;
- err = ww_mutex_lock(ll->lock, &ctx);
+ err = ww_mutex_lock(ll->lock, ctx);
if (!err)
continue;
@@ -417,28 +659,32 @@ __acquires(torture_ww_mutex_2)
if (err != -EDEADLK)
return err;
- ww_mutex_lock_slow(ll->lock, &ctx);
+ ww_mutex_lock_slow(ll->lock, ctx);
list_move(&ll->link, &list);
}
- ww_acquire_fini(&ctx);
return 0;
}
-static void torture_ww_mutex_unlock(void)
+static void torture_ww_mutex_unlock(int tid)
__releases(torture_ww_mutex_0)
__releases(torture_ww_mutex_1)
__releases(torture_ww_mutex_2)
{
+ struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid];
+
ww_mutex_unlock(&torture_ww_mutex_0);
ww_mutex_unlock(&torture_ww_mutex_1);
ww_mutex_unlock(&torture_ww_mutex_2);
+ ww_acquire_fini(ctx);
}
static struct lock_torture_ops ww_mutex_lock_ops = {
+ .init = torture_ww_mutex_init,
+ .exit = torture_ww_mutex_exit,
.writelock = torture_ww_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_ww_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -448,81 +694,85 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
+static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS];
-static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
+static void torture_rtmutex_init(void)
{
- rt_mutex_lock(&torture_rtmutex);
- return 0;
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __rt_mutex_init(&torture_nested_rtmutexes[i], __func__,
+ &nested_rtmutex_keys[i]);
}
-static void torture_rtmutex_boost(struct torture_random_state *trsp)
+static int torture_rtmutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
{
- int policy;
- struct sched_param param;
- const unsigned int factor = 50000; /* yes, quite arbitrary */
+ int i;
- if (!rt_task(current)) {
- /*
- * Boost priority once every ~50k operations. When the
- * task tries to take the lock, the rtmutex it will account
- * for the new priority, and do any corresponding pi-dance.
- */
- if (trsp && !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor))) {
- policy = SCHED_FIFO;
- param.sched_priority = MAX_RT_PRIO - 1;
- } else /* common case, do nothing */
- return;
- } else {
- /*
- * The task will remain boosted for another ~500k operations,
- * then restored back to its original prio, and so forth.
- *
- * When @trsp is nil, we want to force-reset the task for
- * stopping the kthread.
- */
- if (!trsp || !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor * 2))) {
- policy = SCHED_NORMAL;
- param.sched_priority = 0;
- } else /* common case, do nothing */
- return;
- }
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ rt_mutex_lock(&torture_nested_rtmutexes[i]);
+ return 0;
+}
- sched_setscheduler_nocheck(current, policy, &param);
+static int torture_rtmutex_lock(int tid __maybe_unused)
+__acquires(torture_rtmutex)
+{
+ rt_mutex_lock(&torture_rtmutex);
+ return 0;
}
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
/*
* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
+static void torture_rtmutex_unlock(int tid __maybe_unused)
+__releases(torture_rtmutex)
{
rt_mutex_unlock(&torture_rtmutex);
}
+static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
+{
+ if (!rt_boost)
+ return;
+
+ __torture_rt_boost(trsp);
+}
+
+static void torture_rtmutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ rt_mutex_unlock(&torture_nested_rtmutexes[i]);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
+ .init = torture_rtmutex_init,
+ .nested_lock = torture_rtmutex_nested_lock,
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
- .task_boost = torture_rtmutex_boost,
+ .task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
+ .nested_unlock = torture_rtmutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -531,7 +781,8 @@ static struct lock_torture_ops rtmutex_lock_ops = {
#endif
static DECLARE_RWSEM(torture_rwsem);
-static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
+static int torture_rwsem_down_write(int tid __maybe_unused)
+__acquires(torture_rwsem)
{
down_write(&torture_rwsem);
return 0;
@@ -539,26 +790,21 @@ static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
static void torture_rwsem_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 10);
- else
- mdelay(longdelay_ms / 10);
-#ifdef CONFIG_PREEMPT
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 10);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rwsem_up_write(void) __releases(torture_rwsem)
+static void torture_rwsem_up_write(int tid __maybe_unused)
+__releases(torture_rwsem)
{
up_write(&torture_rwsem);
}
-static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
+static int torture_rwsem_down_read(int tid __maybe_unused)
+__acquires(torture_rwsem)
{
down_read(&torture_rwsem);
return 0;
@@ -566,21 +812,17 @@ static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
static void torture_rwsem_read_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 2);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold * 2);
else
- mdelay(longdelay_ms / 2);
-#ifdef CONFIG_PREEMPT
+ mdelay(long_hold / 2);
if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rwsem_up_read(void) __releases(torture_rwsem)
+static void torture_rwsem_up_read(int tid __maybe_unused)
+__releases(torture_rwsem)
{
up_read(&torture_rwsem);
}
@@ -588,7 +830,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -599,38 +841,48 @@ static struct lock_torture_ops rwsem_lock_ops = {
#include <linux/percpu-rwsem.h>
static struct percpu_rw_semaphore pcpu_rwsem;
-void torture_percpu_rwsem_init(void)
+static void torture_percpu_rwsem_init(void)
{
BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
}
-static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
+static void torture_percpu_rwsem_exit(void)
+{
+ percpu_free_rwsem(&pcpu_rwsem);
+}
+
+static int torture_percpu_rwsem_down_write(int tid __maybe_unused)
+__acquires(pcpu_rwsem)
{
percpu_down_write(&pcpu_rwsem);
return 0;
}
-static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem)
+static void torture_percpu_rwsem_up_write(int tid __maybe_unused)
+__releases(pcpu_rwsem)
{
percpu_up_write(&pcpu_rwsem);
}
-static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem)
+static int torture_percpu_rwsem_down_read(int tid __maybe_unused)
+__acquires(pcpu_rwsem)
{
percpu_down_read(&pcpu_rwsem);
return 0;
}
-static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
+static void torture_percpu_rwsem_up_read(int tid __maybe_unused)
+__releases(pcpu_rwsem)
{
percpu_up_read(&pcpu_rwsem);
}
static struct lock_torture_ops percpu_rwsem_lock_ops = {
.init = torture_percpu_rwsem_init,
+ .exit = torture_percpu_rwsem_exit,
.writelock = torture_percpu_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_percpu_rwsem_up_write,
.readlock = torture_percpu_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -644,28 +896,63 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
*/
static int lock_torture_writer(void *arg)
{
+ unsigned long j;
+ unsigned long j1;
+ u32 lockset_mask;
struct lock_stress_stats *lwsp = arg;
- static DEFINE_TORTURE_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
+ bool skip_main_lock;
+ int tid = lwsp - cxt.lwsa;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, MAX_NICE);
+ if (!rt_task(current))
+ set_user_nice(current, MAX_NICE);
do {
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->task_boost(&rand);
- cxt.cur_ops->writelock();
- if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_lock_fail++;
- lock_is_write_held = 1;
- if (WARN_ON_ONCE(lock_is_read_held))
- lwsp->n_lock_fail++; /* rare, but... */
+ lockset_mask = torture_random(&rand);
+ /*
+ * When using nested_locks, we want to occasionally
+ * skip the main lock so we can avoid always serializing
+ * the lock chains on that central lock. By skipping the
+ * main lock occasionally, we can create different
+ * contention patterns (allowing for multiple disjoint
+ * blocked trees)
+ */
+ skip_main_lock = (nested_locks &&
+ !(torture_random(&rand) % 100));
- lwsp->n_lock_acquired++;
- cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = 0;
- cxt.cur_ops->writeunlock();
+ cxt.cur_ops->task_boost(&rand);
+ if (cxt.cur_ops->nested_lock)
+ cxt.cur_ops->nested_lock(tid, lockset_mask);
+
+ if (!skip_main_lock) {
+ if (acq_writer_lim > 0)
+ j = jiffies;
+ cxt.cur_ops->writelock(tid);
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = true;
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
+ lwsp->n_lock_fail++; /* rare, but... */
+ if (acq_writer_lim > 0) {
+ j1 = jiffies;
+ WARN_ONCE(time_after(j1, j + acq_writer_lim),
+ "%s: Lock acquisition took %lu jiffies.\n",
+ __func__, j1 - j);
+ }
+ lwsp->n_lock_acquired++;
+
+ cxt.cur_ops->write_delay(&rand);
+
+ lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
+ cxt.cur_ops->writeunlock(tid);
+ }
+ if (cxt.cur_ops->nested_unlock)
+ cxt.cur_ops->nested_unlock(tid, lockset_mask);
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
@@ -682,7 +969,8 @@ static int lock_torture_writer(void *arg)
static int lock_torture_reader(void *arg)
{
struct lock_stress_stats *lrsp = arg;
- static DEFINE_TORTURE_RANDOM(rand);
+ int tid = lrsp - cxt.lrsa;
+ DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_reader task started");
set_user_nice(current, MAX_NICE);
@@ -691,15 +979,15 @@ static int lock_torture_reader(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->readlock();
- lock_is_read_held = 1;
+ cxt.cur_ops->readlock(tid);
+ atomic_inc(&lock_is_read_held);
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = 0;
- cxt.cur_ops->readunlock();
+ atomic_dec(&lock_is_read_held);
+ cxt.cur_ops->readunlock(tid);
stutter_wait("lock_torture_reader");
} while (!torture_must_stop());
@@ -713,26 +1001,28 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
- bool fail = 0;
+ long cur;
+ bool fail = false;
int i, n_stress;
- long max = 0;
- long min = statp[0].n_lock_acquired;
+ long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
for (i = 0; i < n_stress; i++) {
- if (statp[i].n_lock_fail)
+ if (data_race(statp[i].n_lock_fail))
fail = true;
- sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_fail)
- max = statp[i].n_lock_fail;
- if (min > statp[i].n_lock_fail)
- min = statp[i].n_lock_fail;
+ cur = data_race(statp[i].n_lock_acquired);
+ sum += cur;
+ if (max < cur)
+ max = cur;
+ if (min > cur)
+ min = cur;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
write ? "Writes" : "Reads ",
- sum, max, min, max / 2 > min ? "???" : "",
+ sum, max, min,
+ !onoff_interval && max / 2 > min ? "???" : "",
fail, fail ? "!!!" : "");
if (fail)
atomic_inc(&cxt.n_lock_torture_errors);
@@ -798,16 +1088,69 @@ static int lock_torture_stats(void *arg)
return 0;
}
+
static inline void
lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
+ static cpumask_t cpumask_all;
+ cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all;
+ cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all;
+
+ cpumask_setall(&cpumask_all);
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
- verbose, shuffle_interval, stutter, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp),
+ call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
+ cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
+ rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
+ verbose, writer_fifo);
+}
+
+// If requested, maintain call_rcu() chains to keep a grace period always
+// in flight. These increase the probability of getting an RCU CPU stall
+// warning and associated diagnostics when a locking primitive stalls.
+
+static void call_rcu_chain_cb(struct rcu_head *rhp)
+{
+ struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh);
+
+ if (!smp_load_acquire(&crcp->crc_stop)) {
+ (void)start_poll_synchronize_rcu(); // Start one grace period...
+ call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another.
+ }
+}
+
+// Start the requested number of call_rcu() chains.
+static int call_rcu_chain_init(void)
+{
+ int i;
+
+ if (call_rcu_chains <= 0)
+ return 0;
+ call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL);
+ if (!call_rcu_chain_list)
+ return -ENOMEM;
+ for (i = 0; i < call_rcu_chains; i++) {
+ call_rcu_chain_list[i].crc_stop = false;
+ call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb);
+ }
+ return 0;
+}
+
+// Stop all of the call_rcu() chains.
+static void call_rcu_chain_cleanup(void)
+{
+ int i;
+
+ if (!call_rcu_chain_list)
+ return;
+ for (i = 0; i < call_rcu_chains; i++)
+ smp_store_release(&call_rcu_chain_list[i].crc_stop, true);
+ rcu_barrier();
+ kfree(call_rcu_chain_list);
+ call_rcu_chain_list = NULL;
}
static void lock_torture_cleanup(void)
@@ -819,17 +1162,17 @@ static void lock_torture_cleanup(void)
/*
* Indicates early cleanup, meaning that the test has not run,
- * such as when passing bogus args when loading the module. As
- * such, only perform the underlying torture-specific cleanups,
- * and avoid anything related to locktorture.
+ * such as when passing bogus args when loading the module.
+ * However cxt->cur_ops.init() may have been invoked, so beside
+ * perform the underlying torture-specific cleanups, cur_ops.exit()
+ * will be invoked if needed.
*/
- if (!cxt.lwsa)
+ if (!cxt.lwsa && !cxt.lrsa)
goto end;
if (writer_tasks) {
for (i = 0; i < cxt.nrealwriters_stress; i++)
- torture_stop_kthread(lock_torture_writer,
- writer_tasks[i]);
+ torture_stop_kthread(lock_torture_writer, writer_tasks[i]);
kfree(writer_tasks);
writer_tasks = NULL;
}
@@ -856,9 +1199,22 @@ static void lock_torture_cleanup(void)
"End of test: SUCCESS");
kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
kfree(cxt.lrsa);
+ cxt.lrsa = NULL;
+
+ call_rcu_chain_cleanup();
end:
+ if (cxt.init_called) {
+ if (cxt.cur_ops->exit)
+ cxt.cur_ops->exit();
+ cxt.init_called = false;
+ }
+
+ free_cpumask_var(bind_readers);
+ free_cpumask_var(bind_writers);
+
torture_cleanup_end();
}
@@ -869,6 +1225,10 @@ static int __init lock_torture_init(void)
static struct lock_torture_ops *torture_ops[] = {
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
+ &raw_spin_lock_ops, &raw_spin_lock_irq_ops,
+#ifdef CONFIG_BPF_SYSCALL
+ &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops,
+#endif
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
@@ -879,7 +1239,7 @@ static int __init lock_torture_init(void)
&percpu_rwsem_lock_ops,
};
- if (!torture_init_begin(torture_type, verbose, &torture_runnable))
+ if (!torture_init_begin(torture_type, verbose))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
@@ -898,40 +1258,54 @@ static int __init lock_torture_init(void)
firsterr = -EINVAL;
goto unwind;
}
- if (cxt.cur_ops->init)
- cxt.cur_ops->init();
+
+ if (nwriters_stress == 0 &&
+ (!cxt.cur_ops->readlock || nreaders_stress == 0)) {
+ pr_alert("lock-torture: must run at least one locking thread\n");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
if (nwriters_stress >= 0)
cxt.nrealwriters_stress = nwriters_stress;
else
cxt.nrealwriters_stress = 2 * num_online_cpus();
+ if (cxt.cur_ops->init) {
+ cxt.cur_ops->init();
+ cxt.init_called = true;
+ }
+
#ifdef CONFIG_DEBUG_MUTEXES
- if (strncmp(torture_type, "mutex", 5) == 0)
+ if (str_has_prefix(torture_type, "mutex"))
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_RT_MUTEXES
- if (strncmp(torture_type, "rtmutex", 7) == 0)
+ if (str_has_prefix(torture_type, "rtmutex"))
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
- if ((strncmp(torture_type, "spin", 4) == 0) ||
- (strncmp(torture_type, "rw_lock", 7) == 0))
+ if ((str_has_prefix(torture_type, "spin")) ||
+ (str_has_prefix(torture_type, "rw_lock")))
cxt.debug_lock = true;
#endif
/* Initialize the statistics so that each run gets its own numbers. */
+ if (nwriters_stress) {
+ lock_is_write_held = false;
+ cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress,
+ sizeof(*cxt.lwsa),
+ GFP_KERNEL);
+ if (cxt.lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
- lock_is_write_held = 0;
- cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
- if (cxt.lwsa == NULL) {
- VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
- firsterr = -ENOMEM;
- goto unwind;
- }
- for (i = 0; i < cxt.nrealwriters_stress; i++) {
- cxt.lwsa[i].n_lock_fail = 0;
- cxt.lwsa[i].n_lock_acquired = 0;
+ for (i = 0; i < cxt.nrealwriters_stress; i++) {
+ cxt.lwsa[i].n_lock_fail = 0;
+ cxt.lwsa[i].n_lock_acquired = 0;
+ }
}
if (cxt.cur_ops->readlock) {
@@ -948,61 +1322,76 @@ static int __init lock_torture_init(void)
cxt.nrealreaders_stress = cxt.nrealwriters_stress;
}
- lock_is_read_held = 0;
- cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
- if (cxt.lrsa == NULL) {
- VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
- firsterr = -ENOMEM;
- kfree(cxt.lwsa);
- cxt.lwsa = NULL;
- goto unwind;
- }
-
- for (i = 0; i < cxt.nrealreaders_stress; i++) {
- cxt.lrsa[i].n_lock_fail = 0;
- cxt.lrsa[i].n_lock_acquired = 0;
+ if (nreaders_stress) {
+ cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
+ sizeof(*cxt.lrsa),
+ GFP_KERNEL);
+ if (cxt.lrsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
+ firsterr = -ENOMEM;
+ kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
+ goto unwind;
+ }
+
+ for (i = 0; i < cxt.nrealreaders_stress; i++) {
+ cxt.lrsa[i].n_lock_fail = 0;
+ cxt.lrsa[i].n_lock_acquired = 0;
+ }
}
}
+ firsterr = call_rcu_chain_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+
lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
/* Prepare torture context. */
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
- onoff_interval * HZ);
- if (firsterr)
+ onoff_interval * HZ, NULL);
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shutdown_secs > 0) {
firsterr = torture_shutdown_init(shutdown_secs,
lock_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter > 0) {
- firsterr = torture_stutter_init(stutter);
- if (firsterr)
+ firsterr = torture_stutter_init(stutter, stutter);
+ if (torture_init_error(firsterr))
goto unwind;
}
- writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
- GFP_KERNEL);
- if (writer_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
- firsterr = -ENOMEM;
- goto unwind;
+ if (nwriters_stress) {
+ writer_tasks = kcalloc(cxt.nrealwriters_stress,
+ sizeof(writer_tasks[0]),
+ GFP_KERNEL);
+ if (writer_tasks == NULL) {
+ TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
}
+ /* cap nested_locks to MAX_NESTED_LOCKS */
+ if (nested_locks > MAX_NESTED_LOCKS)
+ nested_locks = MAX_NESTED_LOCKS;
+
if (cxt.cur_ops->readlock) {
- reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]),
+ reader_tasks = kcalloc(cxt.nrealreaders_stress,
+ sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ TOROUT_ERRSTRING("reader_tasks: Out of memory");
kfree(writer_tasks);
writer_tasks = NULL;
firsterr = -ENOMEM;
@@ -1024,10 +1413,13 @@ static int __init lock_torture_init(void)
goto create_reader;
/* Create writer. */
- firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
- writer_tasks[i]);
- if (firsterr)
+ firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i],
+ writer_tasks[i],
+ writer_fifo ? sched_set_fifo : NULL);
+ if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_writers))
+ torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers, true);
create_reader:
if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -1035,13 +1427,15 @@ static int __init lock_torture_init(void)
/* Create reader. */
firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
reader_tasks[j]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_readers))
+ torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers, true);
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();
@@ -1050,6 +1444,10 @@ static int __init lock_torture_init(void)
unwind:
torture_init_end();
lock_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_LOCK_TORTURE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 6a385aabcce7..5c92ba199b90 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* MCS lock defines
*
@@ -6,7 +7,7 @@
* The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
* with the desirable properties of being fair, and with each cpu trying
* to acquire the lock spinning on a local variable.
- * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * It avoids expensive cache bounces that common test-and-set spin-lock
* implementations incur.
*/
#ifndef __LINUX_MCS_SPINLOCK_H
@@ -14,22 +15,16 @@
#include <asm/mcs_spinlock.h>
-struct mcs_spinlock {
- struct mcs_spinlock *next;
- int locked; /* 1 if lock acquired */
- int count; /* nesting count, see qspinlock.c */
-};
-
#ifndef arch_mcs_spin_lock_contended
/*
- * Using smp_load_acquire() provides a memory barrier that ensures
- * subsequent operations happen after the lock is acquired.
+ * Using smp_cond_load_acquire() provides the acquire semantics
+ * required so that subsequent operations happen after the
+ * lock is acquired. Additionally, some architectures such as
+ * ARM64 would like to do spin-waiting instead of purely
+ * spinning, and smp_cond_load_acquire() provides that behavior.
*/
#define arch_mcs_spin_lock_contended(l) \
-do { \
- while (!(smp_load_acquire(l))) \
- cpu_relax(); \
-} while (0)
+ smp_cond_load_acquire(l, VAL)
#endif
#ifndef arch_mcs_spin_unlock_contended
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 9aa713629387..2c6b02d4699b 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -1,6 +1,4 @@
/*
- * kernel/mutex-debug.c
- *
* Debugging code for mutexes
*
* Started by Ingo Molnar:
@@ -14,6 +12,7 @@
*/
#include <linux/mutex.h>
#include <linux/delay.h>
+#include <linux/device.h>
#include <linux/export.h>
#include <linux/poison.h>
#include <linux/sched.h>
@@ -22,7 +21,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mutex-debug.h"
+#include "mutex.h"
/*
* Must be called with lock->wait_lock held.
@@ -32,11 +31,12 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
+ waiter->ww_ctx = MUTEX_POISON_WW_CTX;
}
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
{
- SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ lockdep_assert_held(&lock->wait_lock);
DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
@@ -51,21 +51,22 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
- SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ lockdep_assert_held(&lock->wait_lock);
- /* Mark the current thread as blocked on the lock: */
- task->blocked_on = waiter;
+ /* Current thread can't be already blocked (since it's executing!) */
+ DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
}
-void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
+ struct mutex *blocked_on = __get_task_blocked_on(task);
+
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
DEBUG_LOCKS_WARN_ON(waiter->task != task);
- DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
- task->blocked_on = NULL;
+ DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
- list_del_init(&waiter->list);
+ INIT_LIST_HEAD(&waiter->list);
waiter->task = NULL;
}
@@ -77,19 +78,22 @@ void debug_mutex_unlock(struct mutex *lock)
}
}
-void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key)
+void debug_mutex_init(struct mutex *lock)
{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
lock->magic = lock;
}
+static void devm_mutex_release(void *res)
+{
+ mutex_destroy(res);
+}
+
+int __devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+ return devm_add_action_or_reset(dev, devm_mutex_release, lock);
+}
+EXPORT_SYMBOL_GPL(__devm_mutex_init);
+
/***
* mutex_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
deleted file mode 100644
index 4174417d5309..000000000000
--- a/kernel/locking/mutex-debug.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Mutexes: blocking mutual exclusion locks
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal declarations,
- * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
- * More details are in kernel/mutex-debug.c.
- */
-
-/*
- * This must be called with lock->wait_lock held.
- */
-extern void debug_mutex_lock_common(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_wake_waiter(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
-extern void debug_mutex_add_waiter(struct mutex *lock,
- struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 858a07590e39..2a1d165b3167 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/locking/mutex.c
*
@@ -15,7 +16,7 @@
* by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
* and Sven Dietrich.
*
- * Also see Documentation/locking/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.rst.
*/
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
@@ -28,101 +29,108 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#include <linux/hung_task.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+
+#ifndef CONFIG_PREEMPT_RT
+#include "mutex.h"
#ifdef CONFIG_DEBUG_MUTEXES
-# include "mutex-debug.h"
+# define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond)
#else
-# include "mutex.h"
+# define MUTEX_WARN_ON(cond)
#endif
-void
-__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+static void __mutex_init_generic(struct mutex *lock)
{
atomic_long_set(&lock->owner, 0);
- spin_lock_init(&lock->wait_lock);
+ raw_spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
#endif
-
- debug_mutex_init(lock, name, key);
+ debug_mutex_init(lock);
}
-EXPORT_SYMBOL(__mutex_init);
-
-/*
- * @owner: contains: 'struct task_struct *' to the current lock owner,
- * NULL means not owned. Since task_struct pointers are aligned at
- * at least L1_CACHE_BYTES, we have low bits to store extra state.
- *
- * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
- * Bit1 indicates unlock needs to hand the lock to the top-waiter
- * Bit2 indicates handoff has been done and we're waiting for pickup.
- */
-#define MUTEX_FLAG_WAITERS 0x01
-#define MUTEX_FLAG_HANDOFF 0x02
-#define MUTEX_FLAG_PICKUP 0x04
-
-#define MUTEX_FLAGS 0x07
static inline struct task_struct *__owner_task(unsigned long owner)
{
return (struct task_struct *)(owner & ~MUTEX_FLAGS);
}
+bool mutex_is_locked(struct mutex *lock)
+{
+ return __mutex_owner(lock) != NULL;
+}
+EXPORT_SYMBOL(mutex_is_locked);
+
static inline unsigned long __owner_flags(unsigned long owner)
{
return owner & MUTEX_FLAGS;
}
+/* Do not use the return value as a pointer directly. */
+unsigned long mutex_get_owner(struct mutex *lock)
+{
+ unsigned long owner = atomic_long_read(&lock->owner);
+
+ return (unsigned long)__owner_task(owner);
+}
+
/*
- * Trylock variant that retuns the owning task on failure.
+ * Returns: __mutex_owner(lock) on failure or NULL on success.
*/
-static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
+static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff)
{
unsigned long owner, curr = (unsigned long)current;
owner = atomic_long_read(&lock->owner);
for (;;) { /* must loop, can race against a flag */
- unsigned long old, flags = __owner_flags(owner);
+ unsigned long flags = __owner_flags(owner);
unsigned long task = owner & ~MUTEX_FLAGS;
if (task) {
- if (likely(task != curr))
- break;
-
- if (likely(!(flags & MUTEX_FLAG_PICKUP)))
+ if (flags & MUTEX_FLAG_PICKUP) {
+ if (task != curr)
+ break;
+ flags &= ~MUTEX_FLAG_PICKUP;
+ } else if (handoff) {
+ if (flags & MUTEX_FLAG_HANDOFF)
+ break;
+ flags |= MUTEX_FLAG_HANDOFF;
+ } else {
break;
-
- flags &= ~MUTEX_FLAG_PICKUP;
+ }
} else {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP));
+ task = curr;
}
- /*
- * We set the HANDOFF bit, we must make sure it doesn't live
- * past the point where we acquire it. This would be possible
- * if we (accidentally) set the bit on an unlocked mutex.
- */
- flags &= ~MUTEX_FLAG_HANDOFF;
-
- old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
- if (old == owner)
- return NULL;
-
- owner = old;
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) {
+ if (task == curr)
+ return NULL;
+ break;
+ }
}
return __owner_task(owner);
}
/*
+ * Trylock or set HANDOFF
+ */
+static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff)
+{
+ return !__mutex_trylock_common(lock, handoff);
+}
+
+/*
* Actual trylock that will work on any unlocked state.
*/
static inline bool __mutex_trylock(struct mutex *lock)
{
- return !__mutex_trylock_or_owner(lock);
+ return !__mutex_trylock_common(lock, false);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -131,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock)
* There is nothing that would stop spreading the lockdep annotations outwards
* except more code.
*/
+void mutex_init_generic(struct mutex *lock)
+{
+ __mutex_init_generic(lock);
+}
+EXPORT_SYMBOL(mutex_init_generic);
/*
* Optimistic trylock that only works in the uncontended case. Make sure to
@@ -139,8 +152,11 @@ static inline bool __mutex_trylock(struct mutex *lock)
static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
{
unsigned long curr = (unsigned long)current;
+ unsigned long zero = 0UL;
+
+ MUTEX_WARN_ON(lock->magic != lock);
- if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
return true;
return false;
@@ -150,12 +166,23 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
{
unsigned long curr = (unsigned long)current;
- if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
- return true;
+ return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
+}
- return false;
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+ __mutex_init_generic(lock);
+
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
}
-#endif
+EXPORT_SYMBOL(mutex_init_lockep);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
{
@@ -173,8 +200,35 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait
}
/*
+ * Add @waiter to a given location in the lock wait_list and set the
+ * FLAG_WAITERS flag if it's the first waiter.
+ */
+static void
+__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct list_head *list)
+{
+ hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX);
+ debug_mutex_add_waiter(lock, waiter, current);
+
+ list_add_tail(&waiter->list, list);
+ if (__mutex_waiter_is_first(lock, waiter))
+ __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+}
+
+static void
+__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ list_del(&waiter->list);
+ if (likely(list_empty(&lock->wait_list)))
+ __mutex_clear_flag(lock, MUTEX_FLAGS);
+
+ debug_mutex_remove_waiter(lock, waiter, current);
+ hung_task_clear_blocker();
+}
+
+/*
* Give up ownership to a specific task, when @task = NULL, this is equivalent
- * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves
+ * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves
* WAITERS. Provides RELEASE semantics like a regular unlock, the
* __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
*/
@@ -183,23 +237,18 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
unsigned long owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old, new;
+ unsigned long new;
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
new = (owner & MUTEX_FLAG_WAITERS);
new |= (unsigned long)task;
if (task)
new |= MUTEX_FLAG_PICKUP;
- old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
- if (old == owner)
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new))
break;
-
- owner = old;
}
}
@@ -243,135 +292,18 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
-static __always_inline void
-ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
- /*
- * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
- * but released with a normal mutex_unlock in this call.
- *
- * This should never happen, always use ww_mutex_unlock.
- */
- DEBUG_LOCKS_WARN_ON(ww->ctx);
-
- /*
- * Not quite done after calling ww_acquire_done() ?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
- if (ww_ctx->contending_lock) {
- /*
- * After -EDEADLK you tried to
- * acquire a different ww_mutex? Bad!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
- /*
- * You called ww_mutex_lock after receiving -EDEADLK,
- * but 'forgot' to unlock everything else first?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
- ww_ctx->contending_lock = NULL;
- }
-
- /*
- * Naughty, using a different class will lead to undefined behavior!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
- ww_ctx->acquired++;
-}
-
-static inline bool __sched
-__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
-{
- return a->stamp - b->stamp <= LONG_MAX &&
- (a->stamp != b->stamp || a > b);
-}
-
-/*
- * Wake up any waiters that may have to back off when the lock is held by the
- * given context.
- *
- * Due to the invariants on the wait list, this can only affect the first
- * waiter with a context.
- *
- * The current task must not be on the wait list.
- */
-static void __sched
-__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
-
- lockdep_assert_held(&lock->wait_lock);
+#include "ww_mutex.h"
- list_for_each_entry(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- if (cur->ww_ctx->acquired > 0 &&
- __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) {
- debug_mutex_wake_waiter(lock, cur);
- wake_up_process(cur->task);
- }
-
- break;
- }
-}
-
-/*
- * After acquiring lock with fastpath or when we lost out in contested
- * slowpath, set ctx and wake up any waiters so they can recheck.
- */
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
-{
- ww_mutex_lock_acquired(lock, ctx);
-
- lock->ctx = ctx;
-
- /*
- * The lock->ctx update should be visible on all cores before
- * the atomic read is done, otherwise contended waiters might be
- * missed. The contended waiters will either see ww_ctx == NULL
- * and keep spinning, or it will acquire wait_lock, add itself
- * to waiter list and sleep.
- */
- smp_mb(); /* ^^^ */
-
- /*
- * Check if lock is contended, if not there is nobody to wake up
- */
- if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS)))
- return;
-
- /*
- * Uh oh, we raced in fastpath, wake up everyone in this case,
- * so they can see the new lock->ctx.
- */
- spin_lock(&lock->base.wait_lock);
- __ww_mutex_wakeup_for_backoff(&lock->base, ctx);
- spin_unlock(&lock->base.wait_lock);
-}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * After acquiring lock in the slowpath set ctx.
- *
- * Unlike for the fast path, the caller ensures that waiters are woken up where
- * necessary.
- *
- * Callers must hold the mutex wait_lock.
+ * Trylock variant that returns the owning task on failure.
*/
-static __always_inline void
-ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
{
- ww_mutex_lock_acquired(lock, ctx);
- lock->ctx = ctx;
+ return __mutex_trylock_common(lock, false);
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
static inline
bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
struct mutex_waiter *waiter)
@@ -426,21 +358,23 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
{
bool ret = true;
- rcu_read_lock();
+ lockdep_assert_preemption_disabled();
+
while (__mutex_owner(lock) == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * checking lock->owner still matches owner. And we already
+ * disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the
+ * task_strcut structure won't go away during the spinning
+ * period
*/
barrier();
/*
* Use vcpu_is_preempted to detect lock holder preemption issue.
*/
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
+ if (!owner_on_cpu(owner) || need_resched()) {
ret = false;
break;
}
@@ -452,7 +386,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
cpu_relax();
}
- rcu_read_unlock();
return ret;
}
@@ -465,19 +398,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
struct task_struct *owner;
int retval = 1;
+ lockdep_assert_preemption_disabled();
+
if (need_resched())
return 0;
- rcu_read_lock();
- owner = __mutex_owner(lock);
-
/*
- * As lock holder preemption issue, we both skip spinning if task is not
- * on cpu or its cpu is preempted
+ * We already disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the task_strcut
+ * structure won't go away during the spinning period.
*/
+ owner = __mutex_owner(lock);
if (owner)
- retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
- rcu_read_unlock();
+ retval = owner_on_cpu(owner);
/*
* If lock->owner is not set, the mutex has been released. Return true
@@ -510,7 +443,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
*/
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
if (!waiter) {
/*
@@ -586,7 +519,7 @@ fail:
#else
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
return false;
}
@@ -603,6 +536,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
* This function must not be used in interrupt context. Unlocking
* of a not locked mutex is not allowed.
*
+ * The caller must ensure that the mutex stays alive until this function has
+ * returned - mutex_unlock() can NOT directly be used to release an object such
+ * that another concurrent task can free it.
+ * Mutexes are different from spinlocks & refcounts in this aspect.
+ *
* This function is similar to (but not equivalent to) up().
*/
void __sched mutex_unlock(struct mutex *lock)
@@ -628,182 +566,102 @@ EXPORT_SYMBOL(mutex_unlock);
*/
void __sched ww_mutex_unlock(struct ww_mutex *lock)
{
- /*
- * The unlocking fastpath is the 0->1 transition from 'locked'
- * into 'unlocked' state:
- */
- if (lock->ctx) {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
-#endif
- if (lock->ctx->acquired > 0)
- lock->ctx->acquired--;
- lock->ctx = NULL;
- }
-
+ __ww_mutex_unlock(lock);
mutex_unlock(&lock->base);
}
EXPORT_SYMBOL(ww_mutex_unlock);
-static inline int __sched
-__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ctx)
-{
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
- struct mutex_waiter *cur;
-
- if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
- goto deadlock;
-
- /*
- * If there is a waiter in front of us that has a context, then its
- * stamp is earlier than ours and we must back off.
- */
- cur = waiter;
- list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
- if (cur->ww_ctx)
- goto deadlock;
- }
-
- return 0;
-
-deadlock:
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
- ctx->contending_lock = ww;
-#endif
- return -EDEADLK;
-}
-
-static inline int __sched
-__ww_mutex_add_waiter(struct mutex_waiter *waiter,
- struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
- struct list_head *pos;
-
- if (!ww_ctx) {
- list_add_tail(&waiter->list, &lock->wait_list);
- return 0;
- }
-
- /*
- * Add the waiter before the first waiter with a higher stamp.
- * Waiters without a context are skipped to avoid starving
- * them.
- */
- pos = &lock->wait_list;
- list_for_each_entry_reverse(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
- /* Back off immediately if necessary. */
- if (ww_ctx->acquired > 0) {
-#ifdef CONFIG_DEBUG_MUTEXES
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
- ww_ctx->contending_lock = ww;
-#endif
- return -EDEADLK;
- }
-
- break;
- }
-
- pos = &cur->list;
-
- /*
- * Wake up the waiter so that it gets a chance to back
- * off.
- */
- if (cur->ww_ctx->acquired > 0) {
- debug_mutex_wake_waiter(lock, cur);
- wake_up_process(cur->task);
- }
- }
-
- list_add_tail(&waiter->list, pos);
- return 0;
-}
-
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
static __always_inline int __sched
-__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
+ DEFINE_WAKE_Q(wake_q);
struct mutex_waiter waiter;
- bool first = false;
struct ww_mutex *ww;
+ unsigned long flags;
int ret;
+ if (!use_ww_ctx)
+ ww_ctx = NULL;
+
might_sleep();
+ MUTEX_WARN_ON(lock->magic != lock);
+
ww = container_of(lock, struct ww_mutex, base);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
}
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
if (__mutex_trylock(lock) ||
- mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
+ mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
+ trace_contention_end(lock, 0);
preempt_enable();
return 0;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
/*
* After waiting to acquire the wait_lock, try again.
*/
if (__mutex_trylock(lock)) {
- if (use_ww_ctx && ww_ctx)
- __ww_mutex_wakeup_for_backoff(lock, ww_ctx);
+ if (ww_ctx)
+ __ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
goto skip_wait;
}
debug_mutex_lock_common(lock, &waiter);
- debug_mutex_add_waiter(lock, &waiter, current);
+ waiter.task = current;
+ if (use_ww_ctx)
+ waiter.ww_ctx = ww_ctx;
lock_contended(&lock->dep_map, ip);
if (!use_ww_ctx) {
/* add waiting tasks to the end of the waitqueue (FIFO): */
- list_add_tail(&waiter.list, &lock->wait_list);
-
-#ifdef CONFIG_DEBUG_MUTEXES
- waiter.ww_ctx = MUTEX_POISON_WW_CTX;
-#endif
+ __mutex_add_waiter(lock, &waiter, &lock->wait_list);
} else {
- /* Add in stamp order, waking up waiters that must back off. */
- ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
+ /*
+ * Add in stamp order, waking up waiters that must kill
+ * themselves.
+ */
+ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx, &wake_q);
if (ret)
- goto err_early_backoff;
-
- waiter.ww_ctx = ww_ctx;
+ goto err_early_kill;
}
- waiter.task = current;
-
- if (__mutex_waiter_is_first(lock, &waiter))
- __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
-
+ __set_task_blocked_on(current, lock);
set_current_state(state);
+ trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
+ bool first;
+
/*
* Once we hold wait_lock, we're serialized against
* mutex_unlock() handing the lock off to us, do a trylock
@@ -814,92 +672,154 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto acquired;
/*
- * Check for signals and wound conditions while holding
+ * Check for signals and kill conditions while holding
* wait_lock. This ensures the lock cancellation is ordered
* against mutex_unlock() and wake-ups do not go missing.
*/
- if (unlikely(signal_pending_state(state, current))) {
+ if (signal_pending_state(state, current)) {
ret = -EINTR;
goto err;
}
- if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) {
- ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx);
+ if (ww_ctx) {
+ ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
}
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
+
schedule_preempt_disabled();
+ first = __mutex_waiter_is_first(lock, &waiter);
+
/*
- * ww_mutex needs to always recheck its position since its waiter
- * list is not FIFO ordered.
+ * As we likely have been woken up by task
+ * that has cleared our blocked_on state, re-set
+ * it to the lock we are trying to acquire.
*/
- if ((use_ww_ctx && ww_ctx) || !first) {
- first = __mutex_waiter_is_first(lock, &waiter);
- if (first)
- __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
- }
-
+ set_task_blocked_on(current, lock);
set_current_state(state);
/*
* Here we order against unlock; we must either see it change
* state back to RUNNING and fall through the next schedule(),
* or we must see its unlock and acquire.
*/
- if (__mutex_trylock(lock) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+ if (__mutex_trylock_or_handoff(lock, first))
break;
- spin_lock(&lock->wait_lock);
+ if (first) {
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ /*
+ * mutex_optimistic_spin() can call schedule(), so
+ * clear blocked on so we don't become unselectable
+ * to run.
+ */
+ clear_task_blocked_on(current, lock);
+ if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ break;
+ set_task_blocked_on(current, lock);
+ trace_contention_begin(lock, LCB_F_MUTEX);
+ }
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
acquired:
+ __clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
- mutex_remove_waiter(lock, &waiter, current);
- if (likely(list_empty(&lock->wait_list)))
- __mutex_clear_flag(lock, MUTEX_FLAGS);
+ if (ww_ctx) {
+ /*
+ * Wound-Wait; we stole the lock (!first_waiter), check the
+ * waiters as anyone might want to wound us.
+ */
+ if (!ww_ctx->is_wait_die &&
+ !__mutex_waiter_is_first(lock, &waiter))
+ __ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
+ }
+
+ __mutex_remove_waiter(lock, &waiter);
debug_mutex_free_waiter(&waiter);
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
+ trace_contention_end(lock, 0);
- if (use_ww_ctx && ww_ctx)
- ww_mutex_set_context_slowpath(ww, ww_ctx);
+ if (ww_ctx)
+ ww_mutex_lock_acquired(ww, ww_ctx);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
preempt_enable();
return 0;
err:
+ __clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
- mutex_remove_waiter(lock, &waiter, current);
-err_early_backoff:
- spin_unlock(&lock->wait_lock);
+ __mutex_remove_waiter(lock, &waiter);
+err_early_kill:
+ WARN_ON(__get_task_blocked_on(current));
+ trace_contention_end(lock, ret);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
- mutex_release(&lock->dep_map, 1, ip);
+ mutex_release(&lock->dep_map, ip);
preempt_enable();
return ret;
}
static int __sched
-__mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip)
{
return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
}
static int __sched
-__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
- struct lockdep_map *nest_lock, unsigned long ip,
- struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
+ unsigned long ip, struct ww_acquire_ctx *ww_ctx)
{
- return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true);
+ return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
+}
+
+/**
+ * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context
+ * @ww: mutex to lock
+ * @ww_ctx: optional w/w acquire context
+ *
+ * Trylocks a mutex with the optional acquire context; no deadlock detection is
+ * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
+ *
+ * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is
+ * specified, -EALREADY handling may happen in calls to ww_mutex_trylock.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+ if (!ww_ctx)
+ return mutex_trylock(&ww->base);
+
+ MUTEX_WARN_ON(ww->base.magic != &ww->base);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__mutex_trylock(&ww->base)) {
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
}
+EXPORT_SYMBOL(ww_mutex_trylock);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
@@ -918,11 +838,12 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
int __sched
-mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
+_mutex_lock_killable(struct mutex *lock, unsigned int subclass,
+ struct lockdep_map *nest)
{
- return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+ return __mutex_lock(lock, TASK_KILLABLE, subclass, nest, _RET_IP_);
}
-EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+EXPORT_SYMBOL_GPL(_mutex_lock_killable);
int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
@@ -978,8 +899,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -994,8 +914,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -1014,8 +933,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
struct task_struct *next = NULL;
DEFINE_WAKE_Q(wake_q);
unsigned long owner;
+ unsigned long flags;
- mutex_release(&lock->dep_map, 1, ip);
+ mutex_release(&lock->dep_map, ip);
/*
* Release the lock before (potentially) taking the spinlock such that
@@ -1026,29 +946,21 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
*/
owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old;
-
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
if (owner & MUTEX_FLAG_HANDOFF)
break;
- old = atomic_long_cmpxchg_release(&lock->owner, owner,
- __owner_flags(owner));
- if (old == owner) {
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) {
if (owner & MUTEX_FLAG_WAITERS)
break;
return;
}
-
- owner = old;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
@@ -1059,15 +971,14 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
next = waiter->task;
debug_mutex_wake_waiter(lock, waiter);
+ __clear_task_blocked_on(next, lock);
wake_q_add(&wake_q, next);
}
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- spin_unlock(&lock->wait_lock);
-
- wake_up_q(&wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -1082,15 +993,16 @@ static noinline int __sched
__mutex_lock_interruptible_slowpath(struct mutex *lock);
/**
- * mutex_lock_interruptible - acquire the mutex, interruptible
- * @lock: the mutex to be acquired
+ * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals.
+ * @lock: The mutex to be acquired.
*
- * Lock the mutex like mutex_lock(), and return 0 if the mutex has
- * been acquired or sleep until the mutex becomes available. If a
- * signal arrives while waiting for the lock then this function
- * returns -EINTR.
+ * Lock the mutex like mutex_lock(). If a signal is delivered while the
+ * process is sleeping, this function will return without acquiring the
+ * mutex.
*
- * This function is similar to (but not equivalent to) down_interruptible().
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * signal arrived.
*/
int __sched mutex_lock_interruptible(struct mutex *lock)
{
@@ -1104,6 +1016,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock_interruptible);
+/**
+ * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals.
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). If a signal which will be fatal to
+ * the current process is delivered while the process is sleeping, this
+ * function will return without acquiring the mutex.
+ *
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * fatal signal arrived.
+ */
int __sched mutex_lock_killable(struct mutex *lock)
{
might_sleep();
@@ -1115,6 +1039,16 @@ int __sched mutex_lock_killable(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_lock_killable);
+/**
+ * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). While the task is waiting for this
+ * mutex, it will be accounted as being in the IO wait state by the
+ * scheduler.
+ *
+ * Context: Process context.
+ */
void __sched mutex_lock_io(struct mutex *lock)
{
int token;
@@ -1146,7 +1080,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock)
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
@@ -1154,12 +1088,13 @@ static noinline int __sched
__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
#endif
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
/**
* mutex_trylock - try to acquire the mutex, without waiting
* @lock: the mutex to be acquired
@@ -1176,14 +1111,24 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
*/
int __sched mutex_trylock(struct mutex *lock)
{
- bool locked = __mutex_trylock(lock);
+ MUTEX_WARN_ON(lock->magic != lock);
+ return __mutex_trylock(lock);
+}
+EXPORT_SYMBOL(mutex_trylock);
+#else
+int __sched _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock)
+{
+ bool locked;
+ MUTEX_WARN_ON(lock->magic != lock);
+ locked = __mutex_trylock(lock);
if (locked)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
return locked;
}
-EXPORT_SYMBOL(mutex_trylock);
+EXPORT_SYMBOL(_mutex_trylock_nest_lock);
+#endif
#ifndef CONFIG_DEBUG_LOCK_ALLOC
int __sched
@@ -1216,7 +1161,11 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
}
EXPORT_SYMBOL(ww_mutex_lock_interruptible);
-#endif
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+#endif /* !CONFIG_PREEMPT_RT */
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 6ebc1902f779..9ad4da8cea00 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -1,24 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Mutexes: blocking mutual exclusion locks
*
* started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal prototypes, for the
- * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * This is the control structure for tasks blocked on mutex, which resides
+ * on the blocked task's kernel stack:
+ */
+struct mutex_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ struct ww_acquire_ctx *ww_ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+ void *magic;
+#endif
+};
-#define mutex_remove_waiter(lock, waiter, task) \
- __list_del((waiter)->list.prev, (waiter)->list.next)
+/*
+ * @owner: contains: 'struct task_struct *' to the current lock owner,
+ * NULL means not owned. Since task_struct pointers are aligned at
+ * at least L1_CACHE_BYTES, we have low bits to store extra state.
+ *
+ * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
+ * Bit1 indicates unlock needs to hand the lock to the top-waiter
+ * Bit2 indicates handoff has been done and we're waiting for pickup.
+ */
+#define MUTEX_FLAG_WAITERS 0x01
+#define MUTEX_FLAG_HANDOFF 0x02
+#define MUTEX_FLAG_PICKUP 0x04
-#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
-#define debug_mutex_free_waiter(waiter) do { } while (0)
-#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
-#define debug_mutex_unlock(lock) do { } while (0)
-#define debug_mutex_init(lock, name, key) do { } while (0)
+#define MUTEX_FLAGS 0x07
-static inline void
-debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+/*
+ * Internal helper function; C doesn't allow us to hide it :/
+ *
+ * DO NOT USE (outside of mutex & scheduler code).
+ */
+static inline struct task_struct *__mutex_owner(struct mutex *lock)
{
+ if (!lock)
+ return NULL;
+ return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
}
+
+#ifdef CONFIG_DEBUG_MUTEXES
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock);
+#else /* CONFIG_DEBUG_MUTEXES */
+# define debug_mutex_lock_common(lock, waiter) do { } while (0)
+# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
+# define debug_mutex_free_waiter(waiter) do { } while (0)
+# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_unlock(lock) do { } while (0)
+# define debug_mutex_init(lock) do { } while (0)
+#endif /* !CONFIG_DEBUG_MUTEXES */
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index a3167941093b..b4233dc2c2b0 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>
@@ -10,6 +11,13 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # + 1 value */
+};
+
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
/*
@@ -36,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
*/
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
- struct optimistic_spin_node *prev)
+ int old_cpu)
{
- struct optimistic_spin_node *next = NULL;
int curr = encode_cpu(smp_processor_id());
- int old;
-
- /*
- * If there is a prev node in queue, then the 'old' value will be
- * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
- * we're currently last in queue, then the queue will then become empty.
- */
- old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
for (;;) {
if (atomic_read(&lock->tail) == curr &&
- atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+ atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
* unlock()/unqueue().
*/
- break;
+ return NULL;
}
/*
@@ -75,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock,
* wait for a new @node->next from its Step-C.
*/
if (node->next) {
+ struct optimistic_spin_node *next;
+
next = xchg(&node->next, NULL);
if (next)
- break;
+ return next;
}
cpu_relax();
}
-
- return next;
}
bool osq_lock(struct optimistic_spin_queue *lock)
@@ -109,6 +113,19 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
+
+ /*
+ * osq_lock() unqueue
+ *
+ * node->prev = prev osq_wait_next()
+ * WMB MB
+ * prev->next = node next->prev = prev // unqueue-C
+ *
+ * Here 'node->prev' and 'next->prev' are the same variable and we need
+ * to ensure these stores happen in-order to avoid corrupting the list.
+ */
+ smp_wmb();
+
WRITE_ONCE(prev->next, node);
/*
@@ -120,20 +137,17 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!READ_ONCE(node->locked)) {
- /*
- * If we need to reschedule bail... so we can block.
- * Use vcpu_is_preempted() to avoid waiting for a preempted
- * lock holder:
- */
- if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
- goto unqueue;
-
- cpu_relax();
- }
- return true;
+ /*
+ * Wait to acquire the lock or cancellation. Note that need_resched()
+ * will come with an IPI, which will wake smp_cond_load_relaxed() if it
+ * is implemented with a monitor-wait. vcpu_is_preempted() relies on
+ * polling, be careful.
+ */
+ if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
+ vcpu_is_preempted(node_cpu(node->prev))))
+ return true;
-unqueue:
+ /* unqueue */
/*
* Step - A -- stabilize @prev
*
@@ -143,13 +157,17 @@ unqueue:
*/
for (;;) {
- if (prev->next == node &&
+ /*
+ * cpu_relax() below implies a compiler barrier which would
+ * prevent this comparison being optimized away.
+ */
+ if (data_race(prev->next) == node &&
cmpxchg(&prev->next, node, NULL) == node)
break;
/*
* We can only fail the cmpxchg() racing against an unlock(),
- * in which case we should observe @node->locked becomming
+ * in which case we should observe @node->locked becoming
* true.
*/
if (smp_load_acquire(&node->locked))
@@ -171,7 +189,7 @@ unqueue:
* back to @prev.
*/
- next = osq_wait_next(lock, node, prev);
+ next = osq_wait_next(lock, node, prev->cpu);
if (!next)
return false;
@@ -197,8 +215,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
/*
* Fast path for the uncontended case.
*/
- if (likely(atomic_cmpxchg_release(&lock->tail, curr,
- OSQ_UNLOCKED_VAL) == curr))
+ if (atomic_try_cmpxchg_release(&lock->tail, &curr, OSQ_UNLOCKED_VAL))
return;
/*
@@ -211,7 +228,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
return;
}
- next = osq_wait_next(lock, node, NULL);
+ next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
if (next)
WRITE_ONCE(next->locked, 1);
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 883cf1b92d90..ef234469baac 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,24 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
-#include <linux/rwsem.h>
#include <linux/percpu.h>
+#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/errno.h>
+#include <trace/events/lock.h>
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
- const char *name, struct lock_class_key *rwsem_key)
+ const char *name, struct lock_class_key *key)
{
sem->read_count = alloc_percpu(int);
if (unlikely(!sem->read_count))
return -ENOMEM;
- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
- __init_rwsem(&sem->rw_sem, name, rwsem_key);
+ rcu_sync_init(&sem->rss);
rcuwait_init(&sem->writer);
- sem->readers_block = 0;
+ init_waitqueue_head(&sem->waiters);
+ atomic_set(&sem->block, 0);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -38,77 +45,149 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
+ this_cpu_inc(*sem->read_count);
+
/*
* Due to having preemption disabled the decrement happens on
* the same CPU as the increment, avoiding the
* increment-on-one-CPU-and-decrement-on-another problem.
*
- * If the reader misses the writer's assignment of readers_block, then
- * the writer is guaranteed to see the reader's increment.
+ * If the reader misses the writer's assignment of sem->block, then the
+ * writer is guaranteed to see the reader's increment.
*
* Conversely, any readers that increment their sem->read_count after
- * the writer looks are guaranteed to see the readers_block value,
- * which in turn means that they are guaranteed to immediately
- * decrement their sem->read_count, so that it doesn't matter that the
- * writer missed them.
+ * the writer looks are guaranteed to see the sem->block value, which
+ * in turn means that they are guaranteed to immediately decrement
+ * their sem->read_count, so that it doesn't matter that the writer
+ * missed them.
*/
smp_mb(); /* A matches D */
/*
- * If !readers_block the critical section starts here, matched by the
+ * If !sem->block the critical section starts here, matched by the
* release in percpu_up_write().
*/
- if (likely(!smp_load_acquire(&sem->readers_block)))
+ if (likely(!atomic_read_acquire(&sem->block)))
+ return true;
+
+ this_cpu_dec(*sem->read_count);
+
+ /* Prod writer to re-evaluate readers_active_check() */
+ rcuwait_wake_up(&sem->writer);
+
+ return false;
+}
+
+static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
+{
+ if (atomic_read(&sem->block))
+ return false;
+
+ return atomic_xchg(&sem->block, 1) == 0;
+}
+
+static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
+{
+ if (reader) {
+ bool ret;
+
+ preempt_disable();
+ ret = __percpu_down_read_trylock(sem);
+ preempt_enable();
+
+ return ret;
+ }
+ return __percpu_down_write_trylock(sem);
+}
+
+/*
+ * The return value of wait_queue_entry::func means:
+ *
+ * <0 - error, wakeup is terminated and the error is returned
+ * 0 - no wakeup, a next waiter is tried
+ * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
+ *
+ * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
+ * and play games with the return value to allow waking multiple readers.
+ *
+ * Specifically, we wake readers until we've woken a single writer, or until a
+ * trylock fails.
+ */
+static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int wake_flags,
+ void *key)
+{
+ bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
+ struct percpu_rw_semaphore *sem = key;
+ struct task_struct *p;
+
+ /* concurrent against percpu_down_write(), can get stolen */
+ if (!__percpu_rwsem_trylock(sem, reader))
return 1;
- /*
- * Per the above comment; we still have preemption disabled and
- * will thus decrement on the same CPU as we incremented.
- */
- __percpu_up_read(sem);
+ p = get_task_struct(wq_entry->private);
+ list_del_init(&wq_entry->entry);
+ smp_store_release(&wq_entry->private, NULL);
- if (try)
- return 0;
+ wake_up_process(p);
+ put_task_struct(p);
- /*
- * We either call schedule() in the wait, or we'll fall through
- * and reschedule on the preempt_enable() in percpu_down_read().
- */
- preempt_enable_no_resched();
+ return !reader; /* wake (readers until) 1 writer */
+}
+
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader,
+ bool freeze)
+{
+ DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
+ bool wait;
+ spin_lock_irq(&sem->waiters.lock);
/*
- * Avoid lockdep for the down/up_read() we already have them.
+ * Serialize against the wakeup in percpu_up_write(), if we fail
+ * the trylock, the wakeup must see us on the list.
*/
- __down_read(&sem->rw_sem);
- this_cpu_inc(*sem->read_count);
- __up_read(&sem->rw_sem);
+ wait = !__percpu_rwsem_trylock(sem, reader);
+ if (wait) {
+ wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
+ __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
+ }
+ spin_unlock_irq(&sem->waiters.lock);
- preempt_disable();
- return 1;
+ while (wait) {
+ set_current_state(TASK_UNINTERRUPTIBLE |
+ (freeze ? TASK_FREEZABLE : 0));
+ if (!smp_load_acquire(&wq_entry.private))
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
}
-EXPORT_SYMBOL_GPL(__percpu_down_read);
-void __percpu_up_read(struct percpu_rw_semaphore *sem)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try,
+ bool freeze)
{
- smp_mb(); /* B matches C */
- /*
- * In other words, if they see our decrement (presumably to aggregate
- * zero, as that is the only time it matters) they will also see our
- * critical section.
- */
- __this_cpu_dec(*sem->read_count);
+ if (__percpu_down_read_trylock(sem))
+ return true;
- /* Prod writer to recheck readers_active */
- rcuwait_wake_up(&sem->writer);
+ if (try)
+ return false;
+
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
+ preempt_enable();
+ percpu_rwsem_wait(sem, /* .reader = */ true, freeze);
+ preempt_disable();
+ trace_contention_end(sem, 0);
+
+ return true;
}
-EXPORT_SYMBOL_GPL(__percpu_up_read);
+EXPORT_SYMBOL_GPL(__percpu_down_read);
#define per_cpu_sum(var) \
({ \
- typeof(var) __sum = 0; \
+ TYPEOF_UNQUAL(var) __sum = 0; \
int cpu; \
compiletime_assert_atomic_type(__sum); \
for_each_possible_cpu(cpu) \
@@ -116,11 +195,19 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
__sum; \
})
+bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
+{
+ return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
+}
+EXPORT_SYMBOL_GPL(percpu_is_read_locked);
+
/*
* Return true if the modular sum of the sem->read_count per-CPU variable is
* zero. If this sum is zero, then it is stable due to the fact that if any
* newly arriving readers increment a given counter, they will immediately
* decrement that same counter.
+ *
+ * Assumes sem->block is set.
*/
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
@@ -137,34 +224,45 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *sem)
+void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ bool contended = false;
+
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
- down_write(&sem->rw_sem);
-
/*
- * Notify new readers to block; up until now, and thus throughout the
- * longish rcu_sync_enter() above, new readers could still come in.
+ * Try set sem->block; this provides writer-writer exclusion.
+ * Having sem->block set makes new readers block.
*/
- WRITE_ONCE(sem->readers_block, 1);
+ if (!__percpu_down_write_trylock(sem)) {
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
+ percpu_rwsem_wait(sem, /* .reader = */ false, false);
+ contended = true;
+ }
- smp_mb(); /* D matches A */
+ /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
/*
- * If they don't see our writer of readers_block, then we are
- * guaranteed to see their sem->read_count increment, and therefore
- * will wait for them.
+ * If they don't see our store of sem->block, then we are guaranteed to
+ * see their sem->read_count increment, and therefore will wait for
+ * them.
*/
- /* Wait for all now active readers to complete. */
- rcuwait_wait_event(&sem->writer, readers_active_check(sem));
+ /* Wait for all active readers to complete. */
+ rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+ if (contended)
+ trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);
void percpu_up_write(struct percpu_rw_semaphore *sem)
{
+ rwsem_release(&sem->dep_map, _RET_IP_);
+
/*
* Signal the writer is done, no fast path yet.
*
@@ -175,12 +273,12 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
* Therefore we force it through the slow path which guarantees an
* acquire and thereby guarantees the critical section's consistency.
*/
- smp_store_release(&sem->readers_block, 0);
+ atomic_set_release(&sem->block, 0);
/*
- * Release the write lock, this will allow readers back in the game.
+ * Prod any pending reader/writer to make progress.
*/
- up_write(&sem->rw_sem);
+ __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);
/*
* Once this completes (at least one RCU-sched grace period hence) the
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 2655f26ec882..d2ef312a8611 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Queued read/write locks
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
*
* Authors: Waiman Long <waiman.long@hp.com>
@@ -21,51 +12,13 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
-#include <asm/qrwlock.h>
-
-/*
- * This internal data structure is used for optimizing access to some of
- * the subfields within the atomic_t cnts.
- */
-struct __qrwlock {
- union {
- atomic_t cnts;
- struct {
-#ifdef __LITTLE_ENDIAN
- u8 wmode; /* Writer mode */
- u8 rcnts[3]; /* Reader counts */
-#else
- u8 rcnts[3]; /* Reader counts */
- u8 wmode; /* Writer mode */
-#endif
- };
- };
- arch_spinlock_t lock;
-};
-
-/**
- * rspin_until_writer_unlock - inc reader count & spin until writer is gone
- * @lock : Pointer to queue rwlock structure
- * @writer: Current queue rwlock writer status byte
- *
- * In interrupt context or at the head of the queue, the reader will just
- * increment the reader count & wait until the writer releases the lock.
- */
-static __always_inline void
-rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
-{
- while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- cpu_relax();
- cnts = atomic_read_acquire(&lock->cnts);
- }
-}
+#include <trace/events/lock.h>
/**
- * queued_read_lock_slowpath - acquire read lock of a queue rwlock
- * @lock: Pointer to queue rwlock structure
- * @cnts: Current qrwlock lock value
+ * queued_read_lock_slowpath - acquire read lock of a queued rwlock
+ * @lock: Pointer to queued rwlock structure
*/
-void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
+void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -73,78 +26,67 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
if (unlikely(in_interrupt())) {
/*
* Readers in interrupt context will get the lock immediately
- * if the writer is just waiting (not holding the lock yet).
- * The rspin_until_writer_unlock() function returns immediately
- * in this case. Otherwise, they will spin (with ACQUIRE
- * semantics) until the lock is available without waiting in
- * the queue.
+ * if the writer is just waiting (not holding the lock yet),
+ * so spin with ACQUIRE semantics until the lock is available
+ * without waiting in the queue.
*/
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
return;
}
atomic_sub(_QR_BIAS, &lock->cnts);
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
+
/*
* Put the reader into the wait queue
*/
arch_spin_lock(&lock->wait_lock);
+ atomic_add(_QR_BIAS, &lock->cnts);
/*
* The ACQUIRE semantics of the following spinning code ensure
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
*/
- cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
/*
* Signal the next one in queue to become queue head
*/
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queued_write_lock_slowpath - acquire write lock of a queue rwlock
- * @lock : Pointer to queue rwlock structure
+ * queued_write_lock_slowpath - acquire write lock of a queued rwlock
+ * @lock : Pointer to queued rwlock structure
*/
-void queued_write_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
{
- u32 cnts;
+ int cnts;
+
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
/* Try to acquire the lock directly if no reader is present */
- if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
+ if (!(cnts = atomic_read(&lock->cnts)) &&
+ atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED))
goto unlock;
- /*
- * Set the waiting flag to notify readers that a writer is pending,
- * or wait for a previous writer to go away.
- */
- for (;;) {
- struct __qrwlock *l = (struct __qrwlock *)lock;
-
- if (!READ_ONCE(l->wmode) &&
- (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
- break;
+ /* Set the waiting flag to notify readers that a writer is pending */
+ atomic_or(_QW_WAITING, &lock->cnts);
- cpu_relax();
- }
-
- /* When no more readers, set the locked flag */
- for (;;) {
- cnts = atomic_read(&lock->cnts);
- if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
- break;
-
- cpu_relax();
- }
+ /* When no more readers or writers, set the locked flag */
+ do {
+ cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
+ } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index fd24153e8a48..af8d122bb649 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -1,22 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Queued spinlock
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
- * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
* (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <waiman.long@hpe.com>
+ * Authors: Waiman Long <longman@redhat.com>
* Peter Zijlstra <peterz@infradead.org>
*/
@@ -31,18 +22,26 @@
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
+#include <trace/events/lock.h>
+
+/*
+ * Include queued spinlock definitions and statistics code
+ */
+#include "qspinlock.h"
+#include "qspinlock_stat.h"
/*
* The basic principle of a queue-based spinlock can best be understood
* by studying a classic queue-based spinlock implementation called the
- * MCS lock. The paper below provides a good description for this kind
- * of lock.
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
*
- * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
*
- * This queued spinlock implementation is based on the MCS lock, however to make
- * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
- * API, we must modify it somehow.
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
*
* In particular; where the traditional MCS lock consists of a tail pointer
* (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
@@ -70,12 +69,6 @@
#include "mcs_spinlock.h"
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define MAX_NODES 8
-#else
-#define MAX_NODES 4
-#endif
-
/*
* Per-CPU queue node structures; we can never have more than 4 nested
* contexts: task, softirq, hardirq, nmi.
@@ -84,164 +77,7 @@
*
* PV doubles the storage and uses the second cacheline for PV state.
*/
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
-
-/*
- * We must be able to distinguish between no-tail and the tail at 0:0,
- * therefore increment the cpu number by one.
- */
-
-static inline __pure u32 encode_tail(int cpu, int idx)
-{
- u32 tail;
-
-#ifdef CONFIG_DEBUG_SPINLOCK
- BUG_ON(idx > 3);
-#endif
- tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
- tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
-
- return tail;
-}
-
-static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
-{
- int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
- int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
-
- return per_cpu_ptr(&mcs_nodes[idx], cpu);
-}
-
-#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
-
-/*
- * By using the whole 2nd least significant byte for the pending bit, we
- * can allow better optimization of the lock acquisition for the pending
- * bit holder.
- *
- * This internal structure is also used by the set_locked function which
- * is not restricted to _Q_PENDING_BITS == 8.
- */
-struct __qspinlock {
- union {
- atomic_t val;
-#ifdef __LITTLE_ENDIAN
- struct {
- u8 locked;
- u8 pending;
- };
- struct {
- u16 locked_pending;
- u16 tail;
- };
-#else
- struct {
- u16 tail;
- u16 locked_pending;
- };
- struct {
- u8 reserved[2];
- u8 pending;
- u8 locked;
- };
-#endif
- };
-};
-
-#if _Q_PENDING_BITS == 8
-/**
- * clear_pending_set_locked - take ownership and clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,0 -> *,0,1
- *
- * Lock stealing is not allowed if this function is used.
- */
-static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
-{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
-}
-
-/*
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail)
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
- struct __qspinlock *l = (void *)lock;
-
- /*
- * Use release semantics to make sure that the MCS node is properly
- * initialized before changing the tail code.
- */
- return (u32)xchg_release(&l->tail,
- tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
-}
-
-#else /* _Q_PENDING_BITS == 8 */
-
-/**
- * clear_pending_set_locked - take ownership and clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,0 -> *,0,1
- */
-static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
-{
- atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
-}
-
-/**
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail)
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
- u32 old, new, val = atomic_read(&lock->val);
-
- for (;;) {
- new = (val & _Q_LOCKED_PENDING_MASK) | tail;
- /*
- * Use release semantics to make sure that the MCS node is
- * properly initialized before changing the tail code.
- */
- old = atomic_cmpxchg_release(&lock->val, val, new);
- if (old == val)
- break;
-
- val = old;
- }
- return old;
-}
-#endif /* _Q_PENDING_BITS == 8 */
-
-/**
- * set_locked - Set the lock bit and own the lock
- * @lock: Pointer to queued spinlock structure
- *
- * *,*,0 -> *,0,1
- */
-static __always_inline void set_locked(struct qspinlock *lock)
-{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
-}
-
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]);
/*
* Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
@@ -268,123 +104,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
-/*
- * Various notes on spin_is_locked() and spin_unlock_wait(), which are
- * 'interesting' functions:
- *
- * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
- * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
- * PPC). Also qspinlock has a similar issue per construction, the setting of
- * the locked byte can be unordered acquiring the lock proper.
- *
- * This gets to be 'interesting' in the following cases, where the /should/s
- * end up false because of this issue.
- *
- *
- * CASE 1:
- *
- * So the spin_is_locked() correctness issue comes from something like:
- *
- * CPU0 CPU1
- *
- * global_lock(); local_lock(i)
- * spin_lock(&G) spin_lock(&L[i])
- * for (i) if (!spin_is_locked(&G)) {
- * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
- * return;
- * }
- * // deal with fail
- *
- * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
- * that there is exclusion between the two critical sections.
- *
- * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
- * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
- * /should/ be constrained by the ACQUIRE from spin_lock(&G).
- *
- * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
- *
- *
- * CASE 2:
- *
- * For spin_unlock_wait() there is a second correctness issue, namely:
- *
- * CPU0 CPU1
- *
- * flag = set;
- * smp_mb(); spin_lock(&l)
- * spin_unlock_wait(&l); if (!flag)
- * // add to lockless list
- * spin_unlock(&l);
- * // iterate lockless list
- *
- * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
- * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
- * semantics etc..)
- *
- * Where flag /should/ be ordered against the locked store of l.
- */
-
-/*
- * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
- * issuing an _unordered_ store to set _Q_LOCKED_VAL.
- *
- * This means that the store can be delayed, but no later than the
- * store-release from the unlock. This means that simply observing
- * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
- *
- * There are two paths that can issue the unordered store:
- *
- * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
- *
- * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
- * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
- *
- * However, in both cases we have other !0 state we've set before to queue
- * ourseves:
- *
- * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
- * load is constrained by that ACQUIRE to not pass before that, and thus must
- * observe the store.
- *
- * For (2) we have a more intersting scenario. We enqueue ourselves using
- * xchg_tail(), which ends up being a RELEASE. This in itself is not
- * sufficient, however that is followed by an smp_cond_acquire() on the same
- * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
- * guarantees we must observe that store.
- *
- * Therefore both cases have other !0 state that is observable before the
- * unordered locked byte store comes through. This means we can use that to
- * wait for the lock store, and then wait for an unlock.
- */
-#ifndef queued_spin_unlock_wait
-void queued_spin_unlock_wait(struct qspinlock *lock)
-{
- u32 val;
-
- for (;;) {
- val = atomic_read(&lock->val);
-
- if (!val) /* not locked, we're done */
- goto done;
-
- if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
- break;
-
- /* not locked, but pending, wait until we observe the lock */
- cpu_relax();
- }
-
- /* any unlock is good */
- while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
- cpu_relax();
-
-done:
- smp_acquire__after_ctrl_dep();
-}
-EXPORT_SYMBOL(queued_spin_unlock_wait);
-#endif
-
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
@@ -408,82 +127,82 @@ EXPORT_SYMBOL(queued_spin_unlock_wait);
* contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
* queue : ^--' :
*/
-void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
struct mcs_spinlock *prev, *next, *node;
- u32 new, old, tail;
+ u32 old, tail;
int idx;
BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
if (pv_enabled())
- goto queue;
+ goto pv_queue;
if (virt_spin_lock(lock))
return;
/*
- * wait for in-progress pending->locked hand-overs
+ * Wait for in-progress pending->locked hand-overs with a bounded
+ * number of spins so that we guarantee forward progress.
*
* 0,1,0 -> 0,0,1
*/
if (val == _Q_PENDING_VAL) {
- while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
- cpu_relax();
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(&lock->val,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
}
/*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ /*
* trylock || pending
*
- * 0,0,0 -> 0,0,1 ; trylock
- * 0,0,1 -> 0,1,1 ; pending
+ * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
*/
- for (;;) {
- /*
- * If we observe any contention; queue.
- */
- if (val & ~_Q_LOCKED_MASK)
- goto queue;
+ val = queued_fetch_set_pending_acquire(lock);
- new = _Q_LOCKED_VAL;
- if (val == new)
- new |= _Q_PENDING_VAL;
+ /*
+ * If we observe contention, there is a concurrent locker.
+ *
+ * Undo and queue; our setting of PENDING might have made the
+ * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+ * on @next to become !NULL.
+ */
+ if (unlikely(val & ~_Q_LOCKED_MASK)) {
- /*
- * Acquire semantic is required here as the function may
- * return immediately if the lock was free.
- */
- old = atomic_cmpxchg_acquire(&lock->val, val, new);
- if (old == val)
- break;
+ /* Undo PENDING if we set it. */
+ if (!(val & _Q_PENDING_MASK))
+ clear_pending(lock);
- val = old;
+ goto queue;
}
/*
- * we won the trylock
- */
- if (new == _Q_LOCKED_VAL)
- return;
-
- /*
- * we're pending, wait for the owner to go away.
+ * We're pending, wait for the owner to go away.
*
- * *,1,1 -> *,1,0
+ * 0,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
- * sequentiality; this is because not all clear_pending_set_locked()
- * implementations imply full barriers.
+ * sequentiality; this is because not all
+ * clear_pending_set_locked() implementations imply full
+ * barriers.
*/
- smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
+ if (val & _Q_LOCKED_MASK)
+ smp_cond_load_acquire(&lock->locked, !VAL);
/*
* take ownership and clear the pending bit.
*
- * *,1,0 -> *,0,1
+ * 0,1,0 -> 0,0,1
*/
clear_pending_set_locked(lock);
+ lockevent_inc(lock_pending);
return;
/*
@@ -491,11 +210,44 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* queuing.
*/
queue:
- node = this_cpu_ptr(&mcs_nodes[0]);
+ lockevent_inc(lock_slowpath);
+pv_queue:
+ node = this_cpu_ptr(&qnodes[0].mcs);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
- node += idx;
+ trace_contention_begin(lock, LCB_F_SPIN);
+
+ /*
+ * 4 nodes are allocated based on the assumption that there will
+ * not be nested NMIs taking spinlocks. That may not be true in
+ * some architectures even though the chance of needing more than
+ * 4 nodes will still be extremely unlikely. When that happens,
+ * we fall back to spinning on the lock directly without using
+ * any MCS node. This is not the most elegant solution, but is
+ * simple enough.
+ */
+ if (unlikely(idx >= _Q_MAX_NODES)) {
+ lockevent_inc(lock_no_node);
+ while (!queued_spin_trylock(lock))
+ cpu_relax();
+ goto release;
+ }
+
+ node = grab_mcs_node(node, idx);
+
+ /*
+ * Keep counts of non-zero index values:
+ */
+ lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
+
+ /*
+ * Ensure that we increment the head node->count before initialising
+ * the actual node. If the compiler is kind enough to reorder these
+ * stores, then an IRQ could overwrite our assignments.
+ */
+ barrier();
+
node->locked = 0;
node->next = NULL;
pv_init_node(node);
@@ -509,12 +261,18 @@ queue:
goto release;
/*
+ * Ensure that the initialisation of @node is complete before we
+ * publish the updated tail via xchg_tail() and potentially link
+ * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+ */
+ smp_wmb();
+
+ /*
+ * Publish the updated tail.
* We have already touched the queueing cacheline; don't bother with
* pending stuff.
*
* p,*,* -> n,*,*
- *
- * RELEASE, such that the stores to @node must be complete.
*/
old = xchg_tail(lock, tail);
next = NULL;
@@ -524,16 +282,9 @@ queue:
* head of the waitqueue.
*/
if (old & _Q_TAIL_MASK) {
- prev = decode_tail(old);
- /*
- * The above xchg_tail() is also a load of @lock which generates,
- * through decode_tail(), a pointer.
- *
- * The address dependency matches the RELEASE of xchg_tail()
- * such that the access to @prev must happen after.
- */
- smp_read_barrier_depends();
+ prev = decode_tail(old, qnodes);
+ /* Link @node into the waitqueue. */
WRITE_ONCE(prev->next, node);
pv_wait_node(node, prev);
@@ -563,8 +314,8 @@ queue:
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
- * smp_cond_load_acquire() call. As the next PV queue head hasn't been
- * designated yet, there is no way for the locked value to become
+ * atomic_cond_read_acquire() call. As the next PV queue head hasn't
+ * been designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
*
@@ -574,53 +325,58 @@ queue:
if ((val = pv_wait_head_or_lock(lock, node)))
goto locked;
- val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
+ val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));
locked:
/*
* claim the lock:
*
* n,0,0 -> 0,0,1 : lock, uncontended
- * *,0,0 -> *,0,1 : lock, contended
+ * *,*,0 -> *,*,1 : lock, contended
*
- * If the queue head is the only one in the queue (lock value == tail),
- * clear the tail code and grab the lock. Otherwise, we only need
- * to grab the lock.
+ * If the queue head is the only one in the queue (lock value == tail)
+ * and nobody is pending, clear the tail code and grab the lock.
+ * Otherwise, we only need to grab the lock.
*/
- for (;;) {
- /* In the PV case we might already have _Q_LOCKED_VAL set */
- if ((val & _Q_TAIL_MASK) != tail) {
- set_locked(lock);
- break;
- }
- /*
- * The smp_cond_load_acquire() call above has provided the
- * necessary acquire semantics required for locking. At most
- * two iterations of this loop may be ran.
- */
- old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
- if (old == val)
- goto release; /* No contention */
- val = old;
+ /*
+ * In the PV case we might already have _Q_LOCKED_VAL set, because
+ * of lock stealing; therefore we must also allow:
+ *
+ * n,0,1 -> 0,0,1
+ *
+ * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+ * above wait condition, therefore any concurrent setting of
+ * PENDING will make the uncontended transition fail.
+ */
+ if ((val & _Q_TAIL_MASK) == tail) {
+ if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ goto release; /* No contention */
}
/*
+ * Either somebody is queued behind us or _Q_PENDING_VAL got set
+ * which will then detect the remaining tail and queue behind us
+ * ensuring we'll see a @next.
+ */
+ set_locked(lock);
+
+ /*
* contended path; wait for next if not observed yet, release.
*/
- if (!next) {
- while (!(next = READ_ONCE(node->next)))
- cpu_relax();
- }
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
release:
+ trace_contention_end(lock, 0);
+
/*
* release the node
*/
- __this_cpu_dec(mcs_nodes[0].count);
+ __this_cpu_dec(qnodes[0].mcs.count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);
@@ -644,4 +400,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#include "qspinlock_paravirt.h"
#include "qspinlock.c"
+bool nopvspin;
+static __init int parse_nopvspin(char *arg)
+{
+ nopvspin = true;
+ return 0;
+}
+early_param("nopvspin", parse_nopvspin);
#endif
diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h
new file mode 100644
index 000000000000..d69958a844f7
--- /dev/null
+++ b/kernel/locking/qspinlock.h
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Queued spinlock defines
+ *
+ * This file contains macro definitions and functions shared between different
+ * qspinlock slow path implementations.
+ */
+#ifndef __LINUX_QSPINLOCK_H
+#define __LINUX_QSPINLOCK_H
+
+#include <asm-generic/percpu.h>
+#include <linux/percpu-defs.h>
+#include <asm-generic/qspinlock.h>
+#include <asm-generic/mcs_spinlock.h>
+
+#define _Q_MAX_NODES 4
+
+/*
+ * The pending bit spinning loop count.
+ * This heuristic is used to limit the number of lockword accesses
+ * made by atomic_cond_read_relaxed when waiting for the lock to
+ * transition out of the "== _Q_PENDING_VAL" state. We don't spin
+ * indefinitely because there's no guarantee that we'll make forward
+ * progress.
+ */
+#ifndef _Q_PENDING_LOOPS
+#define _Q_PENDING_LOOPS 1
+#endif
+
+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. IOW, only
+ * two of them can fit in a cacheline in this case. That is OK as it is rare
+ * to have more than 2 levels of slowpath nesting in actual use. We don't
+ * want to penalize pvqspinlocks to optimize for a rare case in native
+ * qspinlocks.
+ */
+struct qnode {
+ struct mcs_spinlock mcs;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+ long reserved[2];
+#endif
+};
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline __pure u32 encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline __pure struct mcs_spinlock *decode_tail(u32 tail,
+ struct qnode __percpu *qnodes)
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+ return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+
+static inline __pure
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+ return &((struct qnode *)base + idx)->mcs;
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->pending, 0);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail), which heads an address dependency
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ /*
+ * We can use relaxed semantics since the caller ensures that the
+ * MCS node is properly initialized before updating the tail.
+ */
+ return (u32)xchg_relaxed(&lock->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ u32 old, new;
+
+ old = atomic_read(&lock->val);
+ do {
+ new = (old & _Q_LOCKED_PENDING_MASK) | tail;
+ /*
+ * We can use relaxed semantics since the caller ensures that
+ * the MCS node is properly initialized before updating the
+ * tail.
+ */
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
+
+ return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
+ * @lock : Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+ return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
+}
+
+#endif /* __LINUX_QSPINLOCK_H */
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 4ccfcaae5b89..dc1cb90e3644 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _GEN_PV_LOCK_SLOWPATH
#error "do not include this file"
#endif
#include <linux/hash.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/debug_locks.h>
/*
@@ -37,44 +38,65 @@
#define PV_PREV_CHECK_MASK 0xff
/*
- * Queue node uses: vcpu_running & vcpu_halted.
- * Queue head uses: vcpu_running & vcpu_hashed.
+ * Queue node uses: VCPU_RUNNING & VCPU_HALTED.
+ * Queue head uses: VCPU_RUNNING & VCPU_HASHED.
*/
enum vcpu_state {
- vcpu_running = 0,
- vcpu_halted, /* Used only in pv_wait_node */
- vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
+ VCPU_RUNNING = 0,
+ VCPU_HALTED, /* Used only in pv_wait_node */
+ VCPU_HASHED, /* = pv_hash'ed + VCPU_HALTED */
};
struct pv_node {
struct mcs_spinlock mcs;
- struct mcs_spinlock __res[3];
-
int cpu;
u8 state;
};
/*
- * Include queued spinlock statistics code
- */
-#include "qspinlock_stat.h"
-
-/*
+ * Hybrid PV queued/unfair lock
+ *
* By replacing the regular queued_spin_trylock() with the function below,
* it will be called once when a lock waiter enter the PV slowpath before
- * being queued. By allowing one lock stealing attempt here when the pending
- * bit is off, it helps to reduce the performance impact of lock waiter
- * preemption without the drawback of lock starvation.
+ * being queued.
+ *
+ * The pending bit is set by the queue head vCPU of the MCS wait queue in
+ * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
+ * When that bit becomes visible to the incoming waiters, no lock stealing
+ * is allowed. The function will return immediately to make the waiters
+ * enter the MCS wait queue. So lock starvation shouldn't happen as long
+ * as the queued mode vCPUs are actively running to set the pending bit
+ * and hence disabling lock stealing.
+ *
+ * When the pending bit isn't set, the lock waiters will stay in the unfair
+ * mode spinning on the lock unless the MCS wait queue is empty. In this
+ * case, the lock waiters will enter the queued mode slowpath trying to
+ * become the queue head and set the pending bit.
+ *
+ * This hybrid PV queued/unfair lock combines the best attributes of a
+ * queued lock (no lock starvation) and an unfair lock (good performance
+ * on not heavily contended locks).
*/
-#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
-static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
+static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
+ /*
+ * Stay in unfair lock mode as long as queued mode waiters are
+ * present in the MCS wait queue but the pending bit isn't set.
+ */
+ for (;;) {
+ int val = atomic_read(&lock->val);
+ u8 old = 0;
+
+ if (!(val & _Q_LOCKED_PENDING_MASK) &&
+ try_cmpxchg_acquire(&lock->locked, &old, _Q_LOCKED_VAL)) {
+ lockevent_inc(pv_lock_stealing);
+ return true;
+ }
+ if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
+ break;
- if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
- qstat_inc(qstat_pv_lock_stealing, true);
- return true;
+ cpu_relax();
}
return false;
@@ -87,30 +109,20 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
#if _Q_PENDING_BITS == 8
static __always_inline void set_pending(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->pending, 1);
-}
-
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->pending, 0);
+ WRITE_ONCE(lock->pending, 1);
}
/*
* The pending bit check in pv_queued_spin_steal_lock() isn't a memory
- * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
- * just to be sure that it will get it.
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock just to be sure that it will get it.
*/
-static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
+ u16 old = _Q_PENDING_VAL;
- return !READ_ONCE(l->locked) &&
- (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
- == _Q_PENDING_VAL);
+ return !READ_ONCE(lock->locked) &&
+ try_cmpxchg_acquire(&lock->locked_pending, &old, _Q_LOCKED_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
@@ -118,32 +130,21 @@ static __always_inline void set_pending(struct qspinlock *lock)
atomic_or(_Q_PENDING_VAL, &lock->val);
}
-static __always_inline void clear_pending(struct qspinlock *lock)
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
{
- atomic_andnot(_Q_PENDING_VAL, &lock->val);
-}
-
-static __always_inline int trylock_clear_pending(struct qspinlock *lock)
-{
- int val = atomic_read(&lock->val);
-
- for (;;) {
- int old, new;
-
- if (val & _Q_LOCKED_MASK)
- break;
+ int old, new;
+ old = atomic_read(&lock->val);
+ do {
+ if (old & _Q_LOCKED_MASK)
+ return false;
/*
* Try to clear pending bit & set locked bit
*/
- old = val;
- new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
- val = atomic_cmpxchg(&lock->val, old, new);
+ new = (old & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ } while (!atomic_try_cmpxchg_acquire (&lock->val, &old, new));
- if (val == old)
- return 1;
- }
- return 0;
+ return true;
}
#endif /* _Q_PENDING_BITS == 8 */
@@ -211,10 +212,11 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ struct qspinlock *old = NULL;
hopcnt++;
- if (!cmpxchg(&he->lock, NULL, lock)) {
+ if (try_cmpxchg(&he->lock, &old, lock)) {
WRITE_ONCE(he->node, node);
- qstat_hop(hopcnt);
+ lockevent_pv_hop(hopcnt);
return &he->lock;
}
}
@@ -264,7 +266,7 @@ pv_wait_early(struct pv_node *prev, int loop)
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
- return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
+ return READ_ONCE(prev->state) != VCPU_RUNNING;
}
/*
@@ -274,10 +276,10 @@ static void pv_init_node(struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
- BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+ BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode));
pn->cpu = smp_processor_id();
- pn->state = vcpu_running;
+ pn->state = VCPU_RUNNING;
}
/*
@@ -289,8 +291,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
- int loop;
bool wait_early;
+ int loop;
for (;;) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
@@ -306,26 +308,26 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
/*
* Order pn->state vs pn->locked thusly:
*
- * [S] pn->state = vcpu_halted [S] next->locked = 1
+ * [S] pn->state = VCPU_HALTED [S] next->locked = 1
* MB MB
- * [L] pn->locked [RmW] pn->state = vcpu_hashed
+ * [L] pn->locked [RmW] pn->state = VCPU_HASHED
*
* Matches the cmpxchg() from pv_kick_node().
*/
- smp_store_mb(pn->state, vcpu_halted);
+ smp_store_mb(pn->state, VCPU_HALTED);
if (!READ_ONCE(node->locked)) {
- qstat_inc(qstat_pv_wait_node, true);
- qstat_inc(qstat_pv_wait_early, wait_early);
- pv_wait(&pn->state, vcpu_halted);
+ lockevent_inc(pv_wait_node);
+ lockevent_cond_inc(pv_wait_early, wait_early);
+ pv_wait(&pn->state, VCPU_HALTED);
}
/*
- * If pv_kick_node() changed us to vcpu_hashed, retain that
+ * If pv_kick_node() changed us to VCPU_HASHED, retain that
* value so that pv_wait_head_or_lock() knows to not also try
* to hash this lock.
*/
- cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+ cmpxchg(&pn->state, VCPU_HALTED, VCPU_RUNNING);
/*
* If the locked flag is still not set after wakeup, it is a
@@ -334,7 +336,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
- qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
+ lockevent_cond_inc(pv_spurious_wakeup,
+ !READ_ONCE(node->locked));
}
/*
@@ -354,16 +357,24 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
- struct __qspinlock *l = (void *)lock;
-
+ u8 old = VCPU_HALTED;
/*
* If the vCPU is indeed halted, advance its state to match that of
* pv_wait_node(). If OTOH this fails, the vCPU was running and will
* observe its next->locked value and advance itself.
*
* Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ *
+ * The write to next->locked in arch_mcs_spin_unlock_contended()
+ * must be ordered before the read of pn->state in the cmpxchg()
+ * below for the code to work correctly. To guarantee full ordering
+ * irrespective of the success or failure of the cmpxchg(),
+ * a relaxed version with explicit barrier is used. The control
+ * dependency will order the reading of pn->state before any
+ * subsequent writes.
*/
- if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ smp_mb__before_atomic();
+ if (!try_cmpxchg_relaxed(&pn->state, &old, VCPU_HASHED))
return;
/*
@@ -373,7 +384,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
* the hash table later on at unlock time, no atomic instruction is
* needed.
*/
- WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+ WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
(void)pv_hash(lock, pn);
}
@@ -388,7 +399,6 @@ static u32
pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
- struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop;
@@ -397,20 +407,20 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
* If pv_kick_node() already advanced our state, we don't need to
* insert ourselves into the hash table anymore.
*/
- if (READ_ONCE(pn->state) == vcpu_hashed)
+ if (READ_ONCE(pn->state) == VCPU_HASHED)
lp = (struct qspinlock **)1;
/*
* Tracking # of slowpath locking operations
*/
- qstat_inc(qstat_pv_lock_slowpath, true);
+ lockevent_inc(lock_slowpath);
for (;; waitcnt++) {
/*
* Set correct vCPU state to be used by queue node wait-early
* mechanism.
*/
- WRITE_ONCE(pn->state, vcpu_running);
+ WRITE_ONCE(pn->state, VCPU_RUNNING);
/*
* Set the pending bit in the active lock spinning loop to
@@ -439,21 +449,21 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
- if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
+ if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
/*
* The lock was free and now we own the lock.
* Change the lock value back to _Q_LOCKED_VAL
* and unhash the table.
*/
- WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
goto gotlock;
}
}
- WRITE_ONCE(pn->state, vcpu_hashed);
- qstat_inc(qstat_pv_wait_head, true);
- qstat_inc(qstat_pv_wait_again, waitcnt);
- pv_wait(&l->locked, _Q_SLOW_VAL);
+ WRITE_ONCE(pn->state, VCPU_HASHED);
+ lockevent_inc(pv_wait_head);
+ lockevent_cond_inc(pv_wait_again, waitcnt);
+ pv_wait(&lock->locked, _Q_SLOW_VAL);
/*
* Because of lock stealing, the queue head vCPU may not be
@@ -472,13 +482,22 @@ gotlock:
}
/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
+ */
+#include <asm/qspinlock_paravirt.h>
+
+/*
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
*/
-__visible void
+__visible __lockfunc void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
- struct __qspinlock *l = (void *)lock;
struct pv_node *node;
if (unlikely(locked != _Q_SLOW_VAL)) {
@@ -507,7 +526,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
* Now that we have a reference to the (likely) blocked pv_node,
* release the lock.
*/
- smp_store_release(&l->locked, 0);
+ smp_store_release(&lock->locked, 0);
/*
* At this point the memory pointed at by lock can be freed/reused,
@@ -516,33 +535,21 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
* vCPU is harmless other than the additional latency in completing
* the unlock.
*/
- qstat_inc(qstat_pv_kick_unlock, true);
+ lockevent_inc(pv_kick_unlock);
pv_kick(node->cpu);
}
-/*
- * Include the architecture specific callee-save thunk of the
- * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
- * function close to each other sharing consecutive instruction cachelines.
- * Alternatively, architecture specific version of __pv_queued_spin_unlock()
- * can be defined.
- */
-#include <asm/qspinlock_paravirt.h>
-
#ifndef __pv_queued_spin_unlock
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
- u8 locked;
+ u8 locked = _Q_LOCKED_VAL;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
+ if (try_cmpxchg_release(&lock->locked, &locked, 0))
return;
__pv_queued_spin_unlock_slowpath(lock, locked);
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 4a30ef63c607..e625bb410aa2 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -1,252 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * Authors: Waiman Long <waiman.long@hpe.com>
+ * Authors: Waiman Long <longman@redhat.com>
*/
-/*
- * When queued spinlock statistical counters are enabled, the following
- * debugfs files will be created for reporting the counter values:
- *
- * <debugfs>/qlockstat/
- * pv_hash_hops - average # of hops per hashing operation
- * pv_kick_unlock - # of vCPU kicks issued at unlock time
- * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
- * pv_latency_kick - average latency (ns) of vCPU kick operation
- * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
- * pv_lock_slowpath - # of locking operations via the slowpath
- * pv_lock_stealing - # of lock stealing operations
- * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
- * pv_wait_again - # of wait's after a queue head vCPU kick
- * pv_wait_early - # of early vCPU wait's
- * pv_wait_head - # of vCPU wait's at the queue head
- * pv_wait_node - # of vCPU wait's at a non-head queue node
- *
- * Writing to the "reset_counters" file will reset all the above counter
- * values.
- *
- * These statistical counters are implemented as per-cpu variables which are
- * summed and computed whenever the corresponding debugfs files are read. This
- * minimizes added overhead making the counters usable even in a production
- * environment.
- *
- * There may be slight difference between pv_kick_wake and pv_kick_unlock.
- */
-enum qlock_stats {
- qstat_pv_hash_hops,
- qstat_pv_kick_unlock,
- qstat_pv_kick_wake,
- qstat_pv_latency_kick,
- qstat_pv_latency_wake,
- qstat_pv_lock_slowpath,
- qstat_pv_lock_stealing,
- qstat_pv_spurious_wakeup,
- qstat_pv_wait_again,
- qstat_pv_wait_early,
- qstat_pv_wait_head,
- qstat_pv_wait_node,
- qstat_num, /* Total number of statistical counters */
- qstat_reset_cnts = qstat_num,
-};
+#include "lock_events.h"
-#ifdef CONFIG_QUEUED_LOCK_STAT
+#ifdef CONFIG_LOCK_EVENT_COUNTS
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
/*
- * Collect pvqspinlock statistics
+ * Collect pvqspinlock locking event counts
*/
-#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/fs.h>
-static const char * const qstat_names[qstat_num + 1] = {
- [qstat_pv_hash_hops] = "pv_hash_hops",
- [qstat_pv_kick_unlock] = "pv_kick_unlock",
- [qstat_pv_kick_wake] = "pv_kick_wake",
- [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
- [qstat_pv_latency_kick] = "pv_latency_kick",
- [qstat_pv_latency_wake] = "pv_latency_wake",
- [qstat_pv_lock_slowpath] = "pv_lock_slowpath",
- [qstat_pv_lock_stealing] = "pv_lock_stealing",
- [qstat_pv_wait_again] = "pv_wait_again",
- [qstat_pv_wait_early] = "pv_wait_early",
- [qstat_pv_wait_head] = "pv_wait_head",
- [qstat_pv_wait_node] = "pv_wait_node",
- [qstat_reset_cnts] = "reset_counters",
-};
+#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev]
/*
- * Per-cpu counters
+ * PV specific per-cpu counter
*/
-static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
static DEFINE_PER_CPU(u64, pv_kick_time);
/*
- * Function to read and return the qlock statistical counter values
+ * Function to read and return the PV qspinlock counts.
*
* The following counters are handled specially:
- * 1. qstat_pv_latency_kick
+ * 1. pv_latency_kick
* Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
- * 2. qstat_pv_latency_wake
+ * 2. pv_latency_wake
* Average wake latency (ns) = pv_latency_wake/pv_kick_wake
- * 3. qstat_pv_hash_hops
+ * 3. pv_hash_hops
* Average hops/hash = pv_hash_hops/pv_kick_unlock
*/
-static ssize_t qstat_read(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[64];
- int cpu, counter, len;
- u64 stat = 0, kicks = 0;
+ int cpu, id, len;
+ u64 sum = 0, kicks = 0;
/*
* Get the counter ID stored in file->f_inode->i_private
*/
- counter = (long)file_inode(file)->i_private;
+ id = (long)file_inode(file)->i_private;
- if (counter >= qstat_num)
+ if (id >= lockevent_num)
return -EBADF;
for_each_possible_cpu(cpu) {
- stat += per_cpu(qstats[counter], cpu);
+ sum += per_cpu(lockevents[id], cpu);
/*
- * Need to sum additional counter for some of them
+ * Need to sum additional counters for some of them
*/
- switch (counter) {
+ switch (id) {
- case qstat_pv_latency_kick:
- case qstat_pv_hash_hops:
- kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+ case LOCKEVENT_pv_latency_kick:
+ case LOCKEVENT_pv_hash_hops:
+ kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);
break;
- case qstat_pv_latency_wake:
- kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+ case LOCKEVENT_pv_latency_wake:
+ kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);
break;
}
}
- if (counter == qstat_pv_hash_hops) {
+ if (id == LOCKEVENT_pv_hash_hops) {
u64 frac = 0;
if (kicks) {
- frac = 100ULL * do_div(stat, kicks);
+ frac = 100ULL * do_div(sum, kicks);
frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
}
/*
* Return a X.XX decimal number
*/
- len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+ len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n",
+ sum, frac);
} else {
/*
* Round to the nearest ns
*/
- if ((counter == qstat_pv_latency_kick) ||
- (counter == qstat_pv_latency_wake)) {
+ if ((id == LOCKEVENT_pv_latency_kick) ||
+ (id == LOCKEVENT_pv_latency_wake)) {
if (kicks)
- stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+ sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);
}
- len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
}
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
/*
- * Function to handle write request
- *
- * When counter = reset_cnts, reset all the counter values.
- * Since the counter updates aren't atomic, the resetting is done twice
- * to make sure that the counters are very likely to be all cleared.
- */
-static ssize_t qstat_write(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- int cpu;
-
- /*
- * Get the counter ID stored in file->f_inode->i_private
- */
- if ((long)file_inode(file)->i_private != qstat_reset_cnts)
- return count;
-
- for_each_possible_cpu(cpu) {
- int i;
- unsigned long *ptr = per_cpu_ptr(qstats, cpu);
-
- for (i = 0 ; i < qstat_num; i++)
- WRITE_ONCE(ptr[i], 0);
- }
- return count;
-}
-
-/*
- * Debugfs data structures
- */
-static const struct file_operations fops_qstat = {
- .read = qstat_read,
- .write = qstat_write,
- .llseek = default_llseek,
-};
-
-/*
- * Initialize debugfs for the qspinlock statistical counters
- */
-static int __init init_qspinlock_stat(void)
-{
- struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
- int i;
-
- if (!d_qstat)
- goto out;
-
- /*
- * Create the debugfs files
- *
- * As reading from and writing to the stat files can be slow, only
- * root is allowed to do the read/write to limit impact to system
- * performance.
- */
- for (i = 0; i < qstat_num; i++)
- if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
- (void *)(long)i, &fops_qstat))
- goto fail_undo;
-
- if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
- (void *)(long)qstat_reset_cnts, &fops_qstat))
- goto fail_undo;
-
- return 0;
-fail_undo:
- debugfs_remove_recursive(d_qstat);
-out:
- pr_warn("Could not create 'qlockstat' debugfs entries\n");
- return -ENOMEM;
-}
-fs_initcall(init_qspinlock_stat);
-
-/*
- * Increment the PV qspinlock statistical counters
- */
-static inline void qstat_inc(enum qlock_stats stat, bool cond)
-{
- if (cond)
- this_cpu_inc(qstats[stat]);
-}
-
-/*
* PV hash hop count
*/
-static inline void qstat_hop(int hopcnt)
+static inline void lockevent_pv_hop(int hopcnt)
{
- this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+ this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);
}
/*
@@ -258,7 +111,7 @@ static inline void __pv_kick(int cpu)
per_cpu(pv_kick_time, cpu) = start;
pv_kick(cpu);
- this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+ this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);
}
/*
@@ -271,18 +124,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
*pkick_time = 0;
pv_wait(ptr, val);
if (*pkick_time) {
- this_cpu_add(qstats[qstat_pv_latency_wake],
+ this_cpu_add(EVENT_COUNT(pv_latency_wake),
sched_clock() - *pkick_time);
- qstat_inc(qstat_pv_kick_wake, true);
+ lockevent_inc(pv_kick_wake);
}
}
#define pv_kick(c) __pv_kick(c)
#define pv_wait(p, v) __pv_wait(p, v)
-#else /* CONFIG_QUEUED_LOCK_STAT */
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#else /* CONFIG_LOCK_EVENT_COUNTS */
-static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
-static inline void qstat_hop(int hopcnt) { }
+static inline void lockevent_pv_hop(int hopcnt) { }
-#endif /* CONFIG_QUEUED_LOCK_STAT */
+#endif /* CONFIG_LOCK_EVENT_COUNTS */
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
deleted file mode 100644
index ac35e648b0e5..000000000000
--- a/kernel/locking/rtmutex-debug.c
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This code is based on the rt.c implementation in the preempt-rt tree.
- * Portions of said code are
- *
- * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
- * Copyright (C) 2006 Esben Nielsen
- * Copyright (C) 2006 Kihon Technologies Inc.,
- * Steven Rostedt <rostedt@goodmis.org>
- *
- * See rt.c in preempt-rt for proper credits and further information
- */
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/debug.h>
-#include <linux/delay.h>
-#include <linux/export.h>
-#include <linux/spinlock.h>
-#include <linux/kallsyms.h>
-#include <linux/syscalls.h>
-#include <linux/interrupt.h>
-#include <linux/rbtree.h>
-#include <linux/fs.h>
-#include <linux/debug_locks.h>
-
-#include "rtmutex_common.h"
-
-static void printk_task(struct task_struct *p)
-{
- if (p)
- printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
- else
- printk("<none>");
-}
-
-static void printk_lock(struct rt_mutex *lock, int print_owner)
-{
- if (lock->name)
- printk(" [%p] {%s}\n",
- lock, lock->name);
- else
- printk(" [%p] {%s:%d}\n",
- lock, lock->file, lock->line);
-
- if (print_owner && rt_mutex_owner(lock)) {
- printk(".. ->owner: %p\n", lock->owner);
- printk(".. held by: ");
- printk_task(rt_mutex_owner(lock));
- printk("\n");
- }
-}
-
-void rt_mutex_debug_task_free(struct task_struct *task)
-{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
- DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
-}
-
-/*
- * We fill out the fields in the waiter to store the information about
- * the deadlock. We print when we return. act_waiter can be NULL in
- * case of a remove waiter operation.
- */
-void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *act_waiter,
- struct rt_mutex *lock)
-{
- struct task_struct *task;
-
- if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
- return;
-
- task = rt_mutex_owner(act_waiter->lock);
- if (task && task != current) {
- act_waiter->deadlock_task_pid = get_pid(task_pid(task));
- act_waiter->deadlock_lock = lock;
- }
-}
-
-void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
-{
- struct task_struct *task;
-
- if (!waiter->deadlock_lock || !debug_locks)
- return;
-
- rcu_read_lock();
- task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
- if (!task) {
- rcu_read_unlock();
- return;
- }
-
- if (!debug_locks_off()) {
- rcu_read_unlock();
- return;
- }
-
- pr_warn("\n");
- pr_warn("============================================\n");
- pr_warn("WARNING: circular locking deadlock detected!\n");
- pr_warn("%s\n", print_tainted());
- pr_warn("--------------------------------------------\n");
- printk("%s/%d is deadlocking current task %s/%d\n\n",
- task->comm, task_pid_nr(task),
- current->comm, task_pid_nr(current));
-
- printk("\n1) %s/%d is trying to acquire this lock:\n",
- current->comm, task_pid_nr(current));
- printk_lock(waiter->lock, 1);
-
- printk("\n2) %s/%d is blocked on this lock:\n",
- task->comm, task_pid_nr(task));
- printk_lock(waiter->deadlock_lock, 1);
-
- debug_show_held_locks(current);
- debug_show_held_locks(task);
-
- printk("\n%s/%d's [blocked] stackdump:\n\n",
- task->comm, task_pid_nr(task));
- show_stack(task, NULL);
- printk("\n%s/%d's [current] stackdump:\n\n",
- current->comm, task_pid_nr(current));
- dump_stack();
- debug_show_all_locks();
- rcu_read_unlock();
-
- printk("[ turning off deadlock detection."
- "Please report this trace. ]\n\n");
-}
-
-void debug_rt_mutex_lock(struct rt_mutex *lock)
-{
-}
-
-void debug_rt_mutex_unlock(struct rt_mutex *lock)
-{
- DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
-}
-
-void
-debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
-{
-}
-
-void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
-{
- DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
-}
-
-void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
-{
- memset(waiter, 0x11, sizeof(*waiter));
- waiter->deadlock_task_pid = NULL;
-}
-
-void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
-{
- put_pid(waiter->deadlock_task_pid);
- memset(waiter, 0x22, sizeof(*waiter));
-}
-
-void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key)
-{
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lock->name = name;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
deleted file mode 100644
index 5078c6ddf4a5..000000000000
--- a/kernel/locking/rtmutex-debug.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This file contains macros used solely by rtmutex.c. Debug version.
- */
-
-extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
-extern void debug_rt_mutex_lock(struct rt_mutex *lock);
-extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
- struct task_struct *powner);
-extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *waiter,
- struct rt_mutex *lock);
-extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
-# define debug_rt_mutex_reset_waiter(w) \
- do { (w)->deadlock_lock = NULL; } while (0)
-
-static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
- enum rtmutex_chainwalk walk)
-{
- return (waiter != NULL);
-}
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- debug_rt_mutex_print_deadlock(w);
-}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 649dc9d3951a..c80902eacd79 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* RT-Mutexes: simple blocking mutual exclusion locks with PI support
*
@@ -7,19 +8,62 @@
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
*
- * See Documentation/locking/rt-mutex-design.txt for details.
+ * See Documentation/locking/rt-mutex-design.rst for details.
*/
-#include <linux/spinlock.h>
-#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
-#include <linux/sched/deadline.h>
#include <linux/sched/wake_q.h>
-#include <linux/sched/debug.h>
-#include <linux/timer.h>
+#include <linux/ww_mutex.h>
+
+#include <trace/events/lock.h>
#include "rtmutex_common.h"
+#include "lock_events.h"
+
+#ifndef WW_RT
+# define build_ww_mutex() (false)
+# define ww_container_of(rtm) NULL
+
+static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
+{
+ return 0;
+}
+
+static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
+{
+}
+
+static inline void ww_mutex_lock_acquired(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+#else
+# define build_ww_mutex() (true)
+# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base)
+# include "ww_mutex.h"
+#endif
/*
* lock->owner state tracking:
@@ -48,24 +92,41 @@
* set this bit before looking at the lock.
*/
-static void
-rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
+static __always_inline struct task_struct *
+rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
{
unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- lock->owner = (struct task_struct *)val;
+ return (struct task_struct *)val;
+}
+
+static __always_inline void
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+{
+ /*
+ * lock->wait_lock is held but explicit acquire semantics are needed
+ * for a new lock owner so WRITE_ONCE is insufficient.
+ */
+ xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
+}
+
+static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+{
+ /* lock->wait_lock is held so the unlock provides release semantics. */
+ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
}
-static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}
-static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void
+fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -131,8 +192,21 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* still set.
*/
owner = READ_ONCE(*p);
- if (owner & RT_MUTEX_HAS_WAITERS)
- WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ if (owner & RT_MUTEX_HAS_WAITERS) {
+ /*
+ * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
+ * why xchg_acquire() is used for updating owner for
+ * locking and WRITE_ONCE() for unlocking.
+ *
+ * WRITE_ONCE() would work for the acquire case too, but
+ * in case that the lock acquisition failed it might
+ * force other lockers into the slow path unnecessarily.
+ */
+ if (acquire_lock)
+ xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
+ else
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ }
}
/*
@@ -140,23 +214,46 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* set up.
*/
#ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
-# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
-# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_acquire(&lock->owner, &old, new);
+}
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ return rt_mutex_cmpxchg_acquire(lock, NULL, current);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_release(&lock->owner, &old, new);
+}
/*
* Callers must hold the ->wait_lock -- which is the whole purpose as we force
* all future threads that attempt to [Rmw] the lock to the slowpath. As such
* relaxed semantics suffice.
*/
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
+ unsigned long *p = (unsigned long *) &lock->owner;
+ unsigned long owner, new;
+ owner = READ_ONCE(*p);
do {
- owner = *p;
- } while (cmpxchg_relaxed(p, owner,
- owner | RT_MUTEX_HAS_WAITERS) != owner);
+ new = owner | RT_MUTEX_HAS_WAITERS;
+ } while (!try_cmpxchg_relaxed(p, &owner, new));
+
+ /*
+ * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
+ * operations in the event of contention. Ensure the successful
+ * cmpxchg is visible.
+ */
+ smp_mb__after_atomic();
}
/*
@@ -165,8 +262,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
- unsigned long flags)
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
@@ -201,11 +298,36 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#else
-# define rt_mutex_cmpxchg_relaxed(l,c,n) (0)
-# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
-# define rt_mutex_cmpxchg_release(l,c,n) (0)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+}
+
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ /*
+ * With debug enabled rt_mutex_cmpxchg trylock() will always fail.
+ *
+ * Avoid unconditionally taking the slow path by using
+ * rt_mutex_slow_trylock() which is covered by the debug code and can
+ * acquire a non-contended rtmutex.
+ */
+ return rt_mutex_slowtrylock(lock);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+}
+
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
@@ -214,8 +336,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
- unsigned long flags)
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
@@ -224,15 +346,53 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#endif
+static __always_inline int __waiter_prio(struct task_struct *task)
+{
+ int prio = task->prio;
+
+ if (!rt_or_dl_prio(prio))
+ return DEFAULT_PRIO;
+
+ return prio;
+}
+
+/*
+ * Update the waiter->tree copy of the sort keys.
+ */
+static __always_inline void
+waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
+
+ waiter->tree.prio = __waiter_prio(task);
+ waiter->tree.deadline = task->dl.deadline;
+}
+
+/*
+ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
+ */
+static __always_inline void
+waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert_held(&task->pi_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
+
+ waiter->pi_tree.prio = waiter->tree.prio;
+ waiter->pi_tree.deadline = waiter->tree.deadline;
+}
+
/*
- * Only use with rt_mutex_waiter_{less,equal}()
+ * Only use with rt_waiter_node_{less,equal}()
*/
+#define task_to_waiter_node(p) \
+ &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
-static inline int
-rt_mutex_waiter_less(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio < right->prio)
return 1;
@@ -249,9 +409,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
return 0;
}
-static inline int
-rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio != right->prio)
return 0;
@@ -268,88 +427,110 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
return 1;
}
-static void
-rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
+ struct rt_mutex_waiter *top_waiter)
{
- struct rb_node **link = &lock->waiters.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- int leftmost = 1;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = 0;
- }
+ if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
+ return true;
+
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+ /*
+ * Note that RT tasks are excluded from same priority (lateral)
+ * steals to prevent the introduction of an unbounded latency.
+ */
+ if (rt_or_dl_prio(waiter->tree.prio))
+ return false;
+
+ return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
+#else
+ return false;
+#endif
+}
+
+#define __node_2_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, tree.entry)
+
+static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct rt_mutex_waiter *aw = __node_2_waiter(a);
+ struct rt_mutex_waiter *bw = __node_2_waiter(b);
+
+ if (rt_waiter_node_less(&aw->tree, &bw->tree))
+ return 1;
+
+ if (!build_ww_mutex())
+ return 0;
+
+ if (rt_waiter_node_less(&bw->tree, &aw->tree))
+ return 0;
+
+ /* NOTE: relies on waiter->ww_ctx being set before insertion */
+ if (aw->ww_ctx) {
+ if (!bw->ww_ctx)
+ return 1;
+
+ return (signed long)(aw->ww_ctx->stamp -
+ bw->ww_ctx->stamp) < 0;
}
- if (leftmost)
- lock->waiters_leftmost = &waiter->tree_entry;
+ return 0;
+}
- rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color(&waiter->tree_entry, &lock->waiters);
+static __always_inline void
+rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
}
-static void
-rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+static __always_inline void
+rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->tree_entry))
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (RB_EMPTY_NODE(&waiter->tree.entry))
return;
- if (lock->waiters_leftmost == &waiter->tree_entry)
- lock->waiters_leftmost = rb_next(&waiter->tree_entry);
+ rb_erase_cached(&waiter->tree.entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree.entry);
+}
+
+#define __node_2_rt_node(node) \
+ rb_entry((node), struct rt_waiter_node, entry)
- rb_erase(&waiter->tree_entry, &lock->waiters);
- RB_CLEAR_NODE(&waiter->tree_entry);
+static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
}
-static void
+static __always_inline void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- int leftmost = 1;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = 0;
- }
- }
-
- if (leftmost)
- task->pi_waiters_leftmost = &waiter->pi_tree_entry;
+ lockdep_assert_held(&task->pi_lock);
- rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
}
-static void
+static __always_inline void
rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
- return;
+ lockdep_assert_held(&task->pi_lock);
- if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
- task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
+ if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
+ return;
- rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
}
-static void rt_mutex_adjust_prio(struct task_struct *p)
+static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
+ struct task_struct *p)
{
struct task_struct *pi_task = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+ lockdep_assert(rt_mutex_owner(lock) == p);
lockdep_assert_held(&p->pi_lock);
if (task_has_pi_waiters(p))
@@ -358,6 +539,42 @@ static void rt_mutex_adjust_prio(struct task_struct *p)
rt_mutex_setprio(p, pi_task);
}
+/* RT mutex specific wake_q wrappers */
+static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
+ struct task_struct *task,
+ unsigned int wake_state)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(wqh->rtlock_task);
+ get_task_struct(task);
+ wqh->rtlock_task = task;
+ } else {
+ wake_q_add(&wqh->head, task);
+ }
+}
+
+static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
+ struct rt_mutex_waiter *w)
+{
+ rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
+}
+
+static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+ wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+ put_task_struct(wqh->rtlock_task);
+ wqh->rtlock_task = NULL;
+ }
+
+ if (!wake_q_empty(&wqh->head))
+ wake_up_q(&wqh->head);
+
+ /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
+ preempt_enable();
+}
+
/*
* Deadlock detection is conditional:
*
@@ -371,25 +588,16 @@ static void rt_mutex_adjust_prio(struct task_struct *p)
* deadlock detection is disabled independent of the detect argument
* and the config settings.
*/
-static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
- enum rtmutex_chainwalk chwalk)
+static __always_inline bool
+rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
{
- /*
- * This is just a wrapper function for the following call,
- * because debug_rt_mutex_detect_deadlock() smells like a magic
- * debug feature and I wanted to keep the cond function in the
- * main source file along with the comments instead of having
- * two of the same in the headers.
- */
- return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ return waiter != NULL;
+ return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
-/*
- * Max number of times we'll walk the boosting chain:
- */
-int max_lock_depth = 1024;
-
-static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p)
{
return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
}
@@ -418,9 +626,14 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* Chain walk basics and protection scope
*
* [R] refcount on task
- * [P] task->pi_lock held
+ * [Pn] task->pi_lock held
* [L] rtmutex->wait_lock held
*
+ * Normal locking order:
+ *
+ * rtmutex->wait_lock
+ * task->pi_lock
+ *
* Step Description Protected by
* function arguments:
* @task [R]
@@ -435,39 +648,44 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* again:
* loop_sanity_check();
* retry:
- * [1] lock(task->pi_lock); [R] acquire [P]
- * [2] waiter = task->pi_blocked_on; [P]
- * [3] check_exit_conditions_1(); [P]
- * [4] lock = waiter->lock; [P]
- * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
- * unlock(task->pi_lock); release [P]
+ * [1] lock(task->pi_lock); [R] acquire [P1]
+ * [2] waiter = task->pi_blocked_on; [P1]
+ * [3] check_exit_conditions_1(); [P1]
+ * [4] lock = waiter->lock; [P1]
+ * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L]
+ * unlock(task->pi_lock); release [P1]
* goto retry;
* }
- * [6] check_exit_conditions_2(); [P] + [L]
- * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
- * [8] unlock(task->pi_lock); release [P]
+ * [6] check_exit_conditions_2(); [P1] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P1] + [L]
+ * [8] unlock(task->pi_lock); release [P1]
* put_task_struct(task); release [R]
* [9] check_exit_conditions_3(); [L]
* [10] task = owner(lock); [L]
* get_task_struct(task); [L] acquire [R]
- * lock(task->pi_lock); [L] acquire [P]
- * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
- * [12] check_exit_conditions_4(); [P] + [L]
- * [13] unlock(task->pi_lock); release [P]
+ * lock(task->pi_lock); [L] acquire [P2]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
+ * [12] check_exit_conditions_4(); [P2] + [L]
+ * [13] unlock(task->pi_lock); release [P2]
* unlock(lock->wait_lock); release [L]
* goto again;
+ *
+ * Where P1 is the blocking task and P2 is the lock owner; going up one step
+ * the owner becomes the next blocked task etc..
+ *
+*
*/
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- enum rtmutex_chainwalk chwalk,
- struct rt_mutex *orig_lock,
- struct rt_mutex *next_lock,
- struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task)
+static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_base *orig_lock,
+ struct rt_mutex_base *next_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
{
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
struct rt_mutex_waiter *prerequeue_top_waiter;
int ret = 0, depth = 0;
- struct rt_mutex *lock;
+ struct rt_mutex_base *lock;
bool detect_deadlock;
bool requeue = true;
@@ -550,6 +768,31 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
+ * There could be 'spurious' loops in the lock graph due to ww_mutex,
+ * consider:
+ *
+ * P1: A, ww_A, ww_B
+ * P2: ww_B, ww_A
+ * P3: A
+ *
+ * P3 should not return -EDEADLK because it gets trapped in the cycle
+ * created by P1 and P2 (which will resolve -- and runs into
+ * max_lock_depth above). Therefore disable detect_deadlock such that
+ * the below termination condition can trigger once all relevant tasks
+ * are boosted.
+ *
+ * Even when we start with ww_mutex we can disable deadlock detection,
+ * since we would supress a ww_mutex induced deadlock at [6] anyway.
+ * Supressing it here however is not sufficient since we might still
+ * hit [6] due to adjustment driven iteration.
+ *
+ * NOTE: if someone were to create a deadlock between 2 ww_classes we'd
+ * utterly fail to report it; lockdep should.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
+ detect_deadlock = false;
+
+ /*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
* mode!
@@ -578,7 +821,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -586,13 +829,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
- * [4] Get the next lock
+ * [4] Get the next lock; per holding task->pi_lock we can't unblock
+ * and guarantee @lock's existence.
*/
lock = waiter->lock;
/*
* [5] We need to trylock here as we are holding task->pi_lock,
* which is the reverse lock order versus the other rtmutex
* operations.
+ *
+ * Per the above, holding task->pi_lock guarantees lock exists, so
+ * inverting this lock order is infeasible from a life-time
+ * perspective.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
raw_spin_unlock_irq(&task->pi_lock);
@@ -610,9 +858,21 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* walk, we detected a deadlock.
*/
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
- raw_spin_unlock(&lock->wait_lock);
ret = -EDEADLK;
+
+ /*
+ * When the deadlock is due to ww_mutex; also see above. Don't
+ * report the deadlock and instead let the ww_mutex wound/die
+ * logic pick which of the contending threads gets -EDEADLK.
+ *
+ * NOTE: assumes the cycle only contains a single ww_class; any
+ * other configuration and we fail to report; also, see
+ * lockdep.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
+ ret = 0;
+
+ raw_spin_unlock(&lock->wait_lock);
goto out_unlock_pi;
}
@@ -639,8 +899,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/* [10] Grab the next task, i.e. owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
+ task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
/*
@@ -685,18 +944,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* or
*
* DL CBS enforcement advancing the effective deadline.
- *
- * Even though pi_waiters also uses these fields, and that tree is only
- * updated in [11], we can do this here, since we hold [L], which
- * serializes all pi_waiters access and rb_erase() does not care about
- * the values of the node being removed.
*/
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
rt_mutex_enqueue(lock, waiter);
- /* [8] Release the task */
+ /*
+ * [8] Release the (blocking) task in preparation for
+ * taking the owner task in [10].
+ *
+ * Since we hold lock->waiter_lock, task cannot unblock, even if we
+ * release task->pi_lock.
+ */
raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
@@ -713,15 +972,20 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_process(rt_mutex_top_waiter(lock)->task);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
- /* [10] Grab the next task, i.e. the owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
+ /*
+ * [10] Grab the next task, i.e. the owner of @lock
+ *
+ * Per holding lock->wait_lock and checking for !owner above, there
+ * must be an owner and it cannot go away.
+ */
+ task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
@@ -733,13 +997,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* and adjust the priority of the owner.
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else if (prerequeue_top_waiter == waiter) {
/*
* The waiter was the top waiter on the lock, but is
- * no longer the top prority waiter. Replace waiter in
+ * no longer the top priority waiter. Replace waiter in
* the owner tasks pi waiters tree with the new top
* (highest priority) waiter and adjust the priority
* of the owner.
@@ -749,8 +1014,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -817,8 +1083,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* @waiter: The waiter that is queued to the lock's wait tree if the
* callsite called task_blocked_on_lock(), otherwise NULL
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
- struct rt_mutex_waiter *waiter)
+static int __sched
+try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
{
lockdep_assert_held(&lock->wait_lock);
@@ -853,19 +1120,21 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* trylock attempt.
*/
if (waiter) {
- /*
- * If waiter is not the highest priority waiter of
- * @lock, give up.
- */
- if (waiter != rt_mutex_top_waiter(lock))
- return 0;
+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
/*
- * We can acquire the lock. Remove the waiter from the
- * lock waiters tree.
+ * If waiter is the highest priority waiter of @lock,
+ * or allowed to steal it, take it over.
*/
- rt_mutex_dequeue(lock, waiter);
-
+ if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters tree.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
+ return 0;
+ }
} else {
/*
* If the lock has waiters already we check whether @task is
@@ -876,13 +1145,9 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- /*
- * If @task->prio is greater than or equal to
- * the top waiter priority (kernel view),
- * @task lost.
- */
- if (!rt_mutex_waiter_less(task_to_waiter(task),
- rt_mutex_top_waiter(lock)))
+ /* Check whether the trylock can steal it. */
+ if (!rt_mutex_steal(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -919,9 +1184,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
raw_spin_unlock(&task->pi_lock);
takeit:
- /* We got the lock. */
- debug_rt_mutex_lock(lock);
-
/*
* This either preserves the RT_MUTEX_HAS_WAITERS bit if there
* are still waiters or clears it.
@@ -938,14 +1200,16 @@ takeit:
*
* This must be called with lock->wait_lock held and interrupts disabled
*/
-static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task,
- enum rtmutex_chainwalk chwalk)
+static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ struct ww_acquire_ctx *ww_ctx,
+ enum rtmutex_chainwalk chwalk,
+ struct wake_q_head *wake_q)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- struct rt_mutex *next_lock;
+ struct rt_mutex_base *next_lock;
int chain_walk = 0, res;
lockdep_assert_held(&lock->wait_lock);
@@ -958,15 +1222,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* the other will detect the deadlock and return -EDEADLOCK,
* which is wrong, as the other waiter is not in a deadlock
* situation.
+ *
+ * Except for ww_mutex, in that case the chain walk must already deal
+ * with spurious cycles, see the comments at [3] and [6].
*/
- if (owner == task)
+ if (owner == task && !(build_ww_mutex() && ww_ctx))
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
waiter->task = task;
waiter->lock = lock;
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
+ waiter_clone_prio(waiter, task);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -977,6 +1244,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&task->pi_lock);
+ if (build_ww_mutex() && ww_ctx) {
+ struct rt_mutex *rtm;
+
+ /* Check whether the waiter should back out immediately */
+ rtm = container_of(lock, struct rt_mutex, rtmutex);
+ res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q);
+ if (res) {
+ raw_spin_lock(&task->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ task->pi_blocked_on = NULL;
+ raw_spin_unlock(&task->pi_lock);
+ return res;
+ }
+ }
+
if (!owner)
return 0;
@@ -985,7 +1267,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1011,7 +1293,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*/
get_task_struct(owner);
- raw_spin_unlock_irq(&lock->wait_lock);
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
@@ -1027,11 +1309,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*
* Called with lock->wait_lock held and interrupts disabled.
*/
-static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
- struct rt_mutex *lock)
+static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
+ struct rt_mutex_base *lock)
{
struct rt_mutex_waiter *waiter;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1044,7 +1328,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
- rt_mutex_adjust_prio(current);
+ rt_mutex_adjust_prio(lock, current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1064,240 +1348,22 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* deboost but before waking our donor task, hence the preempt_disable()
* before unlock.
*
- * Pairs with preempt_enable() in rt_mutex_postunlock();
+ * Pairs with preempt_enable() in rt_mutex_wake_up_q();
*/
preempt_disable();
- wake_q_add(wake_q, waiter->task);
- raw_spin_unlock(&current->pi_lock);
-}
-
-/*
- * Remove a waiter from a lock and give up
- *
- * Must be called with lock->wait_lock held and interrupts disabled. I must
- * have just failed to try_to_take_rt_mutex().
- */
-static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
-{
- bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
- struct task_struct *owner = rt_mutex_owner(lock);
- struct rt_mutex *next_lock;
-
- lockdep_assert_held(&lock->wait_lock);
-
- raw_spin_lock(&current->pi_lock);
- rt_mutex_dequeue(lock, waiter);
- current->pi_blocked_on = NULL;
+ rt_mutex_wake_q_add(wqh, waiter);
raw_spin_unlock(&current->pi_lock);
-
- /*
- * Only update priority if the waiter was the highest priority
- * waiter of the lock and there is an owner to update.
- */
- if (!owner || !is_top_waiter)
- return;
-
- raw_spin_lock(&owner->pi_lock);
-
- rt_mutex_dequeue_pi(owner, waiter);
-
- if (rt_mutex_has_waiters(lock))
- rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
-
- rt_mutex_adjust_prio(owner);
-
- /* Store the lock on which owner is blocked or NULL */
- next_lock = task_blocked_on_lock(owner);
-
- raw_spin_unlock(&owner->pi_lock);
-
- /*
- * Don't walk the chain, if the owner task is not blocked
- * itself.
- */
- if (!next_lock)
- return;
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
- next_lock, NULL, current);
-
- raw_spin_lock_irq(&lock->wait_lock);
-}
-
-/*
- * Recheck the pi chain, in case we got a priority setting
- *
- * Called from sched_setscheduler
- */
-void rt_mutex_adjust_pi(struct task_struct *task)
-{
- struct rt_mutex_waiter *waiter;
- struct rt_mutex *next_lock;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
-
- waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- return;
- }
- next_lock = waiter->lock;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(task);
-
- rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
- next_lock, NULL, task);
}
-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
- debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
- waiter->task = NULL;
-}
-
-/**
- * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
- * @lock: the rt_mutex to take
- * @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
- * @timeout: the pre-initialized and started timer, or NULL for none
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Must be called with lock->wait_lock held and interrupts disabled
- */
-static int __sched
-__rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
-{
- int ret = 0;
-
- for (;;) {
- /* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock, current, waiter))
- break;
-
- /*
- * TASK_INTERRUPTIBLE checks for signals and
- * timeout. Ignored otherwise.
- */
- if (likely(state == TASK_INTERRUPTIBLE)) {
- /* Signal pending? */
- if (signal_pending(current))
- ret = -EINTR;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
- if (ret)
- break;
- }
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- debug_rt_mutex_print_deadlock(waiter);
-
- schedule();
-
- raw_spin_lock_irq(&lock->wait_lock);
- set_current_state(state);
- }
-
- __set_current_state(TASK_RUNNING);
- return ret;
-}
-
-static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
- struct rt_mutex_waiter *w)
-{
- /*
- * If the result is not -EDEADLOCK or the caller requested
- * deadlock detection, nothing to do here.
- */
- if (res != -EDEADLOCK || detect_deadlock)
- return;
-
- /*
- * Yell lowdly and stop the task right here.
- */
- rt_mutex_print_deadlock(w);
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
- }
-}
-
-/*
- * Slow path lock function:
- */
-static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk)
-{
- struct rt_mutex_waiter waiter;
- unsigned long flags;
- int ret = 0;
-
- rt_mutex_init_waiter(&waiter);
-
- /*
- * Technically we could use raw_spin_[un]lock_irq() here, but this can
- * be called in early boot if the cmpxchg() fast path is disabled
- * (debug, no architecture support). In this case we will acquire the
- * rtmutex with lock->wait_lock held. But we cannot unconditionally
- * enable interrupts in that early boot case. So we need to use the
- * irqsave/restore variants.
- */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
- /* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return 0;
- }
-
- set_current_state(state);
-
- /* Setup the timer, when timeout != NULL */
- if (unlikely(timeout))
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
-
- if (likely(!ret))
- /* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
-
- if (unlikely(ret)) {
- __set_current_state(TASK_RUNNING);
- if (rt_mutex_has_waiters(lock))
- remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
- }
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
/*
- * try_to_take_rt_mutex() sets the waiter bit
- * unconditionally. We might have to fix that up.
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
*/
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-
- /* Remove pending timer: */
- if (unlikely(timeout))
- hrtimer_cancel(&timeout->timer);
-
- debug_rt_mutex_free_waiter(&waiter);
+ fixup_rt_mutex_waiters(lock, true);
return ret;
}
@@ -1305,7 +1371,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
/*
* Slow path try-lock function:
*/
-static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
unsigned long flags;
int ret;
@@ -1324,27 +1390,27 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- ret = try_to_take_rt_mutex(lock, current, NULL);
-
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ ret = __rt_mutex_slowtrylock(lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
+static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(lock);
+}
+
/*
* Slow path to release a rt-mutex.
- *
- * Return whether the current task needs to call rt_mutex_postunlock().
*/
-static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
+static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
{
+ DEFINE_RT_WAKE_Q(wqh);
unsigned long flags;
/* irqsave required to support early boot calls */
@@ -1386,7 +1452,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
if (unlock_rt_mutex_safe(lock, flags) == true)
- return false;
+ return;
/* Relock the rtmutex and try again */
raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
@@ -1397,481 +1463,437 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*
* Queue the next waiter for wakeup once we release the wait_lock.
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ mark_wakeup_next_waiter(&wqh, lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return true; /* call rt_mutex_postunlock() */
+ rt_mutex_wake_up_q(&wqh);
}
-/*
- * debug aware fast / slowpath lock,trylock,unlock
- *
- * The atomic acquire/release ops are compiled away, when either the
- * architecture does not support cmpxchg or when debugging is enabled.
- */
-static inline int
-rt_mutex_fastlock(struct rt_mutex *lock, int state,
- int (*slowfn)(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock)
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 0;
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+ rt_mutex_slowunlock(lock);
}
-static inline int
-rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk,
- int (*slowfn)(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+#ifdef CONFIG_SMP
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
- if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 0;
+ bool res = true;
- return slowfn(lock, state, timeout, chwalk);
+ rcu_read_lock();
+ for (;;) {
+ /* If owner changed, trylock again. */
+ if (owner != rt_mutex_owner(lock))
+ break;
+ /*
+ * Ensure that @owner is dereferenced after checking that
+ * the lock owner still matches @owner. If that fails,
+ * @owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+ /*
+ * Stop spinning when:
+ * - the lock owner has been scheduled out
+ * - current is not longer the top waiter
+ * - current is requested to reschedule (redundant
+ * for CONFIG_PREEMPT_RCU=y)
+ * - the VCPU on which owner runs is preempted
+ */
+ if (!owner_on_cpu(owner) || need_resched() ||
+ !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
+ res = false;
+ break;
+ }
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return res;
}
-
-static inline int
-rt_mutex_fasttrylock(struct rt_mutex *lock,
- int (*slowfn)(struct rt_mutex *lock))
+#else
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 1;
-
- return slowfn(lock);
+ return false;
}
+#endif
+
+#ifdef RT_MUTEX_BUILD_MUTEX
+/*
+ * Functions required for:
+ * - rtmutex, futex on all kernels
+ * - mutex and rwsem substitutions on RT kernels
+ */
/*
- * Performs the wakeup of the the top-waiter and re-enables preemption.
+ * Remove a waiter from a lock and give up
+ *
+ * Must be called with lock->wait_lock held and interrupts disabled. It must
+ * have just failed to try_to_take_rt_mutex().
*/
-void rt_mutex_postunlock(struct wake_q_head *wake_q)
+static void __sched remove_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
{
- wake_up_q(wake_q);
+ bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex_base *next_lock;
- /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
- preempt_enable();
-}
+ lockdep_assert_held(&lock->wait_lock);
-static inline void
-rt_mutex_fastunlock(struct rt_mutex *lock,
- bool (*slowfn)(struct rt_mutex *lock,
- struct wake_q_head *wqh))
-{
- DEFINE_WAKE_Q(wake_q);
+ raw_spin_lock(&current->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ current->pi_blocked_on = NULL;
+ raw_spin_unlock(&current->pi_lock);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ /*
+ * Only update priority if the waiter was the highest priority
+ * waiter of the lock and there is an owner to update.
+ */
+ if (!owner || !is_top_waiter)
return;
- if (slowfn(lock, &wake_q))
- rt_mutex_postunlock(&wake_q);
-}
+ raw_spin_lock(&owner->pi_lock);
-/**
- * rt_mutex_lock - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- */
-void __sched rt_mutex_lock(struct rt_mutex *lock)
-{
- might_sleep();
+ rt_mutex_dequeue_pi(owner, waiter);
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock);
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
-/**
- * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
- *
- * @lock: the rt_mutex to be locked
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- */
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
-{
- int ret;
+ rt_mutex_adjust_prio(lock, owner);
- might_sleep();
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
- if (ret)
- mutex_release(&lock->dep_map, 1, _RET_IP_);
+ raw_spin_unlock(&owner->pi_lock);
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+ /*
+ * Don't walk the chain, if the owner task is not blocked
+ * itself.
+ */
+ if (!next_lock)
+ return;
-/*
- * Futex variant, must not use fastpath.
- */
-int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return rt_mutex_slowtrylock(lock);
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+ next_lock, NULL, current);
+
+ raw_spin_lock_irq(&lock->wait_lock);
}
/**
- * rt_mutex_timed_lock - lock a rt_mutex interruptible
- * the timeout structure is provided
- * by the caller
- *
- * @lock: the rt_mutex to be locked
- * @timeout: timeout structure or NULL (no timeout)
+ * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+ * @ww_ctx: WW mutex context pointer
+ * @state: the state the task should block in (TASK_INTERRUPTIBLE
+ * or TASK_UNINTERRUPTIBLE)
+ * @timeout: the pre-initialized and started timer, or NULL for none
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock
*
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- * -ETIMEDOUT when the timeout expired
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
-int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
+static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
- int ret;
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct task_struct *owner;
+ int ret = 0;
- might_sleep();
+ lockevent_inc(rtmutex_slow_block);
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock, current, waiter)) {
+ lockevent_inc(rtmutex_slow_acq3);
+ break;
+ }
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_MIN_CHAINWALK,
- rt_mutex_slowlock);
- if (ret)
- mutex_release(&lock->dep_map, 1, _RET_IP_);
+ if (timeout && !timeout->task) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ ret = -EINTR;
+ break;
+ }
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+ if (build_ww_mutex() && ww_ctx) {
+ ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx);
+ if (ret)
+ break;
+ }
-/**
- * rt_mutex_trylock - try to lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- *
- * This function can only be called in thread context. It's safe to
- * call it from atomic regions, but not from hard interrupt or soft
- * interrupt context.
- *
- * Returns 1 on success and 0 on contention
- */
-int __sched rt_mutex_trylock(struct rt_mutex *lock)
-{
- int ret;
+ if (waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
- if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
- return 0;
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) {
+ lockevent_inc(rtmutex_slow_sleep);
+ rt_mutex_schedule();
+ }
- ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
- if (ret)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(state);
+ }
+ __set_current_state(TASK_RUNNING);
return ret;
}
-EXPORT_SYMBOL_GPL(rt_mutex_trylock);
-/**
- * rt_mutex_unlock - unlock a rt_mutex
- *
- * @lock: the rt_mutex to be unlocked
- */
-void __sched rt_mutex_unlock(struct rt_mutex *lock)
+static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *w)
{
- mutex_release(&lock->dep_map, 1, _RET_IP_);
- rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+ /*
+ * If the result is not -EDEADLOCK or the caller requested
+ * deadlock detection, nothing to do here.
+ */
+ if (res != -EDEADLOCK || detect_deadlock)
+ return;
+
+ if (build_ww_mutex() && w->ww_ctx)
+ return;
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ WARN(1, "rtmutex deadlock detected\n");
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ rt_mutex_schedule();
+ }
}
-EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * Futex variant, that since futex variants do not use the fast-path, can be
- * simple and will not need to retry.
+ * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
+ * @chwalk: Indicator whether full or partial chainwalk is requested
+ * @waiter: Initializer waiter for blocking
+ * @wake_q: The wake_q to wake tasks after we release the wait_lock
*/
-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
+static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter,
+ struct wake_q_head *wake_q)
{
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct ww_mutex *ww = ww_container_of(rtm);
+ int ret;
+
lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtmutex_slowlock);
- debug_rt_mutex_unlock(lock);
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ if (build_ww_mutex() && ww_ctx) {
+ __ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ lockevent_inc(rtmutex_slow_acq1);
+ return 0;
+ }
+
+ set_current_state(state);
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- return false; /* done */
+ trace_contention_begin(lock, LCB_F_RT);
+
+ ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q);
+ if (likely(!ret))
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q);
+
+ if (likely(!ret)) {
+ /* acquired the lock */
+ if (build_ww_mutex() && ww_ctx) {
+ if (!ww_ctx->is_wait_die)
+ __ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ lockevent_inc(rtmutex_slow_acq2);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ remove_waiter(lock, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
+ lockevent_inc(rtmutex_deadlock);
}
/*
- * We've already deboosted, mark_wakeup_next_waiter() will
- * retain preempt_disabled when we drop the wait_lock, to
- * avoid inversion prior to the wakeup. preempt_disable()
- * therein pairs with rt_mutex_postunlock().
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ fixup_rt_mutex_waiters(lock, true);
+
+ trace_contention_end(lock, ret);
- return true; /* call postunlock() */
+ return ret;
}
-void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ struct wake_q_head *wake_q)
{
- DEFINE_WAKE_Q(wake_q);
- bool postunlock;
+ struct rt_mutex_waiter waiter;
+ int ret;
- raw_spin_lock_irq(&lock->wait_lock);
- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
- raw_spin_unlock_irq(&lock->wait_lock);
+ rt_mutex_init_waiter(&waiter);
+ waiter.ww_ctx = ww_ctx;
+
+ ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
+ &waiter, wake_q);
- if (postunlock)
- rt_mutex_postunlock(&wake_q);
+ debug_rt_mutex_free_waiter(&waiter);
+ lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q));
+ return ret;
}
-/**
- * rt_mutex_destroy - mark a mutex unusable
- * @lock: the mutex to be destroyed
- *
- * This function marks the mutex uninitialized, and any subsequent
- * use of the mutex is forbidden. The mutex must not be locked when
- * this function is called.
+/*
+ * rt_mutex_slowlock - Locking slowpath invoked when fast path fails
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
*/
-void rt_mutex_destroy(struct rt_mutex *lock)
+static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
- WARN_ON(rt_mutex_is_locked(lock));
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- lock->magic = NULL;
-#endif
+ DEFINE_WAKE_Q(wake_q);
+ unsigned long flags;
+ int ret;
+
+ /*
+ * Do all pre-schedule work here, before we queue a waiter and invoke
+ * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+ * otherwise recurse back into task_blocks_on_rt_mutex() through
+ * rtlock_slowlock() and will then enqueue a second waiter for this
+ * same task and things get really confusing real fast.
+ */
+ rt_mutex_pre_schedule();
+
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
+ rt_mutex_post_schedule();
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(rt_mutex_destroy);
-/**
- * __rt_mutex_init - initialize the rt lock
- *
- * @lock: the rt lock to be initialized
- *
- * Initialize the rt lock to unlocked state.
- *
- * Initializing of a locked rt lock is not allowed
- */
-void __rt_mutex_init(struct rt_mutex *lock, const char *name,
- struct lock_class_key *key)
+static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
+ unsigned int state)
{
- lock->owner = NULL;
- raw_spin_lock_init(&lock->wait_lock);
- lock->waiters = RB_ROOT;
- lock->waiters_leftmost = NULL;
+ lockdep_assert(!current->pi_blocked_on);
- if (name && key)
- debug_rt_mutex_init(lock, name, key);
+ if (likely(rt_mutex_try_acquire(lock)))
+ return 0;
+
+ return rt_mutex_slowlock(lock, NULL, state);
}
-EXPORT_SYMBOL_GPL(__rt_mutex_init);
+#endif /* RT_MUTEX_BUILD_MUTEX */
-/**
- * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
- * proxy owner
- *
- * @lock: the rt_mutex to be locked
- * @proxy_owner:the task to set as owner
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This initializes the rtmutex and
- * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
- * possible at this point because the pi_state which contains the rtmutex
- * is not yet visible to other tasks.
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+/*
+ * Functions required for spin/rw_lock substitution on RT kernels
*/
-void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
-{
- __rt_mutex_init(lock, NULL, NULL);
- debug_rt_mutex_proxy_lock(lock, proxy_owner);
- rt_mutex_set_owner(lock, proxy_owner);
-}
/**
- * rt_mutex_proxy_unlock - release a lock on behalf of owner
- *
- * @lock: the rt_mutex to be locked
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This merrily cleans up the rtmutex
- * (debugging) state. Concurrent operations on this rt_mutex are not
- * possible because it belongs to the pi_state which is about to be freed
- * and it is not longer visible to other tasks.
+ * rtlock_slowlock_locked - Slow path lock acquisition for RT locks
+ * @lock: The underlying RT mutex
+ * @wake_q: The wake_q to wake tasks after we release the wait_lock
*/
-void rt_mutex_proxy_unlock(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
+static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
+ struct wake_q_head *wake_q)
+ __releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
- debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
-}
-
-int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
-{
- int ret;
-
- if (try_to_take_rt_mutex(lock, task, NULL))
- return 1;
+ struct rt_mutex_waiter waiter;
+ struct task_struct *owner;
- /* We enforce deadlock detection for futexes */
- ret = task_blocks_on_rt_mutex(lock, waiter, task,
- RT_MUTEX_FULL_CHAINWALK);
+ lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtlock_slowlock);
- if (ret && !rt_mutex_owner(lock)) {
- /*
- * Reset the return value. We might have
- * returned with -EDEADLK and the owner
- * released the lock while we were walking the
- * pi chain. Let the waiter sort it out.
- */
- ret = 0;
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ lockevent_inc(rtlock_slow_acq1);
+ return;
}
- if (unlikely(ret))
- remove_waiter(lock, waiter);
+ rt_mutex_init_rtlock_waiter(&waiter);
- debug_rt_mutex_print_deadlock(waiter);
+ /* Save current state and set state to TASK_RTLOCK_WAIT */
+ current_save_and_set_rtlock_wait_state();
- return ret;
-}
+ trace_contention_begin(lock, LCB_F_RT);
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
-{
- int ret;
+ task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK, wake_q);
- raw_spin_lock_irq(&lock->wait_lock);
- ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
- raw_spin_unlock_irq(&lock->wait_lock);
+ for (;;) {
+ /* Try to acquire the lock again */
+ if (try_to_take_rt_mutex(lock, current, &waiter)) {
+ lockevent_inc(rtlock_slow_acq2);
+ break;
+ }
- return ret;
-}
+ if (&waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
-/**
- * rt_mutex_next_owner - return the next owner of the lock
- *
- * @lock: the rt lock query
- *
- * Returns the next owner of the lock or NULL
- *
- * Caller has to serialize against other accessors to the lock
- * itself.
- *
- * Special API call for PI-futex support
- */
-struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
-{
- if (!rt_mutex_has_waiters(lock))
- return NULL;
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) {
+ lockevent_inc(rtlock_slow_sleep);
+ schedule_rtlock();
+ }
- return rt_mutex_top_waiter(lock)->task;
-}
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(TASK_RTLOCK_WAIT);
+ }
-/**
- * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
- * @lock: the rt_mutex we were woken on
- * @to: the timeout, null if none. hrtimer should already have
- * been started.
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Wait for the the lock acquisition started on our behalf by
- * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
- * rt_mutex_cleanup_proxy_lock().
- *
- * Returns:
- * 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT
- *
- * Special API call for PI-futex support
- */
-int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter)
-{
- int ret;
+ /* Restore the task state */
+ current_restore_rtlock_saved_state();
- raw_spin_lock_irq(&lock->wait_lock);
- /* sleep on the mutex */
- set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
/*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally.
+ * We might have to fix that up:
*/
- fixup_rt_mutex_waiters(lock);
- raw_spin_unlock_irq(&lock->wait_lock);
+ fixup_rt_mutex_waiters(lock, true);
+ debug_rt_mutex_free_waiter(&waiter);
- return ret;
+ trace_contention_end(lock, 0);
+ lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q));
}
-/**
- * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
- * @lock: the rt_mutex we were woken on
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
- *
- * Unless we acquired the lock; we're still enqueued on the wait-list and can
- * in fact still be granted ownership until we're removed. Therefore we can
- * find we are in fact the owner and must disregard the
- * rt_mutex_wait_proxy_lock() failure.
- *
- * Returns:
- * true - did the cleanup, we done.
- * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
- * caller should disregards its return value.
- *
- * Special API call for PI-futex support
- */
-bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
{
- bool cleanup = false;
-
- raw_spin_lock_irq(&lock->wait_lock);
- /*
- * Do an unconditional try-lock, this deals with the lock stealing
- * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
- * sets a NULL owner.
- *
- * We're not interested in the return value, because the subsequent
- * test on rt_mutex_owner() will infer that. If the trylock succeeded,
- * we will own the lock and it will have removed the waiter. If we
- * failed the trylock, we're still not owner and we need to remove
- * ourselves.
- */
- try_to_take_rt_mutex(lock, current, waiter);
- /*
- * Unless we're the owner; we're still enqueued on the wait_list.
- * So check if we became owner, if not, take us off the wait_list.
- */
- if (rt_mutex_owner(lock) != current) {
- remove_waiter(lock, waiter);
- cleanup = true;
- }
- /*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irq(&lock->wait_lock);
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
- return cleanup;
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ rtlock_slowlock_locked(lock, &wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
+
+#endif /* RT_MUTEX_BUILD_SPINLOCKS */
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
deleted file mode 100644
index 5c253caffe91..000000000000
--- a/kernel/locking/rtmutex.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This file contains macros used solely by rtmutex.c.
- * Non-debug version.
- */
-
-#define rt_mutex_deadlock_check(l) (0)
-#define debug_rt_mutex_init_waiter(w) do { } while (0)
-#define debug_rt_mutex_free_waiter(w) do { } while (0)
-#define debug_rt_mutex_lock(l) do { } while (0)
-#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
-#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
-#define debug_rt_mutex_unlock(l) do { } while (0)
-#define debug_rt_mutex_init(m, n, k) do { } while (0)
-#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
-#define debug_rt_mutex_print_deadlock(w) do { } while (0)
-#define debug_rt_mutex_reset_waiter(w) do { } while (0)
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- WARN(1, "rtmutex deadlock detected\n");
-}
-
-static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
- enum rtmutex_chainwalk walk)
-{
- return walk == RT_MUTEX_FULL_CHAINWALK;
-}
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
new file mode 100644
index 000000000000..59dbd29cb219
--- /dev/null
+++ b/kernel/locking/rtmutex_api.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+static const struct ctl_table rtmutex_sysctl_table[] = {
+ {
+ .procname = "max_lock_depth",
+ .data = &max_lock_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_rtmutex_sysctl(void)
+{
+ register_sysctl_init("kernel", rtmutex_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rtmutex_sysctl);
+
+/*
+ * Debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
+ unsigned int state,
+ struct lockdep_map *nest_lock,
+ unsigned int subclass)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+
+void rt_mutex_base_init(struct rt_mutex_base *rtb)
+{
+ __rt_mutex_base_init(rtb);
+}
+EXPORT_SYMBOL(rt_mutex_base_init);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rt_mutex_lock_nested - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ * @subclass: the lockdep subclass
+ */
+void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+
+void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
+}
+EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+#endif
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * This function can only be called in thread context. It's safe to call it
+ * from atomic regions, but not from hard or soft interrupt context.
+ *
+ * Returns:
+ * 1 on success
+ * 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/*
+ * Futex variants, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return rt_mutex_slowtrylock(lock);
+}
+
+int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
+/**
+ * __rt_mutex_futex_unlock - Futex variant, that since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wqh: The wake queue head from which to get the next lock waiter
+ */
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ /*
+ * mark_wakeup_next_waiter() deboosts and retains preemption
+ * disabled when dropping the wait_lock, to avoid inversion prior
+ * to the wakeup. preempt_disable() therein pairs with the
+ * preempt_enable() in rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wqh, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock)
+{
+ DEFINE_RT_WAKE_Q(wqh);
+ unsigned long flags;
+ bool postunlock;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ postunlock = __rt_mutex_futex_unlock(lock, &wqh);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+}
+
+/**
+ * __rt_mutex_init - initialize the rt_mutex
+ *
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
+ *
+ * Initialize the rt_mutex to unlocked state.
+ *
+ * Initializing of a locked rt_mutex is not allowed
+ */
+void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ __rt_mutex_base_init(&lock->rtmutex);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner:the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This initializes the rtmutex and
+ * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
+ * possible at this point because the pi_state which contains the rtmutex
+ * is not yet visible to other tasks.
+ */
+void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner)
+{
+ static struct lock_class_key pi_futex_key;
+
+ __rt_mutex_base_init(lock);
+ /*
+ * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping'
+ * and rtmutex based. That causes a lockdep false positive, because
+ * some of the futex functions invoke spin_unlock(&hb->lock) with
+ * the wait_lock of the rtmutex associated to the pi_futex held.
+ * spin_unlock() in turn takes wait_lock of the rtmutex on which
+ * the spinlock is based, which makes lockdep notice a lock
+ * recursion. Give the futex/rtmutex wait_lock a separate key.
+ */
+ lockdep_set_class(&lock->wait_lock, &pi_futex_key);
+ rt_mutex_set_owner(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This just cleans up the rtmutex
+ * (debugging) state. Concurrent operations on this rt_mutex are not
+ * possible because it belongs to the pi_state which is about to be freed
+ * and it is not longer visible to other tasks.
+ */
+void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_clear_owner(lock);
+}
+
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ * @wake_q: the wake_q to wake tasks after we release the wait_lock
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ struct wake_q_head *wake_q)
+{
+ int ret;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL,
+ RT_MUTEX_FULL_CHAINWALK, wake_q);
+
+ if (ret && !rt_mutex_owner(lock)) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+ preempt_disable();
+ raw_spin_unlock_irq(&lock->wait_lock);
+ wake_up_q(&wake_q);
+ preempt_enable();
+
+ return ret;
+}
+
+/**
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+ * Special API call for PI-futex support
+ */
+int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, true);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ cleanup = true;
+ }
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, false);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void __sched rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ struct rt_mutex_base *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+ next_lock = waiter->lock;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
+}
+
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh)
+{
+ rt_mutex_wake_up_q(wqh);
+}
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+/* Mutexes */
+static void __mutex_rt_init_generic(struct mutex *mutex)
+{
+ rt_mutex_base_init(&mutex->rtmutex);
+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
+}
+
+static __always_inline int __mutex_lock_common(struct mutex *lock,
+ unsigned int state,
+ unsigned int subclass,
+ struct lockdep_map *nest_lock,
+ unsigned long ip)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, ip);
+ else
+ lock_acquired(&lock->dep_map, ip);
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key)
+{
+ __mutex_rt_init_generic(mutex);
+ lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_rt_init_lockdep);
+
+void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched _mutex_lock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass,
+ struct lockdep_map *nest_lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_killable);
+
+void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
+int __sched _mutex_trylock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock);
+#else /* CONFIG_DEBUG_LOCK_ALLOC */
+
+void mutex_rt_init_generic(struct mutex *mutex)
+{
+ __mutex_rt_init_generic(mutex);
+}
+EXPORT_SYMBOL(mutex_rt_init_generic);
+
+void __sched mutex_lock(struct mutex *lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock);
+
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+void __sched mutex_lock_io(struct mutex *lock)
+{
+ int token = io_schedule_prepare();
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(mutex_lock_io);
+
+int __sched mutex_trylock(struct mutex *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ return __rt_mutex_trylock(&lock->rtmutex);
+}
+EXPORT_SYMBOL(mutex_trylock);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+void __sched mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL(mutex_unlock);
+
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 72ad45a9a794..cf6ddd1b23a2 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* RT Mutexes: blocking mutual exclusion locks with PI support
*
@@ -12,73 +13,144 @@
#ifndef __KERNEL_RTMUTEX_COMMON_H
#define __KERNEL_RTMUTEX_COMMON_H
+#include <linux/debug_locks.h>
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+
+/*
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry: rbtree node to enqueue into the waiters tree
+ * @prio: Priority of the waiter
+ * @deadline: Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
+ */
+struct rt_waiter_node {
+ struct rb_node entry;
+ int prio;
+ u64 deadline;
+};
+
/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
- * @tree_entry: pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @tree: node to enqueue into the mutex waiters tree
+ * @pi_tree: node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
+ * @lock: Pointer to the rt_mutex on which the waiter blocks
+ * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
+ * @ww_ctx: WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
*/
struct rt_mutex_waiter {
- struct rb_node tree_entry;
- struct rb_node pi_tree_entry;
+ struct rt_waiter_node tree;
+ struct rt_waiter_node pi_tree;
struct task_struct *task;
- struct rt_mutex *lock;
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- unsigned long ip;
- struct pid *deadlock_task_pid;
- struct rt_mutex *deadlock_lock;
-#endif
- int prio;
- u64 deadline;
+ struct rt_mutex_base *lock;
+ unsigned int wake_state;
+ struct ww_acquire_ctx *ww_ctx;
};
+/**
+ * struct rt_wake_q_head - Wrapper around regular wake_q_head to support
+ * "sleeping" spinlocks on RT
+ * @head: The regular wake_q_head for sleeping lock variants
+ * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups
+ */
+struct rt_wake_q_head {
+ struct wake_q_head head;
+ struct task_struct *rtlock_task;
+};
+
+#define DEFINE_RT_WAKE_Q(name) \
+ struct rt_wake_q_head name = { \
+ .head = WAKE_Q_HEAD_INITIALIZER(name.head), \
+ .rtlock_task = NULL, \
+ }
+
/*
- * Various helpers to access the waiters-tree:
+ * PI-futex support (proxy locking functions, etc.):
*/
-static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ struct wake_q_head *);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh);
+
+/*
+ * Must be guarded because this header is included from rcu/tree_plugin.h
+ * unconditionally.
+ */
+#ifdef CONFIG_RT_MUTEXES
+static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
{
- return !RB_EMPTY_ROOT(&lock->waiters);
+ return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
-static inline struct rt_mutex_waiter *
-rt_mutex_top_waiter(struct rt_mutex *lock)
+/*
+ * Lockless speculative check whether @waiter is still the top waiter on
+ * @lock. This is solely comparing pointers and not derefencing the
+ * leftmost entry which might be about to vanish.
+ */
+static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
{
- struct rt_mutex_waiter *w;
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
- w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
- tree_entry);
- BUG_ON(w->lock != lock);
-
- return w;
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
}
-static inline int task_has_pi_waiters(struct task_struct *p)
+static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
{
- return !RB_EMPTY_ROOT(&p->pi_waiters);
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+ struct rt_mutex_waiter *w = NULL;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (leftmost) {
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
+ BUG_ON(w->lock != lock);
+ }
+ return w;
}
-static inline struct rt_mutex_waiter *
-task_top_pi_waiter(struct task_struct *p)
+static inline int task_has_pi_waiters(struct task_struct *p)
{
- return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
+ return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
}
-/*
- * lock->owner state tracking:
- */
-#define RT_MUTEX_HAS_WAITERS 1UL
-
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
{
- unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+ lockdep_assert_held(&p->pi_lock);
- return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
+ return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
+ pi_tree.entry);
}
/*
@@ -96,39 +168,59 @@ enum rtmutex_chainwalk {
RT_MUTEX_FULL_CHAINWALK,
};
-/*
- * PI-futex support (proxy locking functions, etc.):
- */
-extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
-extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter);
+static inline void __rt_mutex_base_init(struct rt_mutex_base *lock)
+{
+ raw_spin_lock_init(&lock->wait_lock);
+ lock->waiters = RB_ROOT_CACHED;
+ lock->owner = NULL;
+}
-extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+/* Debug functions */
+static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
+}
-extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
-extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
+static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
+}
-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
+static inline void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ memset(waiter, 0x11, sizeof(*waiter));
+}
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-# include "rtmutex-debug.h"
-#else
-# include "rtmutex.h"
-#endif
+static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ memset(waiter, 0x22, sizeof(*waiter));
+}
+
+static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
+ RB_CLEAR_NODE(&waiter->tree.entry);
+ waiter->wake_state = TASK_NORMAL;
+ waiter->task = NULL;
+}
+
+static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter)
+{
+ rt_mutex_init_waiter(waiter);
+ waiter->wake_state = TASK_RTLOCK_WAIT;
+}
+
+#else /* CONFIG_RT_MUTEXES */
+/* Used in rcu/tree_plugin.h */
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
+{
+ return NULL;
+}
+#endif /* !CONFIG_RT_MUTEXES */
#endif
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
new file mode 100644
index 000000000000..9f4322c07486
--- /dev/null
+++ b/kernel/locking/rwbase_rt.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * RT-specific reader/writer semaphores and reader/writer locks
+ *
+ * down_write/write_lock()
+ * 1) Lock rtmutex
+ * 2) Remove the reader BIAS to force readers into the slow path
+ * 3) Wait until all readers have left the critical section
+ * 4) Mark it write locked
+ *
+ * up_write/write_unlock()
+ * 1) Remove the write locked marker
+ * 2) Set the reader BIAS, so readers can use the fast path again
+ * 3) Unlock rtmutex, to release blocked readers
+ *
+ * down_read/read_lock()
+ * 1) Try fast path acquisition (reader BIAS is set)
+ * 2) Take tmutex::wait_lock, which protects the writelocked flag
+ * 3) If !writelocked, acquire it for read
+ * 4) If writelocked, block on tmutex
+ * 5) unlock rtmutex, goto 1)
+ *
+ * up_read/read_unlock()
+ * 1) Try fast path release (reader count != 1)
+ * 2) Wake the writer waiting in down_write()/write_lock() #3
+ *
+ * down_read/read_lock()#3 has the consequence, that rw semaphores and rw
+ * locks on RT are not writer fair, but writers, which should be avoided in
+ * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL
+ * inheritance mechanism.
+ *
+ * It's possible to make the rw primitives writer fair by keeping a list of
+ * active readers. A blocked writer would force all newly incoming readers
+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
+ * for one reader after the other. We can't use multi-reader inheritance
+ * because there is no way to support that with SCHED_DEADLINE.
+ * Implementing the one by one reader boosting/handover mechanism is a
+ * major surgery for a very dubious value.
+ *
+ * The risk of writer starvation is there, but the pathological use cases
+ * which trigger it are not necessarily the typical RT workloads.
+ *
+ * Fast-path orderings:
+ * The lock/unlock of readers can run in fast paths: lock and unlock are only
+ * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE
+ * semantics of rwbase_rt. Atomic ops should thus provide _acquire()
+ * and _release() (or stronger).
+ *
+ * Common code shared between RT rw_semaphore and rwlock
+ */
+
+static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb)
+{
+ int r;
+
+ /*
+ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
+ * set.
+ */
+ for (r = atomic_read(&rwb->readers); r < 0;) {
+ if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1)))
+ return 1;
+ }
+ return 0;
+}
+
+static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ DEFINE_WAKE_Q(wake_q);
+ int ret;
+
+ rwbase_pre_schedule();
+ raw_spin_lock_irq(&rtm->wait_lock);
+
+ /*
+ * Call into the slow lock path with the rtmutex->wait_lock
+ * held, so this can't result in the following race:
+ *
+ * Reader1 Reader2 Writer
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * down_read()
+ * unlock(m->wait_lock)
+ * up_read()
+ * wake(Writer)
+ * lock(m->wait_lock)
+ * sem->writelocked=true
+ * unlock(m->wait_lock)
+ *
+ * up_write()
+ * sem->writelocked=false
+ * rtmutex_unlock(m)
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * rtmutex_lock(m)
+ *
+ * That would put Reader1 behind the writer waiting on
+ * Reader2 to call up_read(), which might be unbound.
+ */
+
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
+ /*
+ * For rwlocks this returns 0 unconditionally, so the below
+ * !ret conditionals are optimized out.
+ */
+ ret = rwbase_rtmutex_slowlock_locked(rtm, state, &wake_q);
+
+ /*
+ * On success the rtmutex is held, so there can't be a writer
+ * active. Increment the reader count and immediately drop the
+ * rtmutex again.
+ *
+ * rtmutex->wait_lock has to be unlocked in any case of course.
+ */
+ if (!ret)
+ atomic_inc(&rwb->readers);
+
+ preempt_disable();
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ wake_up_q(&wake_q);
+ preempt_enable();
+
+ if (!ret)
+ rwbase_rtmutex_unlock(rtm);
+
+ trace_contention_end(rwb, ret);
+ rwbase_post_schedule();
+ return ret;
+}
+
+static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (rwbase_read_trylock(rwb))
+ return 0;
+
+ return __rwbase_read_lock(rwb, state);
+}
+
+static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ struct task_struct *owner;
+ DEFINE_RT_WAKE_Q(wqh);
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+ /*
+ * Wake the writer, i.e. the rtmutex owner. It might release the
+ * rtmutex concurrently in the fast path (due to a signal), but to
+ * clean up rwb->readers it needs to acquire rtm->wait_lock. The
+ * worst case which can happen is a spurious wakeup.
+ */
+ owner = rt_mutex_owner(rtm);
+ if (owner)
+ rt_mutex_wake_q_add_task(&wqh, owner, state);
+
+ /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */
+ preempt_disable();
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ rt_mutex_wake_up_q(&wqh);
+}
+
+static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ /*
+ * rwb->readers can only hit 0 when a writer is waiting for the
+ * active readers to leave the critical section.
+ *
+ * dec_and_test() is fully ordered, provides RELEASE.
+ */
+ if (unlikely(atomic_dec_and_test(&rwb->readers)))
+ __rwbase_read_unlock(rwb, state);
+}
+
+static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
+ unsigned long flags)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+
+ /*
+ * _release() is needed in case that reader is in fast path, pairing
+ * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock().
+ */
+ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers);
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_rtmutex_unlock(rtm);
+}
+
+static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ __rwbase_write_unlock(rwb, WRITER_BIAS, flags);
+}
+
+static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ /* Release it and account current as reader */
+ __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
+}
+
+static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ /* Can do without CAS because we're serialized by wait_lock. */
+ lockdep_assert_held(&rwb->rtmutex.wait_lock);
+
+ /*
+ * _acquire is needed in case the reader is in the fast path, pairing
+ * with rwbase_read_unlock(), provides ACQUIRE.
+ */
+ if (!atomic_read_acquire(&rwb->readers)) {
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ /* Take the rtmutex as a first step */
+ if (rwbase_rtmutex_lock_state(rtm, state))
+ return -EINTR;
+
+ /* Force readers into slow path */
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ rwbase_pre_schedule();
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb))
+ goto out_unlock;
+
+ rwbase_set_and_save_current_state(state);
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
+ for (;;) {
+ /* Optimized out for rwlocks */
+ if (rwbase_signal_pending_state(state, current)) {
+ rwbase_restore_current_state();
+ __rwbase_write_unlock(rwb, 0, flags);
+ rwbase_post_schedule();
+ trace_contention_end(rwb, -EINTR);
+ return -EINTR;
+ }
+
+ if (__rwbase_write_trylock(rwb))
+ break;
+
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_schedule();
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+
+ set_current_state(state);
+ }
+ rwbase_restore_current_state();
+ trace_contention_end(rwb, 0);
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_post_schedule();
+ return 0;
+}
+
+static inline int rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ if (!rwbase_rtmutex_trylock(rtm))
+ return 0;
+
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb)) {
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 1;
+ }
+ __rwbase_write_unlock(rwb, 0, flags);
+ return 0;
+}
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
deleted file mode 100644
index 20819df98125..000000000000
--- a/kernel/locking/rwsem-spinlock.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
- * generic spinlock implementation
- *
- * Copyright (c) 2001 David Howells (dhowells@redhat.com).
- * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
- * - Derived also from comments by Linus
- */
-#include <linux/rwsem.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/export.h>
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-int rwsem_is_locked(struct rw_semaphore *sem)
-{
- int ret = 1;
- unsigned long flags;
-
- if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
- ret = (sem->count != 0);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- }
- return ret;
-}
-EXPORT_SYMBOL(rwsem_is_locked);
-
-/*
- * initialise the semaphore
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- sem->count = 0;
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-}
-EXPORT_SYMBOL(__init_rwsem);
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here, then:
- * - the 'active count' _reached_ zero
- * - the 'waiting count' is non-zero
- * - the spinlock must be held by the caller
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if wakewrite is non-zero
- */
-static inline struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
-{
- struct rwsem_waiter *waiter;
- struct task_struct *tsk;
- int woken;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wakewrite)
- /* Wake up a writer. Note that we do not grant it the
- * lock - it will have to acquire it when it runs. */
- wake_up_process(waiter->task);
- goto out;
- }
-
- /* grant an infinite number of read locks to the front of the queue */
- woken = 0;
- do {
- struct list_head *next = waiter->list.next;
-
- list_del(&waiter->list);
- tsk = waiter->task;
- /*
- * Make sure we do not wakeup the next reader before
- * setting the nil condition to grant the next reader;
- * otherwise we could miss the wakeup on the other
- * side and end up sleeping again. See the pairing
- * in rwsem_down_read_failed().
- */
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
- woken++;
- if (next == &sem->wait_list)
- break;
- waiter = list_entry(next, struct rwsem_waiter, list);
- } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-
- sem->count += woken;
-
- out:
- return sem;
-}
-
-/*
- * wake a single writer
- */
-static inline struct rw_semaphore *
-__rwsem_wake_one_writer(struct rw_semaphore *sem)
-{
- struct rwsem_waiter *waiter;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
- wake_up_process(waiter->task);
-
- return sem;
-}
-
-/*
- * get a read lock on the semaphore
- */
-void __sched __down_read(struct rw_semaphore *sem)
-{
- struct rwsem_waiter waiter;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->count++;
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- goto out;
- }
-
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- /* set up my own style of waitqueue */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(current);
-
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we don't need to touch the semaphore struct anymore */
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- /* wait to be given the lock */
- for (;;) {
- if (!waiter.task)
- break;
- schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
- }
-
- __set_current_state(TASK_RUNNING);
- out:
- ;
-}
-
-/*
- * trylock for reading -- returns 1 if successful, 0 if contention
- */
-int __down_read_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->count++;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * get a write lock on the semaphore
- */
-int __sched __down_write_common(struct rw_semaphore *sem, int state)
-{
- struct rwsem_waiter waiter;
- unsigned long flags;
- int ret = 0;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- /* set up my own style of waitqueue */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* wait for someone to release the lock */
- for (;;) {
- /*
- * That is the key to support write lock stealing: allows the
- * task already on CPU to get the lock soon rather than put
- * itself into sleep and waiting for system woke it or someone
- * else in the head of the wait list up.
- */
- if (sem->count == 0)
- break;
- if (signal_pending_state(state, current))
- goto out_nolock;
-
- set_current_state(state);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- schedule();
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
- }
- /* got the lock */
- sem->count = -1;
- list_del(&waiter.list);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-
-out_nolock:
- list_del(&waiter.list);
- if (!list_empty(&sem->wait_list) && sem->count >= 0)
- __rwsem_do_wake(sem, 0);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return -EINTR;
-}
-
-void __sched __down_write(struct rw_semaphore *sem)
-{
- __down_write_common(sem, TASK_UNINTERRUPTIBLE);
-}
-
-int __sched __down_write_killable(struct rw_semaphore *sem)
-{
- return __down_write_common(sem, TASK_KILLABLE);
-}
-
-/*
- * trylock for writing -- returns 1 if successful, 0 if contention
- */
-int __down_write_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count == 0) {
- /* got the lock */
- sem->count = -1;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * release a read lock on the semaphore
- */
-void __up_read(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (--sem->count == 0 && !list_empty(&sem->wait_list))
- sem = __rwsem_wake_one_writer(sem);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * release a write lock on the semaphore
- */
-void __up_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->count = 0;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 1);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * downgrade a write lock into a read lock
- * - just wake up any readers at the front of the queue
- */
-void __downgrade_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->count = 1;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 0);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
deleted file mode 100644
index 34e727f18e49..000000000000
--- a/kernel/locking/rwsem-xadd.c
+++ /dev/null
@@ -1,652 +0,0 @@
-/* rwsem.c: R/W semaphores: contention handling functions
- *
- * Written by David Howells (dhowells@redhat.com).
- * Derived from arch/i386/kernel/semaphore.c
- *
- * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
- * and Michel Lespinasse <walken@google.com>
- *
- * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
- * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
- */
-#include <linux/rwsem.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/debug.h>
-#include <linux/osq_lock.h>
-
-#include "rwsem.h"
-
-/*
- * Guide to the rw_semaphore's count field for common values.
- * (32-bit case illustrated, similar for 64-bit)
- *
- * 0x0000000X (1) X readers active or attempting lock, no writer waiting
- * X = #active_readers + #readers attempting to lock
- * (X*ACTIVE_BIAS)
- *
- * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
- * attempting to read lock or write lock.
- *
- * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
- * X = #active readers + # readers attempting lock
- * (X*ACTIVE_BIAS + WAITING_BIAS)
- * (2) 1 writer attempting lock, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- * (3) 1 writer active, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- *
- * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
- * (WAITING_BIAS + ACTIVE_BIAS)
- * (2) 1 writer active or attempting lock, no waiters for lock
- * (ACTIVE_WRITE_BIAS)
- *
- * 0xffff0000 (1) There are writers or readers queued but none active
- * or in the process of attempting lock.
- * (WAITING_BIAS)
- * Note: writer can attempt to steal lock for this count by adding
- * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
- *
- * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
- * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
- *
- * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
- * the count becomes more than 0 for successful lock acquisition,
- * i.e. the case where there are only readers or nobody has lock.
- * (1st and 2nd case above).
- *
- * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
- * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
- * acquisition (i.e. nobody else has lock or attempts lock). If
- * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
- * are only waiters but none active (5th case above), and attempt to
- * steal the lock.
- *
- */
-
-/*
- * Initialize an rwsem:
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- sem->owner = NULL;
- osq_lock_init(&sem->osq);
-#endif
-}
-
-EXPORT_SYMBOL(__init_rwsem);
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-enum rwsem_wake_type {
- RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
- RWSEM_WAKE_READERS, /* Wake readers only */
- RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
-};
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here from up_xxxx(), then:
- * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
- * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
- * - there must be someone on the queue
- * - the wait_lock must be held by the caller
- * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
- * to actually wakeup the blocked task(s) and drop the reference count,
- * preferably when the wait_lock is released
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only marked woken if downgrading is false
- */
-static void __rwsem_mark_wake(struct rw_semaphore *sem,
- enum rwsem_wake_type wake_type,
- struct wake_q_head *wake_q)
-{
- struct rwsem_waiter *waiter, *tmp;
- long oldcount, woken = 0, adjustment = 0;
-
- /*
- * Take a peek at the queue head waiter such that we can determine
- * the wakeup(s) to perform.
- */
- waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY) {
- /*
- * Mark writer at the front of the queue for wakeup.
- * Until the task is actually later awoken later by
- * the caller, other writers are able to steal it.
- * Readers, on the other hand, will block as they
- * will notice the queued writer.
- */
- wake_q_add(wake_q, waiter->task);
- }
-
- return;
- }
-
- /*
- * Writers might steal the lock before we grant it to the next reader.
- * We prefer to do the first reader grant before counting readers
- * so we can bail out early if a writer stole the lock.
- */
- if (wake_type != RWSEM_WAKE_READ_OWNED) {
- adjustment = RWSEM_ACTIVE_READ_BIAS;
- try_reader_grant:
- oldcount = atomic_long_fetch_add(adjustment, &sem->count);
- if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
- /*
- * If the count is still less than RWSEM_WAITING_BIAS
- * after removing the adjustment, it is assumed that
- * a writer has stolen the lock. We have to undo our
- * reader grant.
- */
- if (atomic_long_add_return(-adjustment, &sem->count) <
- RWSEM_WAITING_BIAS)
- return;
-
- /* Last active locker left. Retry waking readers. */
- goto try_reader_grant;
- }
- /*
- * It is not really necessary to set it to reader-owned here,
- * but it gives the spinners an early indication that the
- * readers now have the lock.
- */
- rwsem_set_reader_owned(sem);
- }
-
- /*
- * Grant an infinite number of read locks to the readers at the front
- * of the queue. We know that woken will be at least 1 as we accounted
- * for above. Note we increment the 'active part' of the count by the
- * number of readers before waking any processes up.
- */
- list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
- struct task_struct *tsk;
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE)
- break;
-
- woken++;
- tsk = waiter->task;
-
- wake_q_add(wake_q, tsk);
- list_del(&waiter->list);
- /*
- * Ensure that the last operation is setting the reader
- * waiter to nil such that rwsem_down_read_failed() cannot
- * race with do_exit() by always holding a reference count
- * to the task to wakeup.
- */
- smp_store_release(&waiter->task, NULL);
- }
-
- adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
- if (list_empty(&sem->wait_list)) {
- /* hit end of list above */
- adjustment -= RWSEM_WAITING_BIAS;
- }
-
- if (adjustment)
- atomic_long_add(adjustment, &sem->count);
-}
-
-/*
- * Wait for the read lock to be granted
- */
-__visible
-struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
-{
- long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
- struct rwsem_waiter waiter;
- DEFINE_WAKE_Q(wake_q);
-
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_READ;
-
- raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list))
- adjustment += RWSEM_WAITING_BIAS;
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- count = atomic_long_add_return(adjustment, &sem->count);
-
- /*
- * If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (count == RWSEM_WAITING_BIAS ||
- (count > RWSEM_WAITING_BIAS &&
- adjustment != -RWSEM_ACTIVE_READ_BIAS))
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
-
- /* wait to be given the lock */
- while (true) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!waiter.task)
- break;
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
- return sem;
-}
-EXPORT_SYMBOL(rwsem_down_read_failed);
-
-/*
- * This function must be called with the sem->wait_lock held to prevent
- * race conditions between checking the rwsem wait list and setting the
- * sem->count accordingly.
- */
-static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
-{
- /*
- * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
- */
- if (count != RWSEM_WAITING_BIAS)
- return false;
-
- /*
- * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
- * are other tasks on the wait list, we need to add on WAITING_BIAS.
- */
- count = list_is_singular(&sem->wait_list) ?
- RWSEM_ACTIVE_WRITE_BIAS :
- RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
-
- if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
- == RWSEM_WAITING_BIAS) {
- rwsem_set_owner(sem);
- return true;
- }
-
- return false;
-}
-
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-/*
- * Try to acquire write lock before the writer has been put on wait queue.
- */
-static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
-{
- long old, count = atomic_long_read(&sem->count);
-
- while (true) {
- if (!(count == 0 || count == RWSEM_WAITING_BIAS))
- return false;
-
- old = atomic_long_cmpxchg_acquire(&sem->count, count,
- count + RWSEM_ACTIVE_WRITE_BIAS);
- if (old == count) {
- rwsem_set_owner(sem);
- return true;
- }
-
- count = old;
- }
-}
-
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
-{
- struct task_struct *owner;
- bool ret = true;
-
- if (need_resched())
- return false;
-
- rcu_read_lock();
- owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner)) {
- /*
- * Don't spin if the rwsem is readers owned.
- */
- ret = !rwsem_owner_is_reader(owner);
- goto done;
- }
-
- /*
- * As lock holder preemption issue, we both skip spinning if task is not
- * on cpu or its cpu is preempted
- */
- ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-done:
- rcu_read_unlock();
- return ret;
-}
-
-/*
- * Return true only if we can still spin on the owner field of the rwsem.
- */
-static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
-{
- struct task_struct *owner = READ_ONCE(sem->owner);
-
- if (!rwsem_owner_is_writer(owner))
- goto out;
-
- rcu_read_lock();
- while (sem->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking sem->owner still matches owner, if that fails,
- * owner might point to free()d memory, if it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- /*
- * abort spinning when need_resched or owner is not running or
- * owner's cpu is preempted.
- */
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
- rcu_read_unlock();
- return false;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-out:
- /*
- * If there is a new owner or the owner is not set, we continue
- * spinning.
- */
- return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
-}
-
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
-{
- bool taken = false;
-
- preempt_disable();
-
- /* sem->wait_lock should not be held when doing optimistic spinning */
- if (!rwsem_can_spin_on_owner(sem))
- goto done;
-
- if (!osq_lock(&sem->osq))
- goto done;
-
- /*
- * Optimistically spin on the owner field and attempt to acquire the
- * lock whenever the owner changes. Spinning will be stopped when:
- * 1) the owning writer isn't running; or
- * 2) readers own the lock as we can't determine if they are
- * actively running or not.
- */
- while (rwsem_spin_on_owner(sem)) {
- /*
- * Try to acquire the lock
- */
- if (rwsem_try_write_lock_unqueued(sem)) {
- taken = true;
- break;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!sem->owner && (need_resched() || rt_task(current)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax();
- }
- osq_unlock(&sem->osq);
-done:
- preempt_enable();
- return taken;
-}
-
-/*
- * Return true if the rwsem has active spinner
- */
-static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
-{
- return osq_is_locked(&sem->osq);
-}
-
-#else
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
-{
- return false;
-}
-
-static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
-{
- return false;
-}
-#endif
-
-/*
- * Wait until we successfully acquire the write lock
- */
-static inline struct rw_semaphore *
-__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
-{
- long count;
- bool waiting = true; /* any queued threads before us */
- struct rwsem_waiter waiter;
- struct rw_semaphore *ret = sem;
- DEFINE_WAKE_Q(wake_q);
-
- /* undo write bias from down_write operation, stop active locking */
- count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
-
- /* do optimistic spinning and steal lock if possible */
- if (rwsem_optimistic_spin(sem))
- return sem;
-
- /*
- * Optimistic spinning failed, proceed to the slowpath
- * and block until we can acquire the sem.
- */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
-
- raw_spin_lock_irq(&sem->wait_lock);
-
- /* account for this before adding a new element to the list */
- if (list_empty(&sem->wait_list))
- waiting = false;
-
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- if (waiting) {
- count = atomic_long_read(&sem->count);
-
- /*
- * If there were already threads queued before us and there are
- * no active writers, the lock must be read owned; so we try to
- * wake any read locks that were queued ahead of us.
- */
- if (count > RWSEM_WAITING_BIAS) {
- __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
- /*
- * The wakeup is normally called _after_ the wait_lock
- * is released, but given that we are proactively waking
- * readers we can deal with the wake_q overhead as it is
- * similar to releasing and taking the wait_lock again
- * for attempting rwsem_try_write_lock().
- */
- wake_up_q(&wake_q);
-
- /*
- * Reinitialize wake_q after use.
- */
- wake_q_init(&wake_q);
- }
-
- } else
- count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
-
- /* wait until we successfully acquire the lock */
- set_current_state(state);
- while (true) {
- if (rwsem_try_write_lock(count, sem))
- break;
- raw_spin_unlock_irq(&sem->wait_lock);
-
- /* Block until there are no active lockers. */
- do {
- if (signal_pending_state(state, current))
- goto out_nolock;
-
- schedule();
- set_current_state(state);
- } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
-
- raw_spin_lock_irq(&sem->wait_lock);
- }
- __set_current_state(TASK_RUNNING);
- list_del(&waiter.list);
- raw_spin_unlock_irq(&sem->wait_lock);
-
- return ret;
-
-out_nolock:
- __set_current_state(TASK_RUNNING);
- raw_spin_lock_irq(&sem->wait_lock);
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list))
- atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
- else
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
-
- return ERR_PTR(-EINTR);
-}
-
-__visible struct rw_semaphore * __sched
-rwsem_down_write_failed(struct rw_semaphore *sem)
-{
- return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(rwsem_down_write_failed);
-
-__visible struct rw_semaphore * __sched
-rwsem_down_write_failed_killable(struct rw_semaphore *sem)
-{
- return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(rwsem_down_write_failed_killable);
-
-/*
- * handle waking up a waiter on the semaphore
- * - up_read/up_write has decremented the active part of count if we come here
- */
-__visible
-struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
- DEFINE_WAKE_Q(wake_q);
-
- /*
- * If a spinner is present, it is not necessary to do the wakeup.
- * Try to do wakeup only if the trylock succeeds to minimize
- * spinlock contention which may introduce too much delay in the
- * unlock operation.
- *
- * spinning writer up_write/up_read caller
- * --------------- -----------------------
- * [S] osq_unlock() [L] osq
- * MB RMB
- * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
- *
- * Here, it is important to make sure that there won't be a missed
- * wakeup while the rwsem is free and the only spinning writer goes
- * to sleep without taking the rwsem. Even when the spinning writer
- * is just going to break out of the waiting loop, it will still do
- * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
- * rwsem_has_spinner() is true, it will guarantee at least one
- * trylock attempt on the rwsem later on.
- */
- if (rwsem_has_spinner(sem)) {
- /*
- * The smp_rmb() here is to make sure that the spinner
- * state is consulted before reading the wait_lock.
- */
- smp_rmb();
- if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
- return sem;
- goto locked;
- }
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-locked:
-
- if (!list_empty(&sem->wait_list))
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- wake_up_q(&wake_q);
-
- return sem;
-}
-EXPORT_SYMBOL(rwsem_wake);
-
-/*
- * downgrade a write lock into a read lock
- * - caller incremented waiting part of count and discovered it still negative
- * - just wake up any readers at the front of the queue
- */
-__visible
-struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
- DEFINE_WAKE_Q(wake_q);
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (!list_empty(&sem->wait_list))
- __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- wake_up_q(&wake_q);
-
- return sem;
-}
-EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 4d48b1c4870d..24df4d98f7d2 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1,18 +1,1532 @@
+// SPDX-License-Identifier: GPL-2.0
/* kernel/rwsem.c: R/W semaphores, public implementation
*
* Written by David Howells (dhowells@redhat.com).
* Derived from asm-i386/semaphore.h
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
+ * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
+ *
+ * Rwsem count bit fields re-definition and rwsem rearchitecture by
+ * Waiman Long <longman@redhat.com> and
+ * Peter Zijlstra <peterz@infradead.org>.
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/clock.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <linux/hung_task.h>
+#include <trace/events/lock.h>
+
+#ifndef CONFIG_PREEMPT_RT
+#include "lock_events.h"
+
+/*
+ * The least significant 2 bits of the owner value has the following
+ * meanings when set.
+ * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
+ * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
+ *
+ * When the rwsem is reader-owned and a spinning writer has timed out,
+ * the nonspinnable bit will be set to disable optimistic spinning.
+
+ * When a writer acquires a rwsem, it puts its task_struct pointer
+ * into the owner field. It is cleared after an unlock.
+ *
+ * When a reader acquires a rwsem, it will also puts its task_struct
+ * pointer into the owner field with the RWSEM_READER_OWNED bit set.
+ * On unlock, the owner field will largely be left untouched. So
+ * for a free or reader-owned rwsem, the owner value may contain
+ * information about the last reader that acquires the rwsem.
+ *
+ * That information may be helpful in debugging cases where the system
+ * seems to hang on a reader owned rwsem especially if only one reader
+ * is involved. Ideally we would like to track all the readers that own
+ * a rwsem, but the overhead is simply too big.
+ *
+ * A fast path reader optimistic lock stealing is supported when the rwsem
+ * is previously owned by a writer and the following conditions are met:
+ * - rwsem is not currently writer owned
+ * - the handoff isn't set.
+ */
+#define RWSEM_READER_OWNED (1UL << 0)
+#define RWSEM_NONSPINNABLE (1UL << 1)
+#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
+
+#ifdef CONFIG_DEBUG_RWSEMS
+# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
+ if (!debug_locks_silent && \
+ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
+ #c, atomic_long_read(&(sem)->count), \
+ (unsigned long) sem->magic, \
+ atomic_long_read(&(sem)->owner), (long)current, \
+ list_empty(&(sem)->wait_list) ? "" : "not ")) \
+ debug_locks_off(); \
+ } while (0)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c, sem)
+#endif
+
+/*
+ * On 64-bit architectures, the bit definitions of the count are:
+ *
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-62 - 55-bit reader count
+ * Bit 63 - read fail bit
+ *
+ * On 32-bit architectures, the bit definitions of the count are:
+ *
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-30 - 23-bit reader count
+ * Bit 31 - read fail bit
+ *
+ * It is not likely that the most significant bit (read fail bit) will ever
+ * be set. This guard bit is still checked anyway in the down_read() fastpath
+ * just in case we need to use up more of the reader bits for other purpose
+ * in the future.
+ *
+ * atomic_long_fetch_add() is used to obtain reader lock, whereas
+ * atomic_long_cmpxchg() will be used to obtain writer lock.
+ *
+ * There are three places where the lock handoff bit may be set or cleared.
+ * 1) rwsem_mark_wake() for readers -- set, clear
+ * 2) rwsem_try_write_lock() for writers -- set, clear
+ * 3) rwsem_del_waiter() -- clear
+ *
+ * For all the above cases, wait_lock will be held. A writer must also
+ * be the first one in the wait_list to be eligible for setting the handoff
+ * bit. So concurrent setting/clearing of handoff bit is not possible.
+ */
+#define RWSEM_WRITER_LOCKED (1UL << 0)
+#define RWSEM_FLAG_WAITERS (1UL << 1)
+#define RWSEM_FLAG_HANDOFF (1UL << 2)
+#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
+
+#define RWSEM_READER_SHIFT 8
+#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
+#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1))
+#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
+#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
+#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
+ RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
+
+/*
+ * All writes to owner are protected by WRITE_ONCE() to make sure that
+ * store tearing can't happen as optimistic spinners may read and use
+ * the owner value concurrently without lock. Read from owner, however,
+ * may not need READ_ONCE() as long as the pointer value is only used
+ * for comparison and isn't being dereferenced.
+ *
+ * Both rwsem_{set,clear}_owner() functions should be in the same
+ * preempt disable section as the atomic op that changes sem->count.
+ */
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ lockdep_assert_preemption_disabled();
+ atomic_long_set(&sem->owner, (long)current);
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ lockdep_assert_preemption_disabled();
+ atomic_long_set(&sem->owner, 0);
+}
+
+/*
+ * Test the flags in the owner field.
+ */
+static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
+{
+ return atomic_long_read(&sem->owner) & flags;
+}
+
+/*
+ * The task_struct pointer of the last owning reader will be left in
+ * the owner field.
+ *
+ * Note that the owner value just indicates the task has owned the rwsem
+ * previously, it may not be the real owner or one of the real owners
+ * anymore when that field is examined, so take it with a grain of salt.
+ *
+ * The reader non-spinnable bit is preserved.
+ */
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+ unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
+ (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
+
+ atomic_long_set(&sem->owner, val);
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+ __rwsem_set_reader_owned(sem, current);
+}
+
+#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
+/*
+ * Return just the real task structure pointer of the owner
+ */
+struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+{
+ return (struct task_struct *)
+ (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
+/*
+ * Return true if the rwsem is owned by a reader.
+ */
+bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+ /*
+ * Check the count to see if it is write-locked.
+ */
+ long count = atomic_long_read(&sem->count);
+
+ if (count & RWSEM_WRITER_MASK)
+ return false;
+ return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
+}
+
+/*
+ * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured,
+ * it will make sure that the owner field of a reader-owned rwsem either
+ * points to a real reader-owner(s) or gets cleared. The only exception is
+ * when the unlock is done by up_read_non_owner().
+ */
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+ unsigned long val = atomic_long_read(&sem->owner);
+
+ while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
+ if (atomic_long_try_cmpxchg(&sem->owner, &val,
+ val & RWSEM_OWNER_FLAGS_MASK))
+ return;
+ }
+}
+#else
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+}
+#endif
+
+/*
+ * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag
+ * remains set. Otherwise, the operation will be aborted.
+ */
+static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ do {
+ if (!(owner & RWSEM_READER_OWNED))
+ break;
+ if (owner & RWSEM_NONSPINNABLE)
+ break;
+ } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
+ owner | RWSEM_NONSPINNABLE));
+}
+
+static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
+{
+ *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
+
+ if (WARN_ON_ONCE(*cntp < 0))
+ rwsem_set_nonspinnable(sem);
+
+ if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
+ rwsem_set_reader_owned(sem);
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Return the real task structure pointer of the owner and the embedded
+ * flags in the owner. pflags must be non-NULL.
+ */
+static inline struct task_struct *
+rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ *pflags = owner & RWSEM_OWNER_FLAGS_MASK;
+ return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
+/*
+ * Guide to the rw_semaphore's count field.
+ *
+ * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
+ * by a writer.
+ *
+ * The lock is owned by readers when
+ * (1) the RWSEM_WRITER_LOCKED isn't set in count,
+ * (2) some of the reader bits are set in count, and
+ * (3) the owner field has RWSEM_READ_OWNED bit set.
+ *
+ * Having some reader bits set is not enough to guarantee a readers owned
+ * lock as the readers may be in the process of backing out from the count
+ * and a writer has just released the lock. So another writer may steal
+ * the lock immediately after that.
+ */
+
+/*
+ * Initialize an rwsem:
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
+#endif
+#ifdef CONFIG_DEBUG_RWSEMS
+ sem->magic = sem;
+#endif
+ atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+ atomic_long_set(&sem->owner, 0L);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ osq_lock_init(&sem->osq);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+ unsigned long timeout;
+ bool handoff_set;
+};
+#define rwsem_first_waiter(sem) \
+ list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
+
+enum rwsem_wake_type {
+ RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
+ RWSEM_WAKE_READERS, /* Wake readers only */
+ RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
+};
-#include "rwsem.h"
+/*
+ * The typical HZ value is either 250 or 1000. So set the minimum waiting
+ * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
+ * queue before initiating the handoff protocol.
+ */
+#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250)
+
+/*
+ * Magic number to batch-wakeup waiting readers, even when writers are
+ * also present in the queue. This both limits the amount of work the
+ * waking thread must do and also prevents any potential counter overflow,
+ * however unlikely.
+ */
+#define MAX_READERS_WAKEUP 0x100
+
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_add_tail(&waiter->list, &sem->wait_list);
+ /* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ *
+ * Return: true if wait_list isn't empty and false otherwise
+ */
+static inline bool
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_del(&waiter->list);
+ if (likely(!list_empty(&sem->wait_list)))
+ return true;
+
+ atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+ return false;
+}
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
+ * have been set.
+ * - there must be someone on the queue
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
+ */
+static void rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type,
+ struct wake_q_head *wake_q)
+{
+ struct rwsem_waiter *waiter, *tmp;
+ long oldcount, woken = 0, adjustment = 0;
+ struct list_head wlist;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ /*
+ * Take a peek at the queue head waiter such that we can determine
+ * the wakeup(s) to perform.
+ */
+ waiter = rwsem_first_waiter(sem);
+
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+ * Until the task is actually later awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
+ */
+ wake_q_add(wake_q, waiter->task);
+ lockevent_inc(rwsem_wake_writer);
+ }
+
+ return;
+ }
+
+ /*
+ * No reader wakeup if there are too many of them already.
+ */
+ if (unlikely(atomic_long_read(&sem->count) < 0))
+ return;
+
+ /*
+ * Writers might steal the lock before we grant it to the next reader.
+ * We prefer to do the first reader grant before counting readers
+ * so we can bail out early if a writer stole the lock.
+ */
+ if (wake_type != RWSEM_WAKE_READ_OWNED) {
+ struct task_struct *owner;
+
+ adjustment = RWSEM_READER_BIAS;
+ oldcount = atomic_long_fetch_add(adjustment, &sem->count);
+ if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
+ /*
+ * When we've been waiting "too" long (for writers
+ * to give up the lock), request a HANDOFF to
+ * force the issue.
+ */
+ if (time_after(jiffies, waiter->timeout)) {
+ if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+ waiter->handoff_set = true;
+ }
+
+ atomic_long_add(-adjustment, &sem->count);
+ return;
+ }
+ /*
+ * Set it to reader-owned to give spinners an early
+ * indication that readers now have the lock.
+ * The reader nonspinnable bit seen at slowpath entry of
+ * the reader is copied over.
+ */
+ owner = waiter->task;
+ __rwsem_set_reader_owned(sem, owner);
+ }
+
+ /*
+ * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
+ * queue. We know that the woken will be at least 1 as we accounted
+ * for above. Note we increment the 'active part' of the count by the
+ * number of readers before waking any processes up.
+ *
+ * This is an adaptation of the phase-fair R/W locks where at the
+ * reader phase (first waiter is a reader), all readers are eligible
+ * to acquire the lock at the same time irrespective of their order
+ * in the queue. The writers acquire the lock according to their
+ * order in the queue.
+ *
+ * We have to do wakeup in 2 passes to prevent the possibility that
+ * the reader count may be decremented before it is incremented. It
+ * is because the to-be-woken waiter may not have slept yet. So it
+ * may see waiter->task got cleared, finish its critical section and
+ * do an unlock before the reader count increment.
+ *
+ * 1) Collect the read-waiters in a separate list, count them and
+ * fully increment the reader count in rwsem.
+ * 2) For each waiters in the new list, clear waiter->task and
+ * put them into wake_q to be woken up later.
+ */
+ INIT_LIST_HEAD(&wlist);
+ list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE)
+ continue;
+
+ woken++;
+ list_move_tail(&waiter->list, &wlist);
+
+ /*
+ * Limit # of readers that can be woken up per wakeup call.
+ */
+ if (unlikely(woken >= MAX_READERS_WAKEUP))
+ break;
+ }
+
+ adjustment = woken * RWSEM_READER_BIAS - adjustment;
+ lockevent_cond_inc(rwsem_wake_reader, woken);
+
+ oldcount = atomic_long_read(&sem->count);
+ if (list_empty(&sem->wait_list)) {
+ /*
+ * Combined with list_move_tail() above, this implies
+ * rwsem_del_waiter().
+ */
+ adjustment -= RWSEM_FLAG_WAITERS;
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ } else if (woken) {
+ /*
+ * When we've woken a reader, we no longer need to force
+ * writers to give up the lock and we can clear HANDOFF.
+ */
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ }
+
+ if (adjustment)
+ atomic_long_add(adjustment, &sem->count);
+
+ /* 2nd pass */
+ list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+ struct task_struct *tsk;
+
+ tsk = waiter->task;
+ get_task_struct(tsk);
+
+ /*
+ * Ensure calling get_task_struct() before setting the reader
+ * waiter to nil such that rwsem_down_read_slowpath() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
+ */
+ smp_store_release(&waiter->task, NULL);
+ /*
+ * Ensure issuing the wakeup (either by us or someone else)
+ * after setting the reader waiter to nil.
+ */
+ wake_q_add_safe(wake_q, tsk);
+ }
+}
+
+/*
+ * Remove a waiter and try to wake up other waiters in the wait queue
+ * This function is called from the out_nolock path of both the reader and
+ * writer slowpaths with wait_lock held. It releases the wait_lock and
+ * optionally wake up waiters before it returns.
+ */
+static inline void
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&sem->wait_lock)
+{
+ bool first = rwsem_first_waiter(sem) == waiter;
+
+ wake_q_init(wake_q);
+
+ /*
+ * If the wait_list isn't empty and the waiter to be deleted is
+ * the first waiter, we wake up the remaining waiters as they may
+ * be eligible to acquire or spin on the lock.
+ */
+ if (rwsem_del_waiter(sem, waiter) && first)
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ if (!wake_q_empty(wake_q))
+ wake_up_q(wake_q);
+}
+
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ *
+ * Implies rwsem_del_waiter() on success.
+ */
+static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
+ struct rwsem_waiter *waiter)
+{
+ struct rwsem_waiter *first = rwsem_first_waiter(sem);
+ long count, new;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ count = atomic_long_read(&sem->count);
+ do {
+ bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
+
+ if (has_handoff) {
+ /*
+ * Honor handoff bit and yield only when the first
+ * waiter is the one that set it. Otherwisee, we
+ * still try to acquire the rwsem.
+ */
+ if (first->handoff_set && (waiter != first))
+ return false;
+ }
+
+ new = count;
+
+ if (count & RWSEM_LOCK_MASK) {
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
+ if (has_handoff || (!rt_or_dl_task(waiter->task) &&
+ !time_after(jiffies, waiter->timeout)))
+ return false;
+
+ new |= RWSEM_FLAG_HANDOFF;
+ } else {
+ new |= RWSEM_WRITER_LOCKED;
+ new &= ~RWSEM_FLAG_HANDOFF;
+
+ if (list_is_singular(&sem->wait_list))
+ new &= ~RWSEM_FLAG_WAITERS;
+ }
+ } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
+
+ /*
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
+ */
+ if (new & RWSEM_FLAG_HANDOFF) {
+ first->handoff_set = true;
+ lockevent_inc(rwsem_wlock_handoff);
+ return false;
+ }
+
+ /*
+ * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+ * success.
+ */
+ list_del(&waiter->list);
+ rwsem_set_owner(sem);
+ return true;
+}
+
+/*
+ * The rwsem_spin_on_owner() function returns the following 4 values
+ * depending on the lock owner state.
+ * OWNER_NULL : owner is currently NULL
+ * OWNER_WRITER: when owner changes and is a writer
+ * OWNER_READER: when owner changes and the new owner may be a reader.
+ * OWNER_NONSPINNABLE:
+ * when optimistic spinning has to stop because either the
+ * owner stops running, is unknown, or its timeslice has
+ * been used up.
+ */
+enum owner_state {
+ OWNER_NULL = 1 << 0,
+ OWNER_WRITER = 1 << 1,
+ OWNER_READER = 1 << 2,
+ OWNER_NONSPINNABLE = 1 << 3,
+};
+
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+
+ while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
+ count | RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ lockevent_inc(rwsem_opt_lock);
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ unsigned long flags;
+ bool ret = true;
+
+ if (need_resched()) {
+ lockevent_inc(rwsem_opt_fail);
+ return false;
+ }
+
+ /*
+ * Disable preemption is equal to the RCU read-side crital section,
+ * thus the task_strcut structure won't go away.
+ */
+ owner = rwsem_owner_flags(sem, &flags);
+ /*
+ * Don't check the read-owner as the entry may be stale.
+ */
+ if ((flags & RWSEM_NONSPINNABLE) ||
+ (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
+ ret = false;
+
+ lockevent_cond_inc(rwsem_opt_fail, !ret);
+ return ret;
+}
+
+static inline enum owner_state
+rwsem_owner_state(struct task_struct *owner, unsigned long flags)
+{
+ if (flags & RWSEM_NONSPINNABLE)
+ return OWNER_NONSPINNABLE;
+
+ if (flags & RWSEM_READER_OWNED)
+ return OWNER_READER;
+
+ return owner ? OWNER_WRITER : OWNER_NULL;
+}
+
+static noinline enum owner_state
+rwsem_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *new, *owner;
+ unsigned long flags, new_flags;
+ enum owner_state state;
+
+ lockdep_assert_preemption_disabled();
+
+ owner = rwsem_owner_flags(sem, &flags);
+ state = rwsem_owner_state(owner, flags);
+ if (state != OWNER_WRITER)
+ return state;
+
+ for (;;) {
+ /*
+ * When a waiting writer set the handoff flag, it may spin
+ * on the owner as well. Once that writer acquires the lock,
+ * we can spin on it. So we don't need to quit even when the
+ * handoff bit is set.
+ */
+ new = rwsem_owner_flags(sem, &new_flags);
+ if ((new != owner) || (new_flags != flags)) {
+ state = rwsem_owner_state(new, new_flags);
+ break;
+ }
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * our spinning context already disabled preemption which is
+ * equal to RCU read-side crital section ensures the memory
+ * stays valid.
+ */
+ barrier();
+
+ if (need_resched() || !owner_on_cpu(owner)) {
+ state = OWNER_NONSPINNABLE;
+ break;
+ }
+
+ cpu_relax();
+ }
+
+ return state;
+}
+
+/*
+ * Calculate reader-owned rwsem spinning threshold for writer
+ *
+ * The more readers own the rwsem, the longer it will take for them to
+ * wind down and free the rwsem. So the empirical formula used to
+ * determine the actual spinning time limit here is:
+ *
+ * Spinning threshold = (10 + nr_readers/2)us
+ *
+ * The limit is capped to a maximum of 25us (30 readers). This is just
+ * a heuristic and is subjected to change in the future.
+ */
+static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+ int readers = count >> RWSEM_READER_SHIFT;
+ u64 delta;
+
+ if (readers > 30)
+ readers = 30;
+ delta = (20 + readers) * NSEC_PER_USEC / 2;
+
+ return sched_clock() + delta;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ bool taken = false;
+ int prev_owner_state = OWNER_NULL;
+ int loop = 0;
+ u64 rspin_threshold = 0;
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ /*
+ * Optimistically spin on the owner field and attempt to acquire the
+ * lock whenever the owner changes. Spinning will be stopped when:
+ * 1) the owning writer isn't running; or
+ * 2) readers own the lock and spinning time has exceeded limit.
+ */
+ for (;;) {
+ enum owner_state owner_state;
+
+ owner_state = rwsem_spin_on_owner(sem);
+ if (owner_state == OWNER_NONSPINNABLE)
+ break;
+
+ /*
+ * Try to acquire the lock
+ */
+ taken = rwsem_try_write_lock_unqueued(sem);
+
+ if (taken)
+ break;
+
+ /*
+ * Time-based reader-owned rwsem optimistic spinning
+ */
+ if (owner_state == OWNER_READER) {
+ /*
+ * Re-initialize rspin_threshold every time when
+ * the owner state changes from non-reader to reader.
+ * This allows a writer to steal the lock in between
+ * 2 reader phases and have the threshold reset at
+ * the beginning of the 2nd reader phase.
+ */
+ if (prev_owner_state != OWNER_READER) {
+ if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
+ break;
+ rspin_threshold = rwsem_rspin_threshold(sem);
+ loop = 0;
+ }
+
+ /*
+ * Check time threshold once every 16 iterations to
+ * avoid calling sched_clock() too frequently so
+ * as to reduce the average latency between the times
+ * when the lock becomes free and when the spinner
+ * is ready to do a trylock.
+ */
+ else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
+ rwsem_set_nonspinnable(sem);
+ lockevent_inc(rwsem_opt_nospin);
+ break;
+ }
+ }
+
+ /*
+ * An RT task cannot do optimistic spinning if it cannot
+ * be sure the lock holder is running or live-lock may
+ * happen if the current task and the lock holder happen
+ * to run in the same CPU. However, aborting optimistic
+ * spinning while a NULL owner is detected may miss some
+ * opportunity where spinning can continue without causing
+ * problem.
+ *
+ * There are 2 possible cases where an RT task may be able
+ * to continue spinning.
+ *
+ * 1) The lock owner is in the process of releasing the
+ * lock, sem->owner is cleared but the lock has not
+ * been released yet.
+ * 2) The lock was free and owner cleared, but another
+ * task just comes in and acquire the lock before
+ * we try to get it. The new owner may be a spinnable
+ * writer.
+ *
+ * To take advantage of two scenarios listed above, the RT
+ * task is made to retry one more time to see if it can
+ * acquire the lock or continue spinning on the new owning
+ * writer. Of course, if the time lag is long enough or the
+ * new owner is not a writer or spinnable, the RT task will
+ * quit spinning.
+ *
+ * If the owner is a writer, the need_resched() check is
+ * done inside rwsem_spin_on_owner(). If the owner is not
+ * a writer, need_resched() check needs to be done here.
+ */
+ if (owner_state != OWNER_WRITER) {
+ if (need_resched())
+ break;
+ if (rt_or_dl_task(current) &&
+ (prev_owner_state != OWNER_WRITER))
+ break;
+ }
+ prev_owner_state = owner_state;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ lockevent_cond_inc(rwsem_opt_fail, !taken);
+ return taken;
+}
+
+/*
+ * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
+ * only be called when the reader count reaches 0.
+ */
+static inline void clear_nonspinnable(struct rw_semaphore *sem)
+{
+ if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
+ atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
+}
+
+#else
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ return false;
+}
+
+static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+
+static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
+
+static inline enum owner_state
+rwsem_spin_on_owner(struct rw_semaphore *sem)
+{
+ return OWNER_NONSPINNABLE;
+}
+#endif
+
+/*
+ * Prepare to wake up waiter(s) in the wait queue by putting them into the
+ * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
+ * reader-owned, wake up read lock waiters in queue front or wake up any
+ * front waiter otherwise.
+
+ * This is being called from both reader and writer slow paths.
+ */
+static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
+ struct wake_q_head *wake_q)
+{
+ enum rwsem_wake_type wake_type;
+
+ if (count & RWSEM_WRITER_MASK)
+ return;
+
+ if (count & RWSEM_READER_MASK) {
+ wake_type = RWSEM_WAKE_READERS;
+ } else {
+ wake_type = RWSEM_WAKE_ANY;
+ clear_nonspinnable(sem);
+ }
+ rwsem_mark_wake(sem, wake_type, wake_q);
+}
+
+/*
+ * Wait for the read lock to be granted
+ */
+static struct rw_semaphore __sched *
+rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
+{
+ long adjustment = -RWSEM_READER_BIAS;
+ long rcnt = (count >> RWSEM_READER_SHIFT);
+ struct rwsem_waiter waiter;
+ DEFINE_WAKE_Q(wake_q);
+
+ /*
+ * To prevent a constant stream of readers from starving a sleeping
+ * writer, don't attempt optimistic lock stealing if the lock is
+ * very likely owned by readers.
+ */
+ if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
+ (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
+ goto queue;
+
+ /*
+ * Reader optimistic lock stealing.
+ */
+ if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_rlock_steal);
+
+ /*
+ * Wake up other readers in the wait queue if it is
+ * the first reader.
+ */
+ if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
+ &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ }
+ return sem;
+ }
+
+queue:
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+ waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (list_empty(&sem->wait_list)) {
+ /*
+ * In case the wait queue is empty and the lock isn't owned
+ * by a writer, this reader can exit the slowpath and return
+ * immediately as its RWSEM_READER_BIAS has already been set
+ * in the count.
+ */
+ if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
+ /* Provide lock ACQUIRE */
+ smp_acquire__after_ctrl_dep();
+ raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_rlock_fast);
+ return sem;
+ }
+ adjustment += RWSEM_FLAG_WAITERS;
+ }
+ rwsem_add_waiter(sem, &waiter);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ count = atomic_long_add_return(adjustment, &sem->count);
+
+ rwsem_cond_wake_waiter(sem, count, &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
+
+ trace_contention_begin(sem, LCB_F_READ);
+ set_current_state(state);
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER);
+
+ /* wait to be given the lock */
+ for (;;) {
+ if (!smp_load_acquire(&waiter.task)) {
+ /* Matches rwsem_mark_wake()'s smp_store_release(). */
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
+ break;
+ }
+ schedule_preempt_disabled();
+ lockevent_inc(rwsem_sleep_reader);
+ set_current_state(state);
+ }
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock);
+ trace_contention_end(sem, 0);
+ return sem;
+
+out_nolock:
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock_fail);
+ trace_contention_end(sem, -EINTR);
+ return ERR_PTR(-EINTR);
+}
+
+/*
+ * Wait until we successfully acquire the write lock
+ */
+static struct rw_semaphore __sched *
+rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
+{
+ struct rwsem_waiter waiter;
+ DEFINE_WAKE_Q(wake_q);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
+ /* rwsem_optimistic_spin() implies ACQUIRE on success */
+ return sem;
+ }
+
+ /*
+ * Optimistic spinning failed, proceed to the slowpath
+ * and block until we can acquire the sem.
+ */
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+ waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ rwsem_add_waiter(sem, &waiter);
+
+ /* we're now waiting on the lock */
+ if (rwsem_first_waiter(sem) != &waiter) {
+ rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
+ &wake_q);
+ if (!wake_q_empty(&wake_q)) {
+ /*
+ * We want to minimize wait_lock hold time especially
+ * when a large number of readers are to be woken up.
+ */
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+ } else {
+ atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
+ }
+
+ /* wait until we successfully acquire the lock */
+ set_current_state(state);
+ trace_contention_begin(sem, LCB_F_WRITE);
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
+
+ for (;;) {
+ if (rwsem_try_write_lock(sem, &waiter)) {
+ /* rwsem_try_write_lock() implies ACQUIRE on success */
+ break;
+ }
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
+ /*
+ * After setting the handoff bit and failing to acquire
+ * the lock, attempt to spin on owner to accelerate lock
+ * transfer. If the previous owner is a on-cpu writer and it
+ * has just released the lock, OWNER_NULL will be returned.
+ * In this case, we attempt to acquire the lock again
+ * without sleeping.
+ */
+ if (waiter.handoff_set) {
+ enum owner_state owner_state;
+
+ owner_state = rwsem_spin_on_owner(sem);
+ if (owner_state == OWNER_NULL)
+ goto trylock_again;
+ }
+
+ schedule_preempt_disabled();
+ lockevent_inc(rwsem_sleep_writer);
+ set_current_state(state);
+trylock_again:
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
+ __set_current_state(TASK_RUNNING);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ lockevent_inc(rwsem_wlock);
+ trace_contention_end(sem, 0);
+ return sem;
+
+out_nolock:
+ __set_current_state(TASK_RUNNING);
+ raw_spin_lock_irq(&sem->wait_lock);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
+ lockevent_inc(rwsem_wlock_fail);
+ trace_contention_end(sem, -EINTR);
+ return ERR_PTR(-EINTR);
+}
+
+/*
+ * handle waking up a waiter on the semaphore
+ * - up_read/up_write has decremented the active part of count if we come here
+ */
+static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - caller incremented waiting part of count and discovered it still negative
+ * - just wake up any readers at the front of the queue
+ */
+static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+
+/*
+ * lock for reading
+ */
+static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
+{
+ int ret = 0;
+ long count;
+
+ preempt_disable();
+ if (!rwsem_read_trylock(sem, &count)) {
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ }
+out:
+ preempt_enable();
+ return ret;
+}
+
+static __always_inline void __down_read(struct rw_semaphore *sem)
+{
+ __down_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_INTERRUPTIBLE);
+}
+
+static __always_inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_KILLABLE);
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ int ret = 0;
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+
+ preempt_disable();
+ tmp = atomic_long_read(&sem->count);
+ while (!(tmp & RWSEM_READ_FAILED_MASK)) {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ tmp + RWSEM_READER_BIAS)) {
+ rwsem_set_reader_owned(sem);
+ ret = 1;
+ break;
+ }
+ }
+ preempt_enable();
+ return ret;
+}
+
+/*
+ * lock for writing
+ */
+static __always_inline int __down_write_common(struct rw_semaphore *sem, int state)
+{
+ int ret = 0;
+
+ preempt_disable();
+ if (unlikely(!rwsem_write_trylock(sem))) {
+ if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
+ ret = -EINTR;
+ }
+ preempt_enable();
+ return ret;
+}
+
+static __always_inline void __down_write(struct rw_semaphore *sem)
+{
+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+static __always_inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ return __down_write_common(sem, TASK_KILLABLE);
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ int ret;
+
+ preempt_disable();
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
+
+ return ret;
+}
+
+/*
+ * unlock after reading
+ */
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+
+ preempt_disable();
+ rwsem_clear_reader_owned(sem);
+ tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
+ DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
+ if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
+ RWSEM_FLAG_WAITERS)) {
+ clear_nonspinnable(sem);
+ rwsem_wake(sem);
+ }
+ preempt_enable();
+}
+
+/*
+ * unlock after writing
+ */
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ /*
+ * sem->owner may differ from current if the ownership is transferred
+ * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
+ */
+ DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
+ !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+
+ preempt_disable();
+ rwsem_clear_owner(sem);
+ tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
+ if (unlikely(tmp & RWSEM_FLAG_WAITERS))
+ rwsem_wake(sem);
+ preempt_enable();
+}
+
+/*
+ * downgrade write lock to read lock
+ */
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ /*
+ * When downgrading from exclusive to shared ownership,
+ * anything inside the write-locked region cannot leak
+ * into the read side. In contrast, anything in the
+ * read-locked region is ok to be re-ordered into the
+ * write side. As such, rely on RELEASE semantics.
+ */
+ DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
+ tmp = atomic_long_fetch_add_release(
+ -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
+ rwsem_set_reader_owned(sem);
+ if (tmp & RWSEM_FLAG_WAITERS)
+ rwsem_downgrade_wake(sem);
+ preempt_enable();
+}
+
+#else /* !CONFIG_PREEMPT_RT */
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+#define rwbase_set_and_save_current_state(state) \
+ set_current_state(state)
+
+#define rwbase_restore_current_state() \
+ __set_current_state(TASK_RUNNING)
+
+#define rwbase_rtmutex_lock_state(rtm, state) \
+ __rt_mutex_lock(rtm, state)
+
+#define rwbase_rtmutex_slowlock_locked(rtm, state, wq) \
+ __rt_mutex_slowlock_locked(rtm, NULL, state, wq)
+
+#define rwbase_rtmutex_unlock(rtm) \
+ __rt_mutex_unlock(rtm)
+
+#define rwbase_rtmutex_trylock(rtm) \
+ __rt_mutex_trylock(rtm)
+
+#define rwbase_signal_pending_state(state, current) \
+ signal_pending_state(state, current)
+
+#define rwbase_pre_schedule() \
+ rt_mutex_pre_schedule()
+
+#define rwbase_schedule() \
+ rt_mutex_schedule()
+
+#define rwbase_post_schedule() \
+ rt_mutex_post_schedule()
+
+#include "rwbase_rt.c"
+
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+ init_rwbase_rt(&(sem)->rwbase);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_read_trylock(&sem->rwbase);
+}
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
+}
+
+static inline void __sched __down_write(struct rw_semaphore *sem)
+{
+ rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_write_trylock(&sem->rwbase);
+}
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ rwbase_write_unlock(&sem->rwbase);
+}
+
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ rwbase_write_downgrade(&sem->rwbase);
+}
+
+/* Debug stubs for the common API */
+#define DEBUG_RWSEMS_WARN_ON(c, sem)
+
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+}
+
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+ int count = atomic_read(&sem->rwbase.readers);
+
+ return count < 0 && count != READER_BIAS;
+}
+
+#endif /* CONFIG_PREEMPT_RT */
/*
* lock for reading
@@ -23,11 +1537,37 @@ void __sched down_read(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
- rwsem_set_reader_owned(sem);
}
-
EXPORT_SYMBOL(down_read);
+int __sched down_read_interruptible(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_interruptible);
+
+int __sched down_read_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_killable);
+
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
@@ -35,13 +1575,10 @@ int down_read_trylock(struct rw_semaphore *sem)
{
int ret = __down_read_trylock(sem);
- if (ret == 1) {
+ if (ret == 1)
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
- rwsem_set_reader_owned(sem);
- }
return ret;
}
-
EXPORT_SYMBOL(down_read_trylock);
/*
@@ -51,11 +1588,8 @@ void __sched down_write(struct rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
-
EXPORT_SYMBOL(down_write);
/*
@@ -66,15 +1600,14 @@ int __sched down_write_killable(struct rw_semaphore *sem)
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
- if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
+ __down_write_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
return -EINTR;
}
- rwsem_set_owner(sem);
return 0;
}
-
EXPORT_SYMBOL(down_write_killable);
/*
@@ -84,14 +1617,11 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);
- if (ret == 1) {
+ if (ret == 1)
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
- rwsem_set_owner(sem);
- }
return ret;
}
-
EXPORT_SYMBOL(down_write_trylock);
/*
@@ -99,11 +1629,9 @@ EXPORT_SYMBOL(down_write_trylock);
*/
void up_read(struct rw_semaphore *sem)
{
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
-
+ rwsem_release(&sem->dep_map, _RET_IP_);
__up_read(sem);
}
-
EXPORT_SYMBOL(up_read);
/*
@@ -111,12 +1639,9 @@ EXPORT_SYMBOL(up_read);
*/
void up_write(struct rw_semaphore *sem)
{
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
-
- rwsem_clear_owner(sem);
+ rwsem_release(&sem->dep_map, _RET_IP_);
__up_write(sem);
}
-
EXPORT_SYMBOL(up_write);
/*
@@ -125,11 +1650,8 @@ EXPORT_SYMBOL(up_write);
void downgrade_write(struct rw_semaphore *sem)
{
lock_downgrade(&sem->dep_map, _RET_IP_);
-
- rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
-
EXPORT_SYMBOL(downgrade_write);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -138,42 +1660,52 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
- rwsem_set_reader_owned(sem);
}
-
EXPORT_SYMBOL(down_read_nested);
+int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_killable_nested);
+
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
{
might_sleep();
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
-
EXPORT_SYMBOL(_down_write_nest_lock);
void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
-
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
+ __rwsem_set_reader_owned(sem, NULL);
}
-
EXPORT_SYMBOL(down_read_non_owner);
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
-
EXPORT_SYMBOL(down_write_nested);
int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
@@ -181,24 +1713,21 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
- if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
+ __down_write_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
return -EINTR;
}
- rwsem_set_owner(sem);
return 0;
}
-
EXPORT_SYMBOL(down_write_killable_nested);
void up_read_non_owner(struct rw_semaphore *sem)
{
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
__up_read(sem);
}
-
EXPORT_SYMBOL(up_read_non_owner);
#endif
-
-
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
deleted file mode 100644
index a699f4048ba1..000000000000
--- a/kernel/locking/rwsem.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * The owner field of the rw_semaphore structure will be set to
- * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
- * the owner field when it unlocks. A reader, on the other hand, will
- * not touch the owner field when it unlocks.
- *
- * In essence, the owner field now has the following 3 states:
- * 1) 0
- * - lock is free or the owner hasn't set the field yet
- * 2) RWSEM_READER_OWNED
- * - lock is currently or previously owned by readers (lock is free
- * or not set by owner yet)
- * 3) Other non-zero value
- * - a writer owns the lock
- */
-#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
-
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-/*
- * All writes to owner are protected by WRITE_ONCE() to make sure that
- * store tearing can't happen as optimistic spinners may read and use
- * the owner value concurrently without lock. Read from owner, however,
- * may not need READ_ONCE() as long as the pointer value is only used
- * for comparison and isn't being dereferenced.
- */
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
- WRITE_ONCE(sem->owner, current);
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
- WRITE_ONCE(sem->owner, NULL);
-}
-
-static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
-{
- /*
- * We check the owner value first to make sure that we will only
- * do a write to the rwsem cacheline when it is really necessary
- * to minimize cacheline contention.
- */
- if (sem->owner != RWSEM_READER_OWNED)
- WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
-}
-
-static inline bool rwsem_owner_is_writer(struct task_struct *owner)
-{
- return owner && owner != RWSEM_READER_OWNED;
-}
-
-static inline bool rwsem_owner_is_reader(struct task_struct *owner)
-{
- return owner == RWSEM_READER_OWNED;
-}
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
-{
-}
-#endif
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 561acdd39960..3ef032e22f7e 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -1,9 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008 Intel Corporation
* Author: Matthew Wilcox <willy@linux.intel.com>
*
- * Distributed under the terms of the GNU GPL, version 2
- *
* This file implements counting semaphores.
* A counting semaphore may be acquired 'n' times before sleeping.
* See mutex.c for single-acquisition sleeping locks which enforce
@@ -30,15 +29,53 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
+#include <trace/events/lock.h>
+#include <linux/hung_task.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+ WRITE_ONCE((sem)->last_holder, (unsigned long)current);
+}
+
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+ if (READ_ONCE((sem)->last_holder) == (unsigned long)current)
+ WRITE_ONCE((sem)->last_holder, 0UL);
+}
+
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return READ_ONCE(sem->last_holder);
+}
+#else
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+}
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+}
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return 0UL;
+}
+#endif
+
+static inline void __sem_acquire(struct semaphore *sem)
+{
+ sem->count--;
+ hung_task_sem_set_holder(sem);
+}
/**
* down - acquire the semaphore
@@ -51,13 +88,14 @@ static noinline void __up(struct semaphore *sem);
* Use of this function is deprecated, please use down_interruptible() or
* down_killable() instead.
*/
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
{
unsigned long flags;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
__down(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -73,14 +111,15 @@ EXPORT_SYMBOL(down);
* If the sleep is interrupted by a signal, this function will return -EINTR.
* If the semaphore is successfully acquired, this function returns 0.
*/
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_interruptible(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -99,14 +138,15 @@ EXPORT_SYMBOL(down_interruptible);
* -EINTR. If the semaphore is successfully acquired, this function returns
* 0.
*/
-int down_killable(struct semaphore *sem)
+int __sched down_killable(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_killable(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -120,7 +160,7 @@ EXPORT_SYMBOL(down_killable);
* @sem: the semaphore to be acquired
*
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
- * been acquired successfully or 1 if it it cannot be acquired.
+ * been acquired successfully or 1 if it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
* mutex_trylock! Be careful about this when converting code.
@@ -128,7 +168,7 @@ EXPORT_SYMBOL(down_killable);
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
-int down_trylock(struct semaphore *sem)
+int __sched down_trylock(struct semaphore *sem)
{
unsigned long flags;
int count;
@@ -136,7 +176,7 @@ int down_trylock(struct semaphore *sem)
raw_spin_lock_irqsave(&sem->lock, flags);
count = sem->count - 1;
if (likely(count >= 0))
- sem->count = count;
+ __sem_acquire(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return (count < 0);
@@ -153,14 +193,15 @@ EXPORT_SYMBOL(down_trylock);
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long timeout)
+int __sched down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -176,16 +217,22 @@ EXPORT_SYMBOL(down_timeout);
* Release the semaphore. Unlike mutexes, up() may be called from any
* context and even by tasks which have never called down().
*/
-void up(struct semaphore *sem)
+void __sched up(struct semaphore *sem)
{
unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
+
+ hung_task_sem_clear_if_holder(sem);
+
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
- __up(sem);
+ __up(sem, &wake_q);
raw_spin_unlock_irqrestore(&sem->lock, flags);
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
}
EXPORT_SYMBOL(up);
@@ -202,7 +249,7 @@ struct semaphore_waiter {
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
@@ -220,8 +267,10 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
- if (waiter.up)
+ if (waiter.up) {
+ hung_task_sem_set_holder(sem);
return 0;
+ }
}
timed_out:
@@ -233,6 +282,22 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
return -EINTR;
}
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ int ret;
+
+ hung_task_set_blocker(sem, BLOCKER_TYPE_SEM);
+
+ trace_contention_begin(sem, 0);
+ ret = ___down_common(sem, state, timeout);
+ trace_contention_end(sem, ret);
+
+ hung_task_clear_blocker();
+
+ return ret;
+}
+
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
@@ -253,11 +318,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+ struct wake_q_head *wake_q)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..7685defd7c52 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (2004) Linus Torvalds
*
@@ -21,6 +22,13 @@
#include <linux/debug_locks.h>
#include <linux/export.h>
+#ifdef CONFIG_MMIOWB
+#ifndef arch_mmiowb_state
+DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state);
+EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
+#endif
+#endif
+
/*
* If lockdep is enabled then we use the non-preemption spin-ops
* even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
@@ -29,11 +37,10 @@
#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
/*
* The __lock_function inlines are taken from
- * include/linux/spinlock_api_smp.h
+ * spinlock : include/linux/spinlock_api_smp.h
+ * rwlock : include/linux/rwlock_api_smp.h
*/
#else
-#define raw_read_can_lock(l) read_can_lock(l)
-#define raw_write_can_lock(l) write_can_lock(l)
/*
* Some architectures can relax in favour of the CPU owning the lock.
@@ -51,14 +58,14 @@
/*
* We build the __lock_function inlines here. They are too large for
* inlining all over the place, but here is only one user per function
- * which embedds them into the calling _lock_function below.
+ * which embeds them into the calling _lock_function below.
*
* This could be a long-held lock. We both prepare to spin for a long
- * time (making _this_ CPU preemptable if possible), and we also signal
+ * time (making _this_ CPU preemptible if possible), and we also signal
* towards that other CPU that it should break the lock ASAP.
*/
#define BUILD_LOCK_OPS(op, locktype) \
-void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
+static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
{ \
for (;;) { \
preempt_disable(); \
@@ -66,15 +73,11 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
break; \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
} \
\
-unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
+static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
{ \
unsigned long flags; \
\
@@ -86,21 +89,18 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
local_irq_restore(flags); \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
+ \
return flags; \
} \
\
-void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
+static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
{ \
_raw_##op##_lock_irqsave(lock); \
} \
\
-void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
+static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
{ \
unsigned long flags; \
\
@@ -124,13 +124,16 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, raw_spinlock);
+
+#ifndef CONFIG_PREEMPT_RT
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
+#endif
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK
-int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
{
return __raw_spin_trylock(lock);
}
@@ -138,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock);
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
-int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
{
return __raw_spin_trylock_bh(lock);
}
@@ -146,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK
-void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
{
__raw_spin_lock(lock);
}
@@ -154,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
{
return __raw_spin_lock_irqsave(lock);
}
@@ -162,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
-void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
{
__raw_spin_lock_irq(lock);
}
@@ -170,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_BH
-void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
{
__raw_spin_lock_bh(lock);
}
@@ -178,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh);
#endif
#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
-void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
{
__raw_spin_unlock(lock);
}
@@ -186,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
-void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_unlock_irqrestore(lock, flags);
}
@@ -194,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
-void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
{
__raw_spin_unlock_irq(lock);
}
@@ -202,15 +205,17 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
-void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
{
__raw_spin_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_spin_unlock_bh);
#endif
+#ifndef CONFIG_PREEMPT_RT
+
#ifndef CONFIG_INLINE_READ_TRYLOCK
-int __lockfunc _raw_read_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
return __raw_read_trylock(lock);
}
@@ -218,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK
-void __lockfunc _raw_read_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock(rwlock_t *lock)
{
__raw_read_lock(lock);
}
@@ -226,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
{
return __raw_read_lock_irqsave(lock);
}
@@ -234,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQ
-void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
{
__raw_read_lock_irq(lock);
}
@@ -242,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_BH
-void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
{
__raw_read_lock_bh(lock);
}
@@ -250,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK
-void __lockfunc _raw_read_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock(rwlock_t *lock)
{
__raw_read_unlock(lock);
}
@@ -258,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
-void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_read_unlock_irqrestore(lock, flags);
}
@@ -266,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
-void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
{
__raw_read_unlock_irq(lock);
}
@@ -274,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_BH
-void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
{
__raw_read_unlock_bh(lock);
}
@@ -282,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_TRYLOCK
-int __lockfunc _raw_write_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_write_trylock(rwlock_t *lock)
{
return __raw_write_trylock(lock);
}
@@ -290,15 +295,25 @@ EXPORT_SYMBOL(_raw_write_trylock);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK
-void __lockfunc _raw_write_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock(rwlock_t *lock)
{
__raw_write_lock(lock);
}
EXPORT_SYMBOL(_raw_write_lock);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock)))
+#endif
+
+void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass)
+{
+ __raw_write_lock_nested(lock, subclass);
+}
+EXPORT_SYMBOL(_raw_write_lock_nested);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
{
return __raw_write_lock_irqsave(lock);
}
@@ -306,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
-void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
{
__raw_write_lock_irq(lock);
}
@@ -314,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_BH
-void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
{
__raw_write_lock_bh(lock);
}
@@ -322,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK
-void __lockfunc _raw_write_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock(rwlock_t *lock)
{
__raw_write_unlock(lock);
}
@@ -330,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
-void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_write_unlock_irqrestore(lock, flags);
}
@@ -338,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
-void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
{
__raw_write_unlock_irq(lock);
}
@@ -346,13 +361,15 @@ EXPORT_SYMBOL(_raw_write_unlock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
-void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
{
__raw_write_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_write_unlock_bh);
#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
@@ -371,8 +388,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
local_irq_save(flags);
preempt_disable();
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
- do_raw_spin_lock_flags, &flags);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
return flags;
}
EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
@@ -397,3 +413,11 @@ notrace int in_lock_functions(unsigned long addr)
&& addr < (unsigned long)__lock_text_end;
}
EXPORT_SYMBOL(in_lock_functions);
+
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_PREEMPT_RT)
+void notrace lockdep_assert_in_softirq_func(void)
+{
+ lockdep_assert_in_softirq();
+}
+EXPORT_SYMBOL(lockdep_assert_in_softirq_func);
+#endif
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 9aa0fccd5d43..2338b3adfb55 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,16 +12,17 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/pid.h>
void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
- struct lock_class_key *key)
+ struct lock_class_key *key, short inner)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, inner);
#endif
lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
lock->magic = SPINLOCK_MAGIC;
@@ -31,6 +32,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__raw_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -39,7 +41,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG);
#endif
lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
lock->magic = RWLOCK_MAGIC;
@@ -48,22 +50,23 @@ void __rwlock_init(rwlock_t *lock, const char *name,
}
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
- struct task_struct *owner = NULL;
+ struct task_struct *owner = READ_ONCE(lock->owner);
- if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
- owner = lock->owner;
+ if (owner == SPINLOCK_OWNER_INIT)
+ owner = NULL;
printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
msg, raw_smp_processor_id(),
current->comm, task_pid_nr(current));
printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
".owner_cpu: %d\n",
- lock, lock->magic,
+ lock, READ_ONCE(lock->magic),
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
- lock->owner_cpu);
+ READ_ONCE(lock->owner_cpu));
dump_stack();
}
@@ -80,16 +83,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg)
static inline void
debug_spin_lock_before(raw_spinlock_t *lock)
{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(lock->owner == current, lock, "recursion");
- SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_spin_lock_after(raw_spinlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_spin_unlock(raw_spinlock_t *lock)
@@ -99,8 +102,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
/*
@@ -111,6 +114,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock)
{
debug_spin_lock_before(lock);
arch_spin_lock(&lock->raw_lock);
+ mmiowb_spin_lock();
debug_spin_lock_after(lock);
}
@@ -118,8 +122,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
{
int ret = arch_spin_trylock(&lock->raw_lock);
- if (ret)
+ if (ret) {
+ mmiowb_spin_lock();
debug_spin_lock_after(lock);
+ }
#ifndef CONFIG_SMP
/*
* Must not happen on UP:
@@ -131,10 +137,12 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
void do_raw_spin_unlock(raw_spinlock_t *lock)
{
+ mmiowb_spin_unlock();
debug_spin_unlock(lock);
arch_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -176,15 +184,15 @@ void do_raw_read_unlock(rwlock_t *lock)
static inline void debug_write_lock_before(rwlock_t *lock)
{
RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
- RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
- RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_write_lock_after(rwlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_write_unlock(rwlock_t *lock)
@@ -193,8 +201,8 @@ static inline void debug_write_unlock(rwlock_t *lock)
RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
void do_raw_write_lock(rwlock_t *lock)
@@ -224,3 +232,5 @@ void do_raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
arch_write_unlock(&lock->raw_lock);
}
+
+#endif /* !CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
new file mode 100644
index 000000000000..db1e11b45de6
--- /dev/null
+++ b/kernel/locking/spinlock_rt.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PREEMPT_RT substitution for spin/rw_locks
+ *
+ * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to
+ * resemble the non RT semantics:
+ *
+ * - Contrary to plain rtmutexes, spinlocks and rwlocks are state
+ * preserving. The task state is saved before blocking on the underlying
+ * rtmutex, and restored when the lock has been acquired. Regular wakeups
+ * during that time are redirected to the saved state so no wake up is
+ * missed.
+ *
+ * - Non RT spin/rwlocks disable preemption and eventually interrupts.
+ * Disabling preemption has the side effect of disabling migration and
+ * preventing RCU grace periods.
+ *
+ * The RT substitutions explicitly disable migration and take
+ * rcu_read_lock() across the lock held section.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_SPINLOCKS
+#include "rtmutex.c"
+
+/*
+ * __might_resched() skips the state check as rtlocks are state
+ * preserving. Take RCU nesting into account as spin/read/write_lock() can
+ * legitimately nest into an RCU read side critical section.
+ */
+#define RTLOCK_RESCHED_OFFSETS \
+ (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT)
+
+#define rtlock_might_resched() \
+ __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS)
+
+static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
+{
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+}
+
+static __always_inline void __rt_spin_lock(spinlock_t *lock)
+{
+ rtlock_might_resched();
+ rtlock_lock(&lock->lock);
+ rcu_read_lock();
+ migrate_disable();
+}
+
+void __sched rt_spin_lock(spinlock_t *lock) __acquires(RCU)
+{
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nest_lock);
+#endif
+
+void __sched rt_spin_unlock(spinlock_t *lock) __releases(RCU)
+{
+ spin_release(&lock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+
+ if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL)))
+ rt_mutex_slowunlock(&lock->lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __sched rt_spin_lock_unlock(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_unlock);
+
+static __always_inline int __rt_spin_trylock(spinlock_t *lock)
+{
+ int ret = 1;
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current)))
+ ret = rt_mutex_slowtrylock(&lock->lock);
+
+ if (ret) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+
+int __sched rt_spin_trylock(spinlock_t *lock)
+{
+ return __rt_spin_trylock(lock);
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __sched rt_spin_trylock_bh(spinlock_t *lock)
+{
+ int ret;
+
+ local_bh_disable();
+ ret = __rt_spin_trylock(lock);
+ if (!ret)
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_spin_lock_init(spinlock_t *lock, const char *name,
+ struct lock_class_key *key, bool percpu)
+{
+ u8 type = percpu ? LD_LOCK_PERCPU : LD_LOCK_NORMAL;
+
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG,
+ LD_WAIT_INV, type);
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+#endif
+
+/*
+ * RT-specific reader/writer locks
+ */
+#define rwbase_set_and_save_current_state(state) \
+ current_save_and_set_rtlock_wait_state()
+
+#define rwbase_restore_current_state() \
+ current_restore_rtlock_saved_state()
+
+static __always_inline int
+rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+ return 0;
+}
+
+static __always_inline int
+rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state,
+ struct wake_q_head *wake_q)
+{
+ rtlock_slowlock_locked(rtm, wake_q);
+ return 0;
+}
+
+static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(rtm);
+}
+
+static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(rtm);
+}
+
+#define rwbase_signal_pending_state(state, current) (0)
+
+#define rwbase_pre_schedule()
+
+#define rwbase_schedule() \
+ schedule_rtlock()
+
+#define rwbase_post_schedule()
+
+#include "rwbase_rt.c"
+/*
+ * The common functions which get wrapped into the rwlock API.
+ */
+int __sched rt_read_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_read_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+int __sched rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_write_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+void __sched rt_read_lock(rwlock_t *rwlock) __acquires(RCU)
+{
+ rtlock_might_resched();
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_read_lock);
+
+void __sched rt_write_lock(rwlock_t *rwlock) __acquires(RCU)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) __acquires(RCU)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock_nested);
+#endif
+
+void __sched rt_read_unlock(rwlock_t *rwlock) __releases(RCU)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+ rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+void __sched rt_write_unlock(rwlock_t *rwlock) __releases(RCU)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ rcu_read_unlock();
+ migrate_enable();
+ rwbase_write_unlock(&rwlock->rwbase);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_rwlock_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+ lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+#endif
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 39f56c870051..bcb1b9fea588 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Module-based API test facility for ww_mutexes
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
*/
#include <linux/kernel.h>
@@ -22,13 +9,22 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/module.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/slab.h>
#include <linux/ww_mutex.h>
-static DEFINE_WW_CLASS(ww_class);
+static DEFINE_WD_CLASS(ww_class);
struct workqueue_struct *wq;
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+#define ww_acquire_init_noinject(a, b) do { \
+ ww_acquire_init((a), (b)); \
+ (a)->deadlock_inject_countdown = ~0U; \
+ } while (0)
+#else
+#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b))
+#endif
+
struct test_mutex {
struct work_struct work;
struct ww_mutex mutex;
@@ -49,7 +45,7 @@ static void test_mutex_work(struct work_struct *work)
wait_for_completion(&mtx->go);
if (mtx->flags & TEST_MTX_TRY) {
- while (!ww_mutex_trylock(&mtx->mutex))
+ while (!ww_mutex_trylock(&mtx->mutex, NULL))
cond_resched();
} else {
ww_mutex_lock(&mtx->mutex, NULL);
@@ -66,7 +62,8 @@ static int __test_mutex(unsigned int flags)
int ret;
ww_mutex_init(&mtx.mutex, &ww_class);
- ww_acquire_init(&ctx, &ww_class);
+ if (flags & TEST_MTX_CTX)
+ ww_acquire_init(&ctx, &ww_class);
INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
init_completion(&mtx.ready);
@@ -94,7 +91,8 @@ static int __test_mutex(unsigned int flags)
ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
}
ww_mutex_unlock(&mtx.mutex);
- ww_acquire_fini(&ctx);
+ if (flags & TEST_MTX_CTX)
+ ww_acquire_fini(&ctx);
if (ret) {
pr_err("%s(flags=%x): mutual exclusion failure\n",
@@ -122,19 +120,39 @@ static int test_mutex(void)
return 0;
}
-static int test_aa(void)
+static int test_aa(bool trylock)
{
struct ww_mutex mutex;
struct ww_acquire_ctx ctx;
int ret;
+ const char *from = trylock ? "trylock" : "lock";
ww_mutex_init(&mutex, &ww_class);
ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&mutex, &ctx);
+ if (!trylock) {
+ ret = ww_mutex_lock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial lock failed!\n", __func__);
+ goto out;
+ }
+ } else {
+ ret = !ww_mutex_trylock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial trylock failed!\n", __func__);
+ goto out;
+ }
+ }
- if (ww_mutex_trylock(&mutex)) {
- pr_err("%s: trylocked itself!\n", __func__);
+ if (ww_mutex_trylock(&mutex, NULL)) {
+ pr_err("%s: trylocked itself without context from %s!\n", __func__, from);
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ww_mutex_trylock(&mutex, &ctx)) {
+ pr_err("%s: trylocked itself with context from %s!\n", __func__, from);
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
@@ -142,17 +160,17 @@ static int test_aa(void)
ret = ww_mutex_lock(&mutex, &ctx);
if (ret != -EALREADY) {
- pr_err("%s: missed deadlock for recursing, ret=%d\n",
- __func__, ret);
+ pr_err("%s: missed deadlock for recursing, ret=%d from %s\n",
+ __func__, ret, from);
if (!ret)
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
}
+ ww_mutex_unlock(&mutex);
ret = 0;
out:
- ww_mutex_unlock(&mutex);
ww_acquire_fini(&ctx);
return ret;
}
@@ -163,7 +181,7 @@ struct test_abba {
struct ww_mutex b_mutex;
struct completion a_ready;
struct completion b_ready;
- bool resolve;
+ bool resolve, trylock;
int result;
};
@@ -173,8 +191,13 @@ static void test_abba_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err;
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba->b_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!abba->trylock)
+ ww_mutex_lock(&abba->b_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx);
complete(&abba->b_ready);
wait_for_completion(&abba->a_ready);
@@ -194,7 +217,7 @@ static void test_abba_work(struct work_struct *work)
abba->result = err;
}
-static int test_abba(bool resolve)
+static int test_abba(bool trylock, bool resolve)
{
struct test_abba abba;
struct ww_acquire_ctx ctx;
@@ -205,12 +228,18 @@ static int test_abba(bool resolve)
INIT_WORK_ONSTACK(&abba.work, test_abba_work);
init_completion(&abba.a_ready);
init_completion(&abba.b_ready);
+ abba.trylock = trylock;
abba.resolve = resolve;
schedule_work(&abba.work);
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba.a_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!trylock)
+ ww_mutex_lock(&abba.a_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx);
complete(&abba.a_ready);
wait_for_completion(&abba.b_ready);
@@ -260,9 +289,9 @@ static void test_cycle_work(struct work_struct *work)
{
struct test_cycle *cycle = container_of(work, typeof(*cycle), work);
struct ww_acquire_ctx ctx;
- int err;
+ int err, erra = 0;
- ww_acquire_init(&ctx, &ww_class);
+ ww_acquire_init_noinject(&ctx, &ww_class);
ww_mutex_lock(&cycle->a_mutex, &ctx);
complete(cycle->a_signal);
@@ -270,17 +299,19 @@ static void test_cycle_work(struct work_struct *work)
err = ww_mutex_lock(cycle->b_mutex, &ctx);
if (err == -EDEADLK) {
+ err = 0;
ww_mutex_unlock(&cycle->a_mutex);
ww_mutex_lock_slow(cycle->b_mutex, &ctx);
- err = ww_mutex_lock(&cycle->a_mutex, &ctx);
+ erra = ww_mutex_lock(&cycle->a_mutex, &ctx);
}
if (!err)
ww_mutex_unlock(cycle->b_mutex);
- ww_mutex_unlock(&cycle->a_mutex);
+ if (!erra)
+ ww_mutex_unlock(&cycle->a_mutex);
ww_acquire_fini(&ctx);
- cycle->result = err;
+ cycle->result = err ?: erra;
}
static int __test_cycle(unsigned int nthreads)
@@ -324,7 +355,7 @@ static int __test_cycle(unsigned int nthreads)
if (!cycle->result)
continue;
- pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n",
+ pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n",
n, nthreads, cycle->result);
ret = -EINVAL;
break;
@@ -357,12 +388,25 @@ struct stress {
int nlocks;
};
+struct rnd_state rng;
+DEFINE_SPINLOCK(rng_lock);
+
+static inline u32 prandom_u32_below(u32 ceil)
+{
+ u32 ret;
+
+ spin_lock(&rng_lock);
+ ret = prandom_u32_state(&rng) % ceil;
+ spin_unlock(&rng_lock);
+ return ret;
+}
+
static int *get_random_order(int count)
{
int *order;
- int n, r, tmp;
+ int n, r;
- order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY);
+ order = kmalloc_array(count, sizeof(*order), GFP_KERNEL);
if (!order)
return order;
@@ -370,12 +414,9 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = get_random_int() % (n + 1);
- if (r != n) {
- tmp = order[n];
- order[n] = order[r];
- order[r] = tmp;
- }
+ r = prandom_u32_below(n + 1);
+ if (r != n)
+ swap(order[n], order[r]);
}
return order;
@@ -423,21 +464,21 @@ retry:
ww_mutex_unlock(&locks[order[n]]);
if (err == -EDEADLK) {
- ww_mutex_lock_slow(&locks[order[contended]], &ctx);
- goto retry;
+ if (!time_after(jiffies, stress->timeout)) {
+ ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+ goto retry;
+ }
}
+ ww_acquire_fini(&ctx);
if (err) {
pr_err_once("stress (%s) failed with %d\n",
__func__, err);
break;
}
-
- ww_acquire_fini(&ctx);
} while (!time_after(jiffies, stress->timeout));
kfree(order);
- kfree(stress);
}
struct reorder_lock {
@@ -502,14 +543,13 @@ out:
list_for_each_entry_safe(ll, ln, &locks, link)
kfree(ll);
kfree(order);
- kfree(stress);
}
static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks);
int err;
do {
@@ -523,8 +563,6 @@ static void stress_one_work(struct work_struct *work)
break;
}
} while (!time_after(jiffies, stress->timeout));
-
- kfree(stress);
}
#define STRESS_INORDER BIT(0)
@@ -535,15 +573,24 @@ static void stress_one_work(struct work_struct *work)
static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
- int n;
+ struct stress *stress_array;
+ int n, count;
locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
if (!locks)
return -ENOMEM;
+ stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+ GFP_KERNEL);
+ if (!stress_array) {
+ kfree(locks);
+ return -ENOMEM;
+ }
+
for (n = 0; n < nlocks; n++)
ww_mutex_init(&locks[n], &ww_class);
+ count = 0;
for (n = 0; nthreads; n++) {
struct stress *stress;
void (*fn)(struct work_struct *work);
@@ -567,9 +614,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
if (!fn)
continue;
- stress = kmalloc(sizeof(*stress), GFP_KERNEL);
- if (!stress)
- break;
+ stress = &stress_array[count++];
INIT_WORK(&stress->work, fn);
stress->locks = locks;
@@ -584,6 +629,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
for (n = 0; n < nlocks; n++)
ww_mutex_destroy(&locks[n]);
+ kfree(stress_array);
kfree(locks);
return 0;
@@ -592,7 +638,11 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
static int __init test_ww_mutex_init(void)
{
int ncpus = num_online_cpus();
- int ret;
+ int ret, i;
+
+ printk(KERN_INFO "Beginning ww mutex selftests\n");
+
+ prandom_seed_state(&rng, get_random_u64());
wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
if (!wq)
@@ -602,17 +652,19 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = test_aa();
+ ret = test_aa(false);
if (ret)
return ret;
- ret = test_abba(false);
+ ret = test_aa(true);
if (ret)
return ret;
- ret = test_abba(true);
- if (ret)
- return ret;
+ for (i = 0; i < 4; i++) {
+ ret = test_abba(i & 1, i & 2);
+ if (ret)
+ return ret;
+ }
ret = test_cycle(ncpus);
if (ret)
@@ -626,10 +678,11 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
+ printk(KERN_INFO "All ww mutex selftests passed\n");
return 0;
}
@@ -643,3 +696,4 @@ module_exit(test_ww_mutex_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("API test facility for ww_mutexes");
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
new file mode 100644
index 000000000000..31a785afee6c
--- /dev/null
+++ b/kernel/locking/ww_mutex.h
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef WW_RT
+
+#define MUTEX mutex
+#define MUTEX_WAITER mutex_waiter
+
+static inline struct mutex_waiter *
+__ww_waiter_first(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_first_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_next_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_prev_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_last(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_last_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline void
+__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos)
+{
+ struct list_head *p = &lock->wait_list;
+ if (pos)
+ p = &pos->list;
+ __mutex_add_waiter(lock, waiter, p);
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct mutex *lock)
+{
+ return __mutex_owner(lock);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct mutex *lock)
+{
+ return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS;
+}
+
+static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags)
+{
+ raw_spin_lock_irqsave(&lock->wait_lock, *flags);
+}
+
+static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags)
+{
+ raw_spin_unlock_irqrestore(&lock->wait_lock, *flags);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
+{
+ lockdep_assert_held(&lock->wait_lock);
+}
+
+#else /* WW_RT */
+
+#define MUTEX rt_mutex
+#define MUTEX_WAITER rt_mutex_waiter
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_first(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_next(&w->tree.entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_prev(&w->tree.entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_last(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline void
+__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos)
+{
+ /* RT unconditionally adds the waiter first and then removes it on error */
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct rt_mutex *lock)
+{
+ return rt_mutex_owner(&lock->rtmutex);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return rt_mutex_has_waiters(&lock->rtmutex);
+}
+
+static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
+{
+ raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags);
+}
+
+static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
+{
+ raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock)
+{
+ lockdep_assert_held(&lock->rtmutex.wait_lock);
+}
+
+#endif /* WW_RT */
+
+/*
+ * Wait-Die:
+ * The newer transactions are killed when:
+ * It (the new transaction) makes a request for a lock being held
+ * by an older transaction.
+ *
+ * Wound-Wait:
+ * The newer transactions are wounded when:
+ * An older transaction makes a request for a lock being held by
+ * the newer transaction.
+ */
+
+/*
+ * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
+ * it.
+ */
+static __always_inline void
+ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef DEBUG_WW_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+ ww->ctx = ww_ctx;
+}
+
+/*
+ * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task
+ * or, when of equal priority, a younger transaction than @b.
+ *
+ * Depending on the algorithm, @a will either need to wait for @b, or die.
+ */
+static inline bool
+__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
+{
+/*
+ * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI,
+ * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and
+ * isn't affected by this.
+ */
+#ifdef WW_RT
+ /* kernel prio; less is more */
+ int a_prio = a->task->prio;
+ int b_prio = b->task->prio;
+
+ if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) {
+
+ if (a_prio > b_prio)
+ return true;
+
+ if (a_prio < b_prio)
+ return false;
+
+ /* equal static prio */
+
+ if (dl_prio(a_prio)) {
+ if (dl_time_before(b->task->dl.deadline,
+ a->task->dl.deadline))
+ return true;
+
+ if (dl_time_before(a->task->dl.deadline,
+ b->task->dl.deadline))
+ return false;
+ }
+
+ /* equal prio */
+ }
+#endif
+
+ /* FIFO order tie break -- bigger is younger */
+ return (signed long)(a->stamp - b->stamp) > 0;
+}
+
+/*
+ * Wait-Die; wake a lesser waiter context (when locks held) such that it can
+ * die.
+ *
+ * Among waiters with context, only the first one can have other locks acquired
+ * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
+ * __ww_mutex_check_kill() wake any but the earliest context.
+ */
+static bool
+__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q)
+{
+ if (!ww_ctx->is_wait_die)
+ return false;
+
+ if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) {
+#ifndef WW_RT
+ debug_mutex_wake_waiter(lock, waiter);
+#endif
+ /*
+ * When waking up the task to die, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ */
+ __clear_task_blocked_on(waiter->task, lock);
+ wake_q_add(wake_q, waiter->task);
+ }
+
+ return true;
+}
+
+/*
+ * Wound-Wait; wound a lesser @hold_ctx if it holds the lock.
+ *
+ * Wound the lock holder if there are waiters with more important transactions
+ * than the lock holders. Even if multiple waiters may wound the lock holder,
+ * it's sufficient that only one does.
+ */
+static bool __ww_mutex_wound(struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct ww_acquire_ctx *hold_ctx,
+ struct wake_q_head *wake_q)
+{
+ struct task_struct *owner = __ww_mutex_owner(lock);
+
+ lockdep_assert_wait_lock_held(lock);
+
+ /*
+ * Possible through __ww_mutex_add_waiter() when we race with
+ * ww_mutex_set_context_fastpath(). In that case we'll get here again
+ * through __ww_mutex_check_waiters().
+ */
+ if (!hold_ctx)
+ return false;
+
+ /*
+ * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
+ * it cannot go away because we'll have FLAG_WAITERS set and hold
+ * wait_lock.
+ */
+ if (!owner)
+ return false;
+
+ if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) {
+ hold_ctx->wounded = 1;
+
+ /*
+ * wake_up_process() paired with set_current_state()
+ * inserts sufficient barriers to make sure @owner either sees
+ * it's wounded in __ww_mutex_check_kill() or has a
+ * wakeup pending to re-read the wounded state.
+ */
+ if (owner != current) {
+ /*
+ * When waking up the task to wound, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ *
+ * NOTE: We pass NULL here instead of lock, because we
+ * are waking the mutex owner, who may be currently
+ * blocked on a different mutex.
+ */
+ __clear_task_blocked_on(owner, NULL);
+ wake_q_add(wake_q, owner);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We just acquired @lock under @ww_ctx, if there are more important contexts
+ * waiting behind us on the wait-list, check if they need to die, or wound us.
+ *
+ * See __ww_mutex_add_waiter() for the list-order construction; basically the
+ * list is ordered by stamp, smallest (oldest) first.
+ *
+ * This relies on never mixing wait-die/wound-wait on the same wait-list;
+ * which is currently ensured by that being a ww_class property.
+ *
+ * The current task must not be on the wait list.
+ */
+static void
+__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
+{
+ struct MUTEX_WAITER *cur;
+
+ lockdep_assert_wait_lock_held(lock);
+
+ for (cur = __ww_waiter_first(lock); cur;
+ cur = __ww_waiter_next(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_mutex_die(lock, cur, ww_ctx, wake_q) ||
+ __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx, wake_q))
+ break;
+ }
+}
+
+/*
+ * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
+ * and wake up any waiters so they can recheck.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ DEFINE_WAKE_Q(wake_q);
+ unsigned long flags;
+
+ ww_mutex_lock_acquired(lock, ctx);
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the WAITERS check is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* See comments above and below. */
+
+ /*
+ * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
+ * MB MB
+ * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
+ *
+ * The memory barrier above pairs with the memory barrier in
+ * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
+ * and/or !empty list.
+ */
+ if (likely(!__ww_mutex_has_waiters(&lock->base)))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, check if any of the waiters need to
+ * die or wound us.
+ */
+ lock_wait_lock(&lock->base, &flags);
+ __ww_mutex_check_waiters(&lock->base, ctx, &wake_q);
+ preempt_disable();
+ unlock_wait_lock(&lock->base, &flags);
+ wake_up_q(&wake_q);
+ preempt_enable();
+}
+
+static __always_inline int
+__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ if (ww_ctx->acquired > 0) {
+#ifdef DEBUG_WW_MUTEXES
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+ ww_ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+/*
+ * Check the wound condition for the current lock acquire.
+ *
+ * Wound-Wait: If we're wounded, kill ourself.
+ *
+ * Wait-Die: If we're trying to acquire a lock already held by an older
+ * context, kill ourselves.
+ *
+ * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
+ * look at waiters before us in the wait-list.
+ */
+static inline int
+__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+ struct MUTEX_WAITER *cur;
+
+ if (ctx->acquired == 0)
+ return 0;
+
+ if (!ctx->is_wait_die) {
+ if (ctx->wounded)
+ return __ww_mutex_kill(lock, ctx);
+
+ return 0;
+ }
+
+ if (hold_ctx && __ww_ctx_less(ctx, hold_ctx))
+ return __ww_mutex_kill(lock, ctx);
+
+ /*
+ * If there is a waiter in front of us that has a context, then its
+ * stamp is earlier than ours and we must kill ourself.
+ */
+ for (cur = __ww_waiter_prev(lock, waiter); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ return __ww_mutex_kill(lock, ctx);
+ }
+
+ return 0;
+}
+
+/*
+ * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
+ * first. Such that older contexts are preferred to acquire the lock over
+ * younger contexts.
+ *
+ * Waiters without context are interspersed in FIFO order.
+ *
+ * Furthermore, for Wait-Die kill ourself immediately when possible (there are
+ * older contexts already waiting) to avoid unnecessary waiting and for
+ * Wound-Wait ensure we wound the owning context when it is younger.
+ */
+static inline int
+__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
+ struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
+{
+ struct MUTEX_WAITER *cur, *pos = NULL;
+ bool is_wait_die;
+
+ if (!ww_ctx) {
+ __ww_waiter_add(lock, waiter, NULL);
+ return 0;
+ }
+
+ is_wait_die = ww_ctx->is_wait_die;
+
+ /*
+ * Add the waiter before the first waiter with a higher stamp.
+ * Waiters without a context are skipped to avoid starving
+ * them. Wait-Die waiters may die here. Wound-Wait waiters
+ * never die here, but they are sorted in stamp order and
+ * may wound the lock holder.
+ */
+ for (cur = __ww_waiter_last(lock); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) {
+ /*
+ * Wait-Die: if we find an older context waiting, there
+ * is no point in queueing behind it, as we'd have to
+ * die the moment it would acquire the lock.
+ */
+ if (is_wait_die) {
+ int ret = __ww_mutex_kill(lock, ww_ctx);
+
+ if (ret)
+ return ret;
+ }
+
+ break;
+ }
+
+ pos = cur;
+
+ /* Wait-Die: ensure younger waiters die. */
+ __ww_mutex_die(lock, cur, ww_ctx, wake_q);
+ }
+
+ __ww_waiter_add(lock, waiter, pos);
+
+ /*
+ * Wound-Wait: if we're blocking on a mutex owned by a younger context,
+ * wound that such that we might proceed.
+ */
+ if (!is_wait_die) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+
+ /*
+ * See ww_mutex_set_context_fastpath(). Orders setting
+ * MUTEX_FLAG_WAITERS vs the ww->ctx load,
+ * such that either we or the fastpath will wound @ww->ctx.
+ */
+ smp_mb();
+ __ww_mutex_wound(lock, ww_ctx, ww->ctx, wake_q);
+ }
+
+ return 0;
+}
+
+static inline void __ww_mutex_unlock(struct ww_mutex *lock)
+{
+ if (lock->ctx) {
+#ifdef DEBUG_WW_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+}
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
new file mode 100644
index 000000000000..c7196de838ed
--- /dev/null
+++ b/kernel/locking/ww_rt_mutex.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#define WW_RT
+#include "rtmutex.c"
+
+int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ if (!ww_ctx)
+ return rt_mutex_trylock(rtm);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__rt_mutex_trylock(&rtm->rtmutex)) {
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ww_mutex_trylock);
+
+static int __sched
+__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ unsigned int state, unsigned long ip)
+{
+ struct lockdep_map __maybe_unused *nest_lock = NULL;
+ struct rt_mutex *rtm = &lock->base;
+ int ret;
+
+ might_sleep();
+
+ if (ww_ctx) {
+ if (unlikely(ww_ctx == READ_ONCE(lock->ctx)))
+ return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
+ }
+ mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
+
+ if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
+ if (ww_ctx)
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ return 0;
+ }
+
+ ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state);
+
+ if (ret)
+ mutex_release(&rtm->dep_map, ip);
+ return ret;
+}
+
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock);
+
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
+
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ __ww_mutex_unlock(lock);
+
+ mutex_release(&rtm->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&rtm->rtmutex);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);