summaryrefslogtreecommitdiff
path: root/kernel/locking
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/locking')
-rw-r--r--kernel/locking/Makefile8
-rw-r--r--kernel/locking/irqflag-debug.c13
-rw-r--r--kernel/locking/lock_events.c10
-rw-r--r--kernel/locking/lock_events.h4
-rw-r--r--kernel/locking/lockdep.c703
-rw-r--r--kernel/locking/lockdep_internals.h14
-rw-r--r--kernel/locking/lockdep_proc.c81
-rw-r--r--kernel/locking/locktorture.c696
-rw-r--r--kernel/locking/mcs_spinlock.h2
-rw-r--r--kernel/locking/mutex-debug.c9
-rw-r--r--kernel/locking/mutex-debug.h29
-rw-r--r--kernel/locking/mutex.c688
-rw-r--r--kernel/locking/mutex.h50
-rw-r--r--kernel/locking/osq_lock.c41
-rw-r--r--kernel/locking/percpu-rwsem.c23
-rw-r--r--kernel/locking/qrwlock.c35
-rw-r--r--kernel/locking/qspinlock.c11
-rw-r--r--kernel/locking/qspinlock_paravirt.h26
-rw-r--r--kernel/locking/rtmutex-debug.c182
-rw-r--r--kernel/locking/rtmutex-debug.h37
-rw-r--r--kernel/locking/rtmutex.c1615
-rw-r--r--kernel/locking/rtmutex.h35
-rw-r--r--kernel/locking/rtmutex_api.c612
-rw-r--r--kernel/locking/rtmutex_common.h231
-rw-r--r--kernel/locking/rwbase_rt.c297
-rw-r--r--kernel/locking/rwsem.c599
-rw-r--r--kernel/locking/rwsem.h0
-rw-r--r--kernel/locking/semaphore.c33
-rw-r--r--kernel/locking/spinlock.c80
-rw-r--r--kernel/locking/spinlock_debug.c6
-rw-r--r--kernel/locking/spinlock_rt.c286
-rw-r--r--kernel/locking/test-ww_mutex.c139
-rw-r--r--kernel/locking/ww_mutex.h569
-rw-r--r--kernel/locking/ww_rt_mutex.c101
34 files changed, 4804 insertions, 2461 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6d11cfb9b41f..0db4093d17b8 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,16 +5,16 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
-# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
+obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
@@ -24,8 +24,8 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
-obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o
+obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c
new file mode 100644
index 000000000000..810b50344d35
--- /dev/null
+++ b/kernel/locking/irqflag-debug.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/irqflags.h>
+
+noinstr void warn_bogus_irq_restore(void)
+{
+ instrumentation_begin();
+ WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n");
+ instrumentation_end();
+}
+EXPORT_SYMBOL(warn_bogus_irq_restore);
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
index fa2c2f951c6b..e68d82099558 100644
--- a/kernel/locking/lock_events.c
+++ b/kernel/locking/lock_events.c
@@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void)
struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
int i;
- if (!d_counts)
+ if (IS_ERR(d_counts))
goto out;
/*
@@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void)
for (i = 0; i < lockevent_num; i++) {
if (skip_lockevent(lockevent_names[i]))
continue;
- if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
- (void *)(long)i, &fops_lockevent))
+ if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts,
+ (void *)(long)i, &fops_lockevent)))
goto fail_undo;
}
- if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+ if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
d_counts, (void *)(long)LOCKEVENT_reset_cnts,
- &fops_lockevent))
+ &fops_lockevent)))
goto fail_undo;
return 0;
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index 8c7e7d25f09c..a6016b91803d 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -57,4 +57,8 @@ static inline void __lockevent_add(enum lock_events event, int inc)
#define lockevent_cond_inc(ev, c)
#endif /* CONFIG_LOCK_EVENT_COUNTS */
+
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos);
+
#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index bdaf4829098c..151bd3de5936 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -54,28 +54,60 @@
#include <linux/nmi.h>
#include <linux/rcupdate.h>
#include <linux/kprobes.h>
+#include <linux/lockdep.h>
+#include <linux/context_tracking.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
-#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
#ifdef CONFIG_PROVE_LOCKING
-int prove_locking = 1;
+static int prove_locking = 1;
module_param(prove_locking, int, 0644);
#else
#define prove_locking 0
#endif
#ifdef CONFIG_LOCK_STAT
-int lock_stat = 1;
+static int lock_stat = 1;
module_param(lock_stat, int, 0644);
#else
#define lock_stat 0
#endif
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_lockdep_table[] = {
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_LOCK_STAT
+ {
+ .procname = "lock_stat",
+ .data = &lock_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_LOCK_STAT */
+ { }
+};
+
+static __init int kernel_lockdep_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_lockdep_table);
+ return 0;
+}
+late_initcall(kernel_lockdep_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
DEFINE_PER_CPU(unsigned int, lockdep_recursion);
EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
@@ -182,11 +214,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
unsigned long nr_zapped_classes;
-#ifndef CONFIG_DEBUG_LOCKDEP
-static
-#endif
+unsigned long max_lock_class_idx;
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
-static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
+DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
@@ -337,7 +367,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
* elements. These elements are linked together by the lock_entry member in
* struct lock_class.
*/
-LIST_HEAD(all_lock_classes);
+static LIST_HEAD(all_lock_classes);
static LIST_HEAD(free_lock_classes);
/**
@@ -679,7 +709,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
usage[i] = '\0';
}
-static void __print_lock_name(struct lock_class *class)
+static void __print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char str[KSYM_NAME_LEN];
const char *name;
@@ -694,18 +724,20 @@ static void __print_lock_name(struct lock_class *class)
printk(KERN_CONT "#%d", class->name_version);
if (class->subclass)
printk(KERN_CONT "/%d", class->subclass);
+ if (hlock && class->print_fn)
+ class->print_fn(hlock->instance);
}
}
-static void print_lock_name(struct lock_class *class)
+static void print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char usage[LOCK_USAGE_CHARS];
get_usage_chars(class, usage);
printk(KERN_CONT " (");
- __print_lock_name(class);
- printk(KERN_CONT "){%s}-{%hd:%hd}", usage,
+ __print_lock_name(hlock, class);
+ printk(KERN_CONT "){%s}-{%d:%d}", usage,
class->wait_type_outer ?: class->wait_type_inner,
class->wait_type_inner);
}
@@ -742,7 +774,7 @@ static void print_lock(struct held_lock *hlock)
}
printk(KERN_CONT "%px", hlock->instance);
- print_lock_name(lock);
+ print_lock_name(hlock, lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -759,7 +791,7 @@ static void lockdep_print_held_locks(struct task_struct *p)
* It's not reliable to print a task's held locks if it's not sleeping
* and it's not the current task.
*/
- if (p->state == TASK_RUNNING && p != current)
+ if (p != current && task_is_running(p))
return;
for (i = 0; i < depth; i++) {
printk(" #%d: ", i);
@@ -789,20 +821,24 @@ static int very_verbose(struct lock_class *class)
#ifdef __KERNEL__
static int static_obj(const void *obj)
{
- unsigned long start = (unsigned long) &_stext,
- end = (unsigned long) &_end,
- addr = (unsigned long) obj;
+ unsigned long addr = (unsigned long) obj;
- if (arch_is_kernel_initmem_freed(addr))
- return 0;
+ if (is_kernel_core_data(addr))
+ return 1;
/*
- * static variable?
+ * keys are allowed in the __ro_after_init section.
*/
- if ((addr >= start) && (addr < end))
+ if (is_kernel_rodata(addr))
return 1;
- if (arch_is_kernel_data(addr))
+ /*
+ * in initdata section and used during bootup only?
+ * NOTE: On some platforms the initdata section is
+ * outside of the _stext ... _end range.
+ */
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ init_section_contains((void *)addr, 1))
return 1;
/*
@@ -842,7 +878,7 @@ static int count_matching_names(struct lock_class *new_class)
}
/* used from NMI context -- must be lockless */
-static __always_inline struct lock_class *
+static noinstr struct lock_class *
look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
@@ -850,12 +886,14 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
struct lock_class *class;
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ instrumentation_begin();
debug_locks_off();
printk(KERN_ERR
"BUG: looking up invalid subclass: %u\n", subclass);
printk(KERN_ERR
"turning off the locking correctness validator.\n");
dump_stack();
+ instrumentation_end();
return NULL;
}
@@ -885,14 +923,16 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name &&
- lock->key != &__lockdep_no_validate__);
+ WARN_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__,
+ "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n",
+ lock->name, lock->key, class->name);
return class;
}
}
@@ -930,7 +970,8 @@ static bool assign_lock_key(struct lockdep_map *lock)
/* Debug-check: all keys must be persistent! */
debug_locks_off();
pr_err("INFO: trying to register non-static key.\n");
- pr_err("the code is fine but needs lockdep annotation.\n");
+ pr_err("The code is fine but needs lockdep annotation, or maybe\n");
+ pr_err("you didn't initialize this object before use?\n");
pr_err("turning off the locking correctness validator.\n");
dump_stack();
return false;
@@ -1236,6 +1277,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ int idx;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -1290,6 +1332,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
class->name_version = count_matching_names(class);
class->wait_type_inner = lock->wait_type_inner;
class->wait_type_outer = lock->wait_type_outer;
+ class->lock_type = lock->lock_type;
/*
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
@@ -1300,6 +1343,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* of classes.
*/
list_move_tail(&class->lock_entry, &all_lock_classes);
+ idx = class - lock_classes;
+ if (idx > max_lock_class_idx)
+ max_lock_class_idx = idx;
if (verbose(class)) {
graph_unlock();
@@ -1361,7 +1407,7 @@ static struct lock_list *alloc_list_entry(void)
*/
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
- unsigned long ip, u16 distance, u8 dep,
+ u16 distance, u8 dep,
const struct lock_trace *trace)
{
struct lock_list *entry;
@@ -1391,7 +1437,7 @@ static int add_lock_to_list(struct lock_class *this,
/*
* For good efficiency of modular, we use power of 2
*/
-#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define MAX_CIRCULAR_QUEUE_SIZE (1UL << CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS)
#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
/*
@@ -1671,6 +1717,7 @@ static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset)
static enum bfs_result __bfs(struct lock_list *source_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry,
int offset)
{
@@ -1731,7 +1778,12 @@ static enum bfs_result __bfs(struct lock_list *source_entry,
/*
* Step 3: we haven't visited this and there is a strong
* dependency path to this, so check with @match.
+ * If @skip is provide and returns true, we skip this
+ * lock (and any path this lock is in).
*/
+ if (skip && skip(lock, data))
+ continue;
+
if (match(lock, data)) {
*target_entry = lock;
return BFS_RMATCH;
@@ -1739,7 +1791,7 @@ static enum bfs_result __bfs(struct lock_list *source_entry,
/*
* Step 4: if not match, expand the path by adding the
- * forward or backwards dependencis in the search
+ * forward or backwards dependencies in the search
*
*/
first = true;
@@ -1774,9 +1826,10 @@ static inline enum bfs_result
__bfs_forwards(struct lock_list *src_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_after));
}
@@ -1785,9 +1838,10 @@ static inline enum bfs_result
__bfs_backwards(struct lock_list *src_entry,
void *data,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry,
+ return __bfs(src_entry, data, match, skip, target_entry,
offsetof(struct lock_class, locks_before));
}
@@ -1808,7 +1862,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
if (debug_locks_silent)
return;
printk("\n-> #%u", depth);
- print_lock_name(target->class);
+ print_lock_name(NULL, target->class);
printk(KERN_CONT ":\n");
print_lock_trace(target->trace, 6);
}
@@ -1821,6 +1875,8 @@ print_circular_lock_scenario(struct held_lock *src,
struct lock_class *source = hlock_class(src);
struct lock_class *target = hlock_class(tgt);
struct lock_class *parent = prt->class;
+ int src_read = src->read;
+ int tgt_read = tgt->read;
/*
* A direct locking problem where unsafe_class lock is taken
@@ -1837,28 +1893,36 @@ print_circular_lock_scenario(struct held_lock *src,
*/
if (parent != source) {
printk("Chain exists of:\n ");
- __print_lock_name(source);
+ __print_lock_name(src, source);
printk(KERN_CONT " --> ");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT " --> ");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT "\n\n");
}
printk(" Possible unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
+ if (tgt_read != 0)
+ printk(" rlock(");
+ else
+ printk(" lock(");
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(parent);
+ __print_lock_name(NULL, parent);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(target);
+ __print_lock_name(tgt, target);
printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
+ if (src_read != 0)
+ printk(" rlock(");
+ else if (src->sync)
+ printk(" sync(");
+ else
+ printk(" lock(");
+ __print_lock_name(src, source);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -1906,7 +1970,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
* -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
* dependency graph, as any strong path ..-> A -> B ->.. we can get with
* having dependency A -> B, we could already get a equivalent path ..-> A ->
- * .. -> B -> .. with A -> .. -> B. Therefore A -> B is reduntant.
+ * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
*
* We need to make sure both the start and the end of A -> .. -> B is not
* weaker than A -> B. For the start part, please see the comment in
@@ -2018,7 +2082,7 @@ static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
unsigned long count = 0;
struct lock_list *target_entry;
- __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -2043,7 +2107,7 @@ static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
unsigned long count = 0;
struct lock_list *target_entry;
- __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -2071,11 +2135,12 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
static noinline enum bfs_result
check_path(struct held_lock *target, struct lock_list *src_entry,
bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
enum bfs_result ret;
- ret = __bfs_forwards(src_entry, target, match, target_entry);
+ ret = __bfs_forwards(src_entry, target, match, skip, target_entry);
if (unlikely(bfs_error(ret)))
print_bfs_bug(ret);
@@ -2083,6 +2148,8 @@ check_path(struct held_lock *target, struct lock_list *src_entry,
return ret;
}
+static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *);
+
/*
* Prove that the dependency graph starting at <src> can not
* lead to <target>. If it can, there is a circle when adding
@@ -2102,7 +2169,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
debug_atomic_inc(nr_cyclic_checks);
- ret = check_path(target, &src_entry, hlock_conflict, &target_entry);
+ ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry);
if (unlikely(ret == BFS_RMATCH)) {
if (!*trace) {
@@ -2114,52 +2181,15 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
*trace = save_trace();
}
- print_circular_bug(&src_entry, target_entry, src, target);
+ if (src->class_idx == target->class_idx)
+ print_deadlock_bug(current, src, target);
+ else
+ print_circular_bug(&src_entry, target_entry, src, target);
}
return ret;
}
-#ifdef CONFIG_LOCKDEP_SMALL
-/*
- * Check that the dependency graph starting at <src> can lead to
- * <target> or not. If it can, <src> -> <target> dependency is already
- * in the graph.
- *
- * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
- * any error appears in the bfs search.
- */
-static noinline enum bfs_result
-check_redundant(struct held_lock *src, struct held_lock *target)
-{
- enum bfs_result ret;
- struct lock_list *target_entry;
- struct lock_list src_entry;
-
- bfs_init_root(&src_entry, src);
- /*
- * Special setup for check_redundant().
- *
- * To report redundant, we need to find a strong dependency path that
- * is equal to or stronger than <src> -> <target>. So if <src> is E,
- * we need to let __bfs() only search for a path starting at a -(E*)->,
- * we achieve this by setting the initial node's ->only_xr to true in
- * that case. And if <prev> is S, we set initial ->only_xr to false
- * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
- */
- src_entry.only_xr = src->read == 0;
-
- debug_atomic_inc(nr_redundant_checks);
-
- ret = check_path(target, &src_entry, hlock_equal, &target_entry);
-
- if (ret == BFS_RMATCH)
- debug_atomic_inc(nr_redundant);
-
- return ret;
-}
-#endif
-
#ifdef CONFIG_TRACE_IRQFLAGS
/*
@@ -2230,6 +2260,49 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask);
}
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ if (entry->class->lock_type == LD_LOCK_NORMAL)
+ return false;
+
+ /*
+ * Skip local_lock() for irq inversion detection.
+ *
+ * For !RT, local_lock() is not a real lock, so it won't carry any
+ * dependency.
+ *
+ * For RT, an irq inversion happens when we have lock A and B, and on
+ * some CPU we can have:
+ *
+ * lock(A);
+ * <interrupted>
+ * lock(B);
+ *
+ * where lock(B) cannot sleep, and we have a dependency B -> ... -> A.
+ *
+ * Now we prove local_lock() cannot exist in that dependency. First we
+ * have the observation for any lock chain L1 -> ... -> Ln, for any
+ * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise
+ * wait context check will complain. And since B is not a sleep lock,
+ * therefore B.inner_wait_type >= 2, and since the inner_wait_type of
+ * local_lock() is 3, which is greater than 2, therefore there is no
+ * way the local_lock() exists in the dependency B -> ... -> A.
+ *
+ * As a result, we will skip local_lock(), when we search for irq
+ * inversion bugs.
+ */
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
+
+ /*
+ * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+ * a lock and only used to override the wait_type.
+ */
+
+ return true;
+}
+
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
@@ -2245,7 +2318,7 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
debug_atomic_inc(nr_find_usage_forwards_checks);
- result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -2262,7 +2335,7 @@ find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
debug_atomic_inc(nr_find_usage_backwards_checks);
- result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -2272,7 +2345,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
int bit;
printk("%*s->", depth, "");
- print_lock_name(class);
+ print_lock_name(NULL, class);
#ifdef CONFIG_DEBUG_LOCKDEP
printk(KERN_CONT " ops: %lu", debug_class_ops_read(class));
#endif
@@ -2294,7 +2367,56 @@ static void print_lock_class_header(struct lock_class *class, int depth)
}
/*
- * printk the shortest lock dependencies from @start to @end in reverse order:
+ * Dependency path printing:
+ *
+ * After BFS we get a lock dependency path (linked via ->parent of lock_list),
+ * printing out each lock in the dependency path will help on understanding how
+ * the deadlock could happen. Here are some details about dependency path
+ * printing:
+ *
+ * 1) A lock_list can be either forwards or backwards for a lock dependency,
+ * for a lock dependency A -> B, there are two lock_lists:
+ *
+ * a) lock_list in the ->locks_after list of A, whose ->class is B and
+ * ->links_to is A. In this case, we can say the lock_list is
+ * "A -> B" (forwards case).
+ *
+ * b) lock_list in the ->locks_before list of B, whose ->class is A
+ * and ->links_to is B. In this case, we can say the lock_list is
+ * "B <- A" (bacwards case).
+ *
+ * The ->trace of both a) and b) point to the call trace where B was
+ * acquired with A held.
+ *
+ * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't
+ * represent a certain lock dependency, it only provides an initial entry
+ * for BFS. For example, BFS may introduce a "helper" lock_list whose
+ * ->class is A, as a result BFS will search all dependencies starting with
+ * A, e.g. A -> B or A -> C.
+ *
+ * The notation of a forwards helper lock_list is like "-> A", which means
+ * we should search the forwards dependencies starting with "A", e.g A -> B
+ * or A -> C.
+ *
+ * The notation of a bacwards helper lock_list is like "<- B", which means
+ * we should search the backwards dependencies ending with "B", e.g.
+ * B <- A or B <- C.
+ */
+
+/*
+ * printk the shortest lock dependencies from @root to @leaf in reverse order.
+ *
+ * We have a lock dependency path as follow:
+ *
+ * @root @leaf
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | <--------- | lock_list | ... | lock_list | <--------- | lock_list |
+ * | -> L1 | | L1 -> L2 | ... |Ln-2 -> Ln-1| | Ln-1 -> Ln|
+ *
+ * , so it's natural that we start from @leaf and print every ->class and
+ * ->trace until we reach the @root.
*/
static void __used
print_shortest_lock_dependencies(struct lock_list *leaf,
@@ -2322,6 +2444,61 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
} while (entry && (depth >= 0));
}
+/*
+ * printk the shortest lock dependencies from @leaf to @root.
+ *
+ * We have a lock dependency path (from a backwards search) as follow:
+ *
+ * @leaf @root
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | ---------> | lock_list | ... | lock_list | ---------> | lock_list |
+ * | L2 <- L1 | | L3 <- L2 | ... | Ln <- Ln-1 | | <- Ln |
+ *
+ * , so when we iterate from @leaf to @root, we actually print the lock
+ * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order.
+ *
+ * Another thing to notice here is that ->class of L2 <- L1 is L1, while the
+ * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call
+ * trace of L1 in the dependency path, which is alright, because most of the
+ * time we can figure out where L1 is held from the call trace of L2.
+ */
+static void __used
+print_shortest_lock_dependencies_backwards(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ const struct lock_trace *trace = NULL;
+ int depth;
+
+ /*compute depth from generated tree by BFS*/
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ if (trace) {
+ printk("%*s ... acquired at:\n", depth, "");
+ print_lock_trace(trace, 2);
+ printk("\n");
+ }
+
+ /*
+ * Record the pointer to the trace for the next lock_list
+ * entry, see the comments for the function.
+ */
+ trace = entry->trace;
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
+}
+
static void
print_irq_lock_scenario(struct lock_list *safe_entry,
struct lock_list *unsafe_entry,
@@ -2350,11 +2527,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
*/
if (middle_class != unsafe_class) {
printk("Chain exists of:\n ");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT " --> ");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT " --> ");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT "\n\n");
}
@@ -2362,18 +2539,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
printk(" lock(");
- __print_lock_name(unsafe_class);
+ __print_lock_name(NULL, unsafe_class);
printk(KERN_CONT ");\n");
printk(" local_irq_disable();\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(middle_class);
+ __print_lock_name(NULL, middle_class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(safe_class);
+ __print_lock_name(NULL, safe_class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -2410,20 +2587,20 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn("\nand this task is already holding:\n");
print_lock(prev);
pr_warn("which would create a new lock dependency:\n");
- print_lock_name(hlock_class(prev));
+ print_lock_name(prev, hlock_class(prev));
pr_cont(" ->");
- print_lock_name(hlock_class(next));
+ print_lock_name(next, hlock_class(next));
pr_cont("\n");
pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
- print_lock_name(backwards_entry->class);
+ print_lock_name(NULL, backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
- print_lock_name(forwards_entry->class);
+ print_lock_name(NULL, forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
@@ -2436,10 +2613,7 @@ print_bad_irq_dependency(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
- prev_root->trace = save_trace();
- if (!prev_root->trace)
- return;
- print_shortest_lock_dependencies(backwards_entry, prev_root);
+ print_shortest_lock_dependencies_backwards(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
@@ -2627,7 +2801,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
*/
bfs_init_rootb(&this, prev);
- ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL);
if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
@@ -2657,8 +2831,18 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
* Step 3: we found a bad match! Now retrieve a lock from the backward
* list whose usage mask matches the exclusive usage mask from the
* lock found on the forward list.
+ *
+ * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering
+ * the follow case:
+ *
+ * When trying to add A -> B to the graph, we find that there is a
+ * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M,
+ * that B -> ... -> M. However M is **softirq-safe**, if we use exact
+ * invert bits of M's usage_mask, we will find another lock N that is
+ * **softirq-unsafe** and N -> ... -> A, however N -> .. -> M will not
+ * cause a inversion deadlock.
*/
- backward_mask = original_mask(target_entry1->class->usage_mask);
+ backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL);
ret = find_usage_backwards(&this, backward_mask, &target_entry);
if (bfs_error(ret)) {
@@ -2694,8 +2878,68 @@ static inline int check_irq_usage(struct task_struct *curr,
{
return 1;
}
+
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ return false;
+}
+
#endif /* CONFIG_TRACE_IRQFLAGS */
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if
+ * any error appears in the bfs search.
+ */
+static noinline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
+ /*
+ * Special setup for check_redundant().
+ *
+ * To report redundant, we need to find a strong dependency path that
+ * is equal to or stronger than <src> -> <target>. So if <src> is E,
+ * we need to let __bfs() only search for a path starting at a -(E*)->,
+ * we achieve this by setting the initial node's ->only_xr to true in
+ * that case. And if <prev> is S, we set initial ->only_xr to false
+ * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
+ */
+ src_entry.only_xr = src->read == 0;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ /*
+ * Note: we skip local_lock() for redundant check, because as the
+ * comment in usage_skip(), A -> local_lock() -> B and A -> B are not
+ * the same.
+ */
+ ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry);
+
+ if (ret == BFS_RMATCH)
+ debug_atomic_inc(nr_redundant);
+
+ return ret;
+}
+
+#else
+
+static inline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
+{
+ return BFS_RNOMATCH;
+}
+
+#endif
+
static void inc_chains(int irq_context)
{
if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
@@ -2726,10 +2970,10 @@ print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(prev);
+ __print_lock_name(prv, prev);
printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(next);
+ __print_lock_name(nxt, next);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
printk(" May be due to missing lock nesting notation\n\n");
@@ -2739,6 +2983,8 @@ static void
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
+ struct lock_class *class = hlock_class(prev);
+
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
@@ -2753,6 +2999,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nbut task is already holding lock:\n");
print_lock(prev);
+ if (class->cmp_fn) {
+ pr_warn("and the lock comparison function returns %i:\n",
+ class->cmp_fn(prev->instance, next->instance));
+ }
+
pr_warn("\nother info that might help us debug this:\n");
print_deadlock_scenario(next, prev);
lockdep_print_held_locks(curr);
@@ -2774,6 +3025,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
static int
check_deadlock(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_class *class;
struct held_lock *prev;
struct held_lock *nest = NULL;
int i;
@@ -2794,6 +3046,12 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
if ((next->read == 2) && prev->read)
continue;
+ class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ continue;
+
/*
* We're holding the nest_lock, which serializes this lock's
* nesting behaviour.
@@ -2855,6 +3113,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return 2;
}
+ if (prev->class_idx == next->class_idx) {
+ struct lock_class *class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ return 2;
+ }
+
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
@@ -2916,7 +3182,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
-#ifdef CONFIG_LOCKDEP_SMALL
/*
* Is the <prev> -> <next> link redundant?
*/
@@ -2925,7 +3190,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return 0;
else if (ret == BFS_RMATCH)
return 2;
-#endif
if (!*trace) {
*trace = save_trace();
@@ -2938,19 +3202,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* to the previous lock's dependency list:
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
- &hlock_class(prev)->locks_after,
- next->acquire_ip, distance,
- calc_dep(prev, next),
- *trace);
+ &hlock_class(prev)->locks_after, distance,
+ calc_dep(prev, next), *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
- &hlock_class(next)->locks_before,
- next->acquire_ip, distance,
- calc_depb(prev, next),
- *trace);
+ &hlock_class(next)->locks_before, distance,
+ calc_depb(prev, next), *trace);
if (!ret)
return 0;
@@ -3237,7 +3497,8 @@ static int alloc_chain_hlocks(int req)
size = chain_block_size(curr);
if (likely(size >= req)) {
del_chain_block(0, size, chain_block_next(curr));
- add_chain_block(curr + req, size - req);
+ if (size > req)
+ add_chain_block(curr + req, size - req);
return curr;
}
}
@@ -3269,7 +3530,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
u16 chain_hlock = chain_hlocks[chain->base + i];
unsigned int class_idx = chain_hlock_class_idx(chain_hlock);
- return lock_classes + class_idx - 1;
+ return lock_classes + class_idx;
}
/*
@@ -3337,7 +3598,7 @@ static void print_chain_keys_chain(struct lock_chain *chain)
hlock_id = chain_hlocks[chain->base + i];
chain_key = print_chain_key_iteration(hlock_id, chain_key);
- print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1);
+ print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id));
printk("\n");
}
}
@@ -3694,11 +3955,11 @@ static void print_usage_bug_scenario(struct held_lock *lock)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(class);
+ __print_lock_name(lock, class);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -3707,7 +3968,7 @@ static void
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
- if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ if (!debug_locks_off() || debug_locks_silent)
return;
pr_warn("\n");
@@ -3748,6 +4009,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
+ graph_unlock();
print_usage_bug(curr, this, bad_bit, new_bit);
return 0;
}
@@ -3783,7 +4045,7 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
- print_lock_name(other->class);
+ print_lock_name(NULL, other->class);
pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
pr_warn("\nother info that might help us debug this:\n");
@@ -4040,14 +4302,13 @@ static void __trace_hardirqs_on_caller(void)
/**
* lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
- * @ip: Caller address
*
* Invoked before a possible transition to RCU idle from exit to user or
* guest mode. This ensures that all RCU operations are done before RCU
* stops watching. After the RCU transition lockdep_hardirqs_on() has to be
* invoked to set the final state.
*/
-void lockdep_hardirqs_on_prepare(unsigned long ip)
+void lockdep_hardirqs_on_prepare(void)
{
if (unlikely(!debug_locks))
return;
@@ -4307,7 +4568,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
return 0;
}
}
- if (!hlock->hardirqs_off) {
+
+ /*
+ * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+ * creates no critical section and no extra dependency can be introduced
+ * by interrupts
+ */
+ if (!hlock->hardirqs_off && !hlock->sync) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQ_READ))
@@ -4489,7 +4756,7 @@ print_lock_invalid_wait_context(struct task_struct *curr,
/*
* Verify the wait_type context.
*
- * This check validates we takes locks in the right wait-type order; that is it
+ * This check validates we take locks in the right wait-type order; that is it
* ensures that we do not take mutexes inside spinlocks and do not attempt to
* acquire spinlocks inside raw_spinlocks and the sort.
*
@@ -4503,12 +4770,12 @@ print_lock_invalid_wait_context(struct task_struct *curr,
*/
static int check_wait_context(struct task_struct *curr, struct held_lock *next)
{
- short next_inner = hlock_class(next)->wait_type_inner;
- short next_outer = hlock_class(next)->wait_type_outer;
- short curr_inner;
+ u8 next_inner = hlock_class(next)->wait_type_inner;
+ u8 next_outer = hlock_class(next)->wait_type_outer;
+ u8 curr_inner;
int depth;
- if (!curr->lockdep_depth || !next_inner || next->trylock)
+ if (!next_inner || next->trylock)
return 0;
if (!next_outer)
@@ -4528,7 +4795,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
- short prev_inner = hlock_class(prev)->wait_type_inner;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;
if (prev_inner) {
/*
@@ -4538,6 +4806,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
* Also due to trylocks.
*/
curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations -- this is typically
+ * only valid/needed for code that only exists when
+ * CONFIG_PREEMPT_RT=n.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ curr_inner = prev_inner;
}
}
@@ -4577,9 +4853,9 @@ static inline int check_wait_context(struct task_struct *curr,
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
+void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass,
- short inner, short outer)
+ u8 inner, u8 outer, u8 lock_type)
{
int i;
@@ -4602,6 +4878,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
lock->wait_type_outer = outer;
lock->wait_type_inner = inner;
+ lock->lock_type = lock_type;
/*
* No key, no joy, we need to hash something.
@@ -4636,15 +4913,41 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
raw_local_irq_restore(flags);
}
}
-EXPORT_SYMBOL_GPL(lockdep_init_map_waits);
+EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
+#ifdef CONFIG_PROVE_LOCKING
+void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn,
+ lock_print_fn print_fn)
+{
+ struct lock_class *class = lock->class_cache[0];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ lockdep_recursion_inc();
+
+ if (!class)
+ class = register_lock_class(lock, 0, 0);
+
+ if (class) {
+ WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn);
+ WARN_ON(class->print_fn && class->print_fn != print_fn);
+
+ class->cmp_fn = cmp_fn;
+ class->print_fn = print_fn;
+ }
+
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn);
+#endif
+
static void
print_lock_nested_lock_not_held(struct task_struct *curr,
- struct held_lock *hlock,
- unsigned long ip)
+ struct held_lock *hlock)
{
if (!debug_locks_off())
return;
@@ -4686,7 +4989,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read);
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references, int pin_count)
+ int references, int pin_count, int sync)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -4737,7 +5040,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes;
- if (depth) { /* we're holding locks */
+ if (depth && !sync) {
+ /* we're holding locks and the new held lock is not a sync */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (!references)
@@ -4771,6 +5075,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
+ hlock->sync = !!sync;
hlock->hardirqs_off = !!hardirqs_off;
hlock->references = references;
#ifdef CONFIG_LOCK_STAT
@@ -4820,7 +5125,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
if (nest_lock && !__lock_is_held(nest_lock, -1)) {
- print_lock_nested_lock_not_held(curr, hlock, ip);
+ print_lock_nested_lock_not_held(curr, hlock);
return 0;
}
@@ -4832,6 +5137,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
+ /* For lock_sync(), we are done here since no actual critical section */
+ if (hlock->sync)
+ return 1;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -4973,7 +5282,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count)) {
+ hlock->references, hlock->pin_count, 0)) {
case 0:
return 1;
case 1:
@@ -5017,9 +5326,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
}
- lockdep_init_map_waits(lock, name, key, 0,
- lock->wait_type_inner,
- lock->wait_type_outer);
+ lockdep_init_map_type(lock, name, key, 0,
+ lock->wait_type_inner,
+ lock->wait_type_outer,
+ lock->lock_type);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes;
@@ -5183,14 +5493,14 @@ int __lock_is_held(const struct lockdep_map *lock, int read)
struct held_lock *hlock = curr->held_locks + i;
if (match_held_lock(hlock, lock)) {
- if (read == -1 || hlock->read == read)
- return 1;
+ if (read == -1 || !!hlock->read == read)
+ return LOCK_STATE_HELD;
- return 0;
+ return LOCK_STATE_NOT_HELD;
}
}
- return 0;
+ return LOCK_STATE_NOT_HELD;
}
static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
@@ -5211,7 +5521,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
* be guessable and still allows some pin nesting in
* our u32 pin_count.
*/
- cookie.val = 1 + (prandom_u32() >> 16);
+ cookie.val = 1 + (sched_clock() & 0xffff);
hlock->pin_count += cookie.val;
return cookie;
}
@@ -5290,6 +5600,7 @@ static noinstr void check_flags(unsigned long flags)
}
}
+#ifndef CONFIG_PREEMPT_RT
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@@ -5304,6 +5615,7 @@ static noinstr void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
+#endif
if (!debug_locks)
print_irqtrace_events(current);
@@ -5440,7 +5752,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
@@ -5466,13 +5778,45 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_release);
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is potential deadlock.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+ int check, struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_acquire(lock, subclass, 0, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
- int ret = 0;
+ int ret = LOCK_STATE_NOT_HELD;
+ /*
+ * Avoid false negative lockdep_assert_held() and
+ * lockdep_assert_not_held().
+ */
if (unlikely(!lockdep_enabled()))
- return 1; /* avoid false negative lockdep_assert_held() */
+ return LOCK_STATE_UNKNOWN;
raw_local_irq_save(flags);
check_flags(flags);
@@ -5662,7 +6006,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- trace_lock_acquired(lock, ip);
+ trace_lock_contended(lock, ip);
if (unlikely(!lock_stat || !lockdep_enabled()))
return;
@@ -5680,7 +6024,7 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- trace_lock_contended(lock, ip);
+ trace_lock_acquired(lock, ip);
if (unlikely(!lock_stat || !lockdep_enabled()))
return;
@@ -5799,6 +6143,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
WRITE_ONCE(class->name, NULL);
nr_lock_classes--;
__clear_bit(class - lock_classes, lock_classes_in_use);
+ if (class - lock_classes == max_lock_class_idx)
+ max_lock_class_idx--;
} else {
WARN_ONCE(true, "%s() failed for class %s\n", __func__,
class->name);
@@ -5810,13 +6156,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
static void reinit_class(struct lock_class *class)
{
- void *const p = class;
- const unsigned int offset = offsetof(struct lock_class, key);
-
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
- memset(p + offset, 0, sizeof(*class) - offset);
+ memset_startat(class, 0, key);
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
@@ -6089,7 +6432,13 @@ void lockdep_reset_lock(struct lockdep_map *lock)
lockdep_reset_lock_reg(lock);
}
-/* Unregister a dynamically allocated key. */
+/*
+ * Unregister a dynamically allocated key.
+ *
+ * Unlike lockdep_register_key(), a search is always done to find a matching
+ * key irrespective of debug_locks to avoid potential invalid access to freed
+ * memory in lock_class entry.
+ */
void lockdep_unregister_key(struct lock_class_key *key)
{
struct hlist_head *hash_head = keyhashentry(key);
@@ -6104,10 +6453,8 @@ void lockdep_unregister_key(struct lock_class_key *key)
return;
raw_local_irq_save(flags);
- if (!graph_lock())
- goto out_irq;
+ lockdep_lock();
- pf = get_pending_free();
hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
if (k == key) {
hlist_del_rcu(&k->hash_entry);
@@ -6115,11 +6462,13 @@ void lockdep_unregister_key(struct lock_class_key *key)
break;
}
}
- WARN_ON_ONCE(!found);
- __lockdep_free_key_range(pf, key, 1);
- call_rcu_zapped(pf);
- graph_unlock();
-out_irq:
+ WARN_ON_ONCE(!found && debug_locks);
+ if (found) {
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, key, 1);
+ call_rcu_zapped(pf);
+ }
+ lockdep_unlock();
raw_local_irq_restore(flags);
/* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
@@ -6319,6 +6668,8 @@ asmlinkage __visible void lockdep_sys_exit(void)
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
+ int dl = READ_ONCE(debug_locks);
+ bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
@@ -6328,15 +6679,16 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
pr_warn("-----------------------------\n");
pr_warn("%s:%d %s!\n", file, line, s);
pr_warn("\nother info that might help us debug this:\n\n");
- pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: "",
- rcu_scheduler_active, debug_locks);
+ rcu_scheduler_active, dl,
+ dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
- * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * between ct_idle_enter() and ct_idle_exit(), then RCU
* considers that CPU to be in an "extended quiescent state",
* which means that RCU will be completely ignoring that CPU.
* Therefore, rcu_read_lock() and friends have absolutely no
@@ -6358,5 +6710,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index de49f9e1c11b..bbe9000260d0 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -99,16 +99,16 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_STACK_TRACE_ENTRIES 262144UL
#define STACK_TRACE_HASH_SIZE 8192
#else
-#define MAX_LOCKDEP_ENTRIES 32768UL
+#define MAX_LOCKDEP_ENTRIES (1UL << CONFIG_LOCKDEP_BITS)
-#define MAX_LOCKDEP_CHAINS_BITS 16
+#define MAX_LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
-#define MAX_STACK_TRACE_ENTRIES 524288UL
-#define STACK_TRACE_HASH_SIZE 16384
+#define MAX_STACK_TRACE_ENTRIES (1UL << CONFIG_LOCKDEP_STACK_TRACE_BITS)
+#define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS)
#endif
/*
@@ -121,7 +121,6 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
-extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
@@ -151,6 +150,10 @@ extern unsigned int nr_large_chain_blocks;
extern unsigned int max_lockdep_depth;
extern unsigned int max_bfs_queue_depth;
+extern unsigned long max_lock_class_idx;
+
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+extern unsigned long lock_classes_in_use[];
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
@@ -205,7 +208,6 @@ struct lockdep_stats {
};
DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
-extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
#define __debug_atomic_inc(ptr) \
this_cpu_inc(lockdep_stats.ptr);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 02ef87f50df2..e2bfb1db589d 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -24,14 +24,33 @@
#include "lockdep_internals.h"
+/*
+ * Since iteration of lock_classes is done without holding the lockdep lock,
+ * it is not safe to iterate all_lock_classes list directly as the iteration
+ * may branch off to free_lock_classes or the zapped list. Iteration is done
+ * directly on the lock_classes array by checking the lock_classes_in_use
+ * bitmap and max_lock_class_idx.
+ */
+#define iterate_lock_classes(idx, class) \
+ for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \
+ idx++, class++)
+
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- return seq_list_next(v, &all_lock_classes, pos);
+ struct lock_class *class = v;
+
+ ++class;
+ *pos = class - lock_classes;
+ return (*pos > max_lock_class_idx) ? NULL : class;
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- return seq_list_start_head(&all_lock_classes, *pos);
+ unsigned long idx = *pos;
+
+ if (idx > max_lock_class_idx)
+ return NULL;
+ return lock_classes + idx;
}
static void l_stop(struct seq_file *m, void *v)
@@ -57,39 +76,43 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_class *class = v;
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
+ int idx = class - lock_classes;
- if (v == &all_lock_classes) {
+ if (v == lock_classes)
seq_printf(m, "all lock classes:\n");
+
+ if (!test_bit(idx, lock_classes_in_use))
return 0;
- }
seq_printf(m, "%p", class->key);
#ifdef CONFIG_DEBUG_LOCKDEP
seq_printf(m, " OPS:%8ld", debug_class_ops_read(class));
#endif
-#ifdef CONFIG_PROVE_LOCKING
- seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
- seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
-#endif
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+ seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
- get_usage_chars(class, usage);
- seq_printf(m, " %s", usage);
+ get_usage_chars(class, usage);
+ seq_printf(m, " %s", usage);
+ }
seq_printf(m, ": ");
print_name(m, class);
seq_puts(m, "\n");
- list_for_each_entry(entry, &class->locks_after, entry) {
- if (entry->distance == 1) {
- seq_printf(m, " -> [%p] ", entry->class->key);
- print_name(m, entry->class);
- seq_puts(m, "\n");
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ if (entry->distance == 1) {
+ seq_printf(m, " -> [%p] ", entry->class->key);
+ print_name(m, entry->class);
+ seq_puts(m, "\n");
+ }
}
+ seq_puts(m, "\n");
}
- seq_puts(m, "\n");
return 0;
}
@@ -218,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
if (class->usage_mask == 0)
nr_unused++;
@@ -252,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
sum_forward_deps += lockdep_count_forward_deps(class);
}
+
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
@@ -343,12 +370,14 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " max bfs queue depth: %11u\n",
max_bfs_queue_depth);
#endif
+ seq_printf(m, " max lock class index: %11lu\n",
+ max_lock_class_idx);
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
/*
- * Zappped classes and lockdep data buffers reuse statistics.
+ * Zapped classes and lockdep data buffers reuse statistics.
*/
seq_puts(m, "\n");
seq_printf(m, " zapped classes: %11lu\n",
@@ -411,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
static void seq_time(struct seq_file *m, s64 time)
{
- char num[15];
+ char num[22];
snprint_time(num, sizeof(num), time);
seq_printf(m, " %14s", num);
@@ -620,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!res) {
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
iter->class = class;
iter->stats = lock_stats(class);
iter++;
}
+
data->iter_end = iter;
sort(data->stats, data->iter_end - data->stats,
@@ -643,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct lock_class *class;
+ unsigned long idx;
char c;
if (count) {
@@ -652,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
if (c != '0')
return count;
- list_for_each_entry(class, &all_lock_classes, lock_entry)
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
clear_lock_stats(class);
+ }
}
return count;
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index fd838cea3934..415d81e6ce70 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -27,40 +27,92 @@
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/slab.h>
-#include <linux/percpu-rwsem.h>
#include <linux/torture.h>
#include <linux/reboot.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
-torture_param(int, nwriters_stress, -1,
- "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1,
- "Number of read-locking stress-test threads");
+torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies).");
+torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable).");
+torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (s), 0=disable");
-torture_param(int, shuffle_interval, 3,
- "Number of jiffies between shuffles, 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, rt_boost, 2,
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
+torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
+/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
+#define MAX_NESTED_LOCKS 8
-static char *torture_type = "spin_lock";
+static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
+static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs.
+static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs.
+
+// Parse a cpumask kernel parameter. If there are more users later on,
+// this might need to got to a more central location.
+static int param_set_cpumask(const char *val, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+ int ret;
+ char *s;
+
+ if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) {
+ s = "Out of memory";
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ ret = cpulist_parse(val, *cm_bind);
+ if (!ret)
+ return ret;
+ s = "Bad CPU range";
+out_err:
+ pr_warn("%s: %s, all CPUs set\n", kp->name, s);
+ cpumask_setall(*cm_bind);
+ return ret;
+}
+
+// Output a cpumask kernel parameter.
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+
+ return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind));
+}
+
+static bool cpumask_nonempty(cpumask_var_t mask)
+{
+ return cpumask_available(mask) && !cpumask_empty(mask);
+}
+
+static const struct kernel_param_ops lt_bind_ops = {
+ .set = param_set_cpumask,
+ .get = param_get_cpumask,
+};
+
+module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0644);
+module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0644);
+
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+
static struct task_struct *stats_task;
static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
static bool lock_is_write_held;
-static bool lock_is_read_held;
+static atomic_t lock_is_read_held;
static unsigned long last_lock_release;
struct lock_stress_stats {
@@ -68,6 +120,12 @@ struct lock_stress_stats {
long n_lock_acquired;
};
+struct call_rcu_chain {
+ struct rcu_head crc_rh;
+ bool crc_stop;
+};
+struct call_rcu_chain *call_rcu_chain_list;
+
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -77,13 +135,15 @@ static void lock_torture_cleanup(void);
struct lock_torture_ops {
void (*init)(void);
void (*exit)(void);
- int (*writelock)(void);
+ int (*nested_lock)(int tid, u32 lockset);
+ int (*writelock)(int tid);
void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
- void (*writeunlock)(void);
- int (*readlock)(void);
+ void (*writeunlock)(int tid);
+ void (*nested_unlock)(int tid, u32 lockset);
+ int (*readlock)(int tid);
void (*read_delay)(struct torture_random_state *trsp);
- void (*readunlock)(void);
+ void (*readunlock)(int tid);
unsigned long flags; /* for irq spinlocks */
const char *name;
@@ -106,37 +166,69 @@ static struct lock_torture_cxt cxt = { 0, 0, false, false,
* Definitions for lock torture testing.
*/
-static int torture_lock_busted_write_lock(void)
+static int torture_lock_busted_write_lock(int tid __maybe_unused)
{
return 0; /* BUGGY, do not use in real life!!! */
}
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_lock_busted_write_unlock(void)
+static void torture_lock_busted_write_unlock(int tid __maybe_unused)
{
/* BUGGY, do not use in real life!!! */
}
-static void torture_boost_dummy(struct torture_random_state *trsp)
+static void __torture_rt_boost(struct torture_random_state *trsp)
{
- /* Only rtmutexes care about priority */
+ const unsigned int factor = rt_boost_factor;
+
+ if (!rt_task(current)) {
+ /*
+ * Boost priority once every rt_boost_factor operations. When
+ * the task tries to take the lock, the rtmutex it will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ sched_set_fifo(current);
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another 10 * rt_boost_factor
+ * operations, then restored back to its original prio, and so
+ * forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ sched_set_normal(current, 0);
+ } else /* common case, do nothing */
+ return;
+ }
+}
+
+static void torture_rt_boost(struct torture_random_state *trsp)
+{
+ if (rt_boost != 2)
+ return;
+
+ __torture_rt_boost(trsp);
}
static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -146,7 +238,8 @@ static struct lock_torture_ops lock_busted_ops = {
static DEFINE_SPINLOCK(torture_spinlock);
-static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
+static int torture_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_spinlock)
{
spin_lock(&torture_spinlock);
return 0;
@@ -155,22 +248,24 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
+ unsigned long j;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) {
+ j = jiffies;
+ mdelay(long_hold);
+ pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
+ }
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
+static void torture_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_spinlock)
{
spin_unlock(&torture_spinlock);
}
@@ -178,7 +273,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -186,7 +281,7 @@ static struct lock_torture_ops spin_lock_ops = {
.name = "spin_lock"
};
-static int torture_spin_lock_write_lock_irq(void)
+static int torture_spin_lock_write_lock_irq(int tid __maybe_unused)
__acquires(torture_spinlock)
{
unsigned long flags;
@@ -196,7 +291,7 @@ __acquires(torture_spinlock)
return 0;
}
-static void torture_lock_spin_write_unlock_irq(void)
+static void torture_lock_spin_write_unlock_irq(int tid __maybe_unused)
__releases(torture_spinlock)
{
spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
@@ -205,7 +300,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL,
.read_delay = NULL,
@@ -213,9 +308,63 @@ static struct lock_torture_ops spin_lock_irq_ops = {
.name = "spin_lock_irq"
};
+static DEFINE_RAW_SPINLOCK(torture_raw_spinlock);
+
+static int torture_raw_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ raw_spin_lock(&torture_raw_spinlock);
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock(&torture_raw_spinlock);
+}
+
+static struct lock_torture_ops raw_spin_lock_ops = {
+ .writelock = torture_raw_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock"
+};
+
+static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&torture_raw_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_spin_lock_irq_ops = {
+ .writelock = torture_raw_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock_irq"
+};
+
static DEFINE_RWLOCK(torture_rwlock);
-static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
+static int torture_rwlock_write_lock(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
write_lock(&torture_rwlock);
return 0;
@@ -224,24 +373,24 @@ static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
static void torture_rwlock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
-static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
+static void torture_rwlock_write_unlock(int tid __maybe_unused)
+__releases(torture_rwlock)
{
write_unlock(&torture_rwlock);
}
-static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
+static int torture_rwlock_read_lock(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
read_lock(&torture_rwlock);
return 0;
@@ -250,19 +399,18 @@ static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
static void torture_rwlock_read_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 10;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
-static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
+static void torture_rwlock_read_unlock(int tid __maybe_unused)
+__releases(torture_rwlock)
{
read_unlock(&torture_rwlock);
}
@@ -270,7 +418,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay,
@@ -278,7 +426,8 @@ static struct lock_torture_ops rw_lock_ops = {
.name = "rw_lock"
};
-static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
+static int torture_rwlock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
unsigned long flags;
@@ -287,13 +436,14 @@ static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
return 0;
}
-static void torture_rwlock_write_unlock_irq(void)
+static void torture_rwlock_write_unlock_irq(int tid __maybe_unused)
__releases(torture_rwlock)
{
write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
}
-static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
+static int torture_rwlock_read_lock_irq(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
unsigned long flags;
@@ -302,7 +452,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
return 0;
}
-static void torture_rwlock_read_unlock_irq(void)
+static void torture_rwlock_read_unlock_irq(int tid __maybe_unused)
__releases(torture_rwlock)
{
read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
@@ -311,7 +461,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay,
@@ -320,8 +470,31 @@ static struct lock_torture_ops rw_lock_irq_ops = {
};
static DEFINE_MUTEX(torture_mutex);
+static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS];
-static int torture_mutex_lock(void) __acquires(torture_mutex)
+static void torture_mutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __mutex_init(&torture_nested_mutexes[i], __func__,
+ &nested_mutex_keys[i]);
+}
+
+static int torture_mutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ mutex_lock(&torture_nested_mutexes[i]);
+ return 0;
+}
+
+static int torture_mutex_lock(int tid __maybe_unused)
+__acquires(torture_mutex)
{
mutex_lock(&torture_mutex);
return 0;
@@ -329,28 +502,37 @@ static int torture_mutex_lock(void) __acquires(torture_mutex)
static void torture_mutex_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 5);
- else
- mdelay(longdelay_ms / 5);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 5);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_mutex_unlock(void) __releases(torture_mutex)
+static void torture_mutex_unlock(int tid __maybe_unused)
+__releases(torture_mutex)
{
mutex_unlock(&torture_mutex);
}
+static void torture_mutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ mutex_unlock(&torture_nested_mutexes[i]);
+}
+
static struct lock_torture_ops mutex_lock_ops = {
+ .init = torture_mutex_init,
+ .nested_lock = torture_mutex_nested_lock,
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
+ .nested_unlock = torture_mutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -358,12 +540,34 @@ static struct lock_torture_ops mutex_lock_ops = {
};
#include <linux/ww_mutex.h>
+/*
+ * The torture ww_mutexes should belong to the same lock class as
+ * torture_ww_class to avoid lockdep problem. The ww_mutex_init()
+ * function is called for initialization to ensure that.
+ */
static DEFINE_WD_CLASS(torture_ww_class);
-static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class);
-static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class);
-static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class);
+static struct ww_mutex torture_ww_mutex_0, torture_ww_mutex_1, torture_ww_mutex_2;
+static struct ww_acquire_ctx *ww_acquire_ctxs;
+
+static void torture_ww_mutex_init(void)
+{
+ ww_mutex_init(&torture_ww_mutex_0, &torture_ww_class);
+ ww_mutex_init(&torture_ww_mutex_1, &torture_ww_class);
+ ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class);
+
+ ww_acquire_ctxs = kmalloc_array(cxt.nrealwriters_stress,
+ sizeof(*ww_acquire_ctxs),
+ GFP_KERNEL);
+ if (!ww_acquire_ctxs)
+ VERBOSE_TOROUT_STRING("ww_acquire_ctx: Out of memory");
+}
+
+static void torture_ww_mutex_exit(void)
+{
+ kfree(ww_acquire_ctxs);
+}
-static int torture_ww_mutex_lock(void)
+static int torture_ww_mutex_lock(int tid)
__acquires(torture_ww_mutex_0)
__acquires(torture_ww_mutex_1)
__acquires(torture_ww_mutex_2)
@@ -373,7 +577,7 @@ __acquires(torture_ww_mutex_2)
struct list_head link;
struct ww_mutex *lock;
} locks[3], *ll, *ln;
- struct ww_acquire_ctx ctx;
+ struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid];
locks[0].lock = &torture_ww_mutex_0;
list_add(&locks[0].link, &list);
@@ -384,12 +588,12 @@ __acquires(torture_ww_mutex_2)
locks[2].lock = &torture_ww_mutex_2;
list_add(&locks[2].link, &list);
- ww_acquire_init(&ctx, &torture_ww_class);
+ ww_acquire_init(ctx, &torture_ww_class);
list_for_each_entry(ll, &list, link) {
int err;
- err = ww_mutex_lock(ll->lock, &ctx);
+ err = ww_mutex_lock(ll->lock, ctx);
if (!err)
continue;
@@ -400,28 +604,32 @@ __acquires(torture_ww_mutex_2)
if (err != -EDEADLK)
return err;
- ww_mutex_lock_slow(ll->lock, &ctx);
+ ww_mutex_lock_slow(ll->lock, ctx);
list_move(&ll->link, &list);
}
- ww_acquire_fini(&ctx);
return 0;
}
-static void torture_ww_mutex_unlock(void)
+static void torture_ww_mutex_unlock(int tid)
__releases(torture_ww_mutex_0)
__releases(torture_ww_mutex_1)
__releases(torture_ww_mutex_2)
{
+ struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid];
+
ww_mutex_unlock(&torture_ww_mutex_0);
ww_mutex_unlock(&torture_ww_mutex_1);
ww_mutex_unlock(&torture_ww_mutex_2);
+ ww_acquire_fini(ctx);
}
static struct lock_torture_ops ww_mutex_lock_ops = {
+ .init = torture_ww_mutex_init,
+ .exit = torture_ww_mutex_exit,
.writelock = torture_ww_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_ww_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -431,73 +639,85 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
+static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS];
-static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
+static void torture_rtmutex_init(void)
{
- rt_mutex_lock(&torture_rtmutex);
- return 0;
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __rt_mutex_init(&torture_nested_rtmutexes[i], __func__,
+ &nested_rtmutex_keys[i]);
}
-static void torture_rtmutex_boost(struct torture_random_state *trsp)
+static int torture_rtmutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
{
- const unsigned int factor = 50000; /* yes, quite arbitrary */
+ int i;
- if (!rt_task(current)) {
- /*
- * Boost priority once every ~50k operations. When the
- * task tries to take the lock, the rtmutex it will account
- * for the new priority, and do any corresponding pi-dance.
- */
- if (trsp && !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor))) {
- sched_set_fifo(current);
- } else /* common case, do nothing */
- return;
- } else {
- /*
- * The task will remain boosted for another ~500k operations,
- * then restored back to its original prio, and so forth.
- *
- * When @trsp is nil, we want to force-reset the task for
- * stopping the kthread.
- */
- if (!trsp || !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor * 2))) {
- sched_set_normal(current, 0);
- } else /* common case, do nothing */
- return;
- }
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ rt_mutex_lock(&torture_nested_rtmutexes[i]);
+ return 0;
+}
+
+static int torture_rtmutex_lock(int tid __maybe_unused)
+__acquires(torture_rtmutex)
+{
+ rt_mutex_lock(&torture_rtmutex);
+ return 0;
}
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
/*
* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
+static void torture_rtmutex_unlock(int tid __maybe_unused)
+__releases(torture_rtmutex)
{
rt_mutex_unlock(&torture_rtmutex);
}
+static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
+{
+ if (!rt_boost)
+ return;
+
+ __torture_rt_boost(trsp);
+}
+
+static void torture_rtmutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ rt_mutex_unlock(&torture_nested_rtmutexes[i]);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
+ .init = torture_rtmutex_init,
+ .nested_lock = torture_rtmutex_nested_lock,
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
- .task_boost = torture_rtmutex_boost,
+ .task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
+ .nested_unlock = torture_rtmutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -506,7 +726,8 @@ static struct lock_torture_ops rtmutex_lock_ops = {
#endif
static DECLARE_RWSEM(torture_rwsem);
-static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
+static int torture_rwsem_down_write(int tid __maybe_unused)
+__acquires(torture_rwsem)
{
down_write(&torture_rwsem);
return 0;
@@ -514,24 +735,21 @@ static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
static void torture_rwsem_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 10);
- else
- mdelay(longdelay_ms / 10);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 10);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rwsem_up_write(void) __releases(torture_rwsem)
+static void torture_rwsem_up_write(int tid __maybe_unused)
+__releases(torture_rwsem)
{
up_write(&torture_rwsem);
}
-static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
+static int torture_rwsem_down_read(int tid __maybe_unused)
+__acquires(torture_rwsem)
{
down_read(&torture_rwsem);
return 0;
@@ -539,19 +757,17 @@ static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
static void torture_rwsem_read_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 2);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold * 2);
else
- mdelay(longdelay_ms / 2);
+ mdelay(long_hold / 2);
if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rwsem_up_read(void) __releases(torture_rwsem)
+static void torture_rwsem_up_read(int tid __maybe_unused)
+__releases(torture_rwsem)
{
up_read(&torture_rwsem);
}
@@ -559,7 +775,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -580,24 +796,28 @@ static void torture_percpu_rwsem_exit(void)
percpu_free_rwsem(&pcpu_rwsem);
}
-static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
+static int torture_percpu_rwsem_down_write(int tid __maybe_unused)
+__acquires(pcpu_rwsem)
{
percpu_down_write(&pcpu_rwsem);
return 0;
}
-static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem)
+static void torture_percpu_rwsem_up_write(int tid __maybe_unused)
+__releases(pcpu_rwsem)
{
percpu_up_write(&pcpu_rwsem);
}
-static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem)
+static int torture_percpu_rwsem_down_read(int tid __maybe_unused)
+__acquires(pcpu_rwsem)
{
percpu_down_read(&pcpu_rwsem);
return 0;
}
-static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
+static void torture_percpu_rwsem_up_read(int tid __maybe_unused)
+__releases(pcpu_rwsem)
{
percpu_up_read(&pcpu_rwsem);
}
@@ -607,7 +827,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
.exit = torture_percpu_rwsem_exit,
.writelock = torture_percpu_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_percpu_rwsem_up_write,
.readlock = torture_percpu_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -621,29 +841,63 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
*/
static int lock_torture_writer(void *arg)
{
+ unsigned long j;
+ unsigned long j1;
+ u32 lockset_mask;
struct lock_stress_stats *lwsp = arg;
DEFINE_TORTURE_RANDOM(rand);
+ bool skip_main_lock;
+ int tid = lwsp - cxt.lwsa;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, MAX_NICE);
+ if (!rt_task(current))
+ set_user_nice(current, MAX_NICE);
do {
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
+ lockset_mask = torture_random(&rand);
+ /*
+ * When using nested_locks, we want to occasionally
+ * skip the main lock so we can avoid always serializing
+ * the lock chains on that central lock. By skipping the
+ * main lock occasionally, we can create different
+ * contention patterns (allowing for multiple disjoint
+ * blocked trees)
+ */
+ skip_main_lock = (nested_locks &&
+ !(torture_random(&rand) % 100));
+
cxt.cur_ops->task_boost(&rand);
- cxt.cur_ops->writelock();
- if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_lock_fail++;
- lock_is_write_held = true;
- if (WARN_ON_ONCE(lock_is_read_held))
- lwsp->n_lock_fail++; /* rare, but... */
+ if (cxt.cur_ops->nested_lock)
+ cxt.cur_ops->nested_lock(tid, lockset_mask);
+
+ if (!skip_main_lock) {
+ if (acq_writer_lim > 0)
+ j = jiffies;
+ cxt.cur_ops->writelock(tid);
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = true;
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
+ lwsp->n_lock_fail++; /* rare, but... */
+ if (acq_writer_lim > 0) {
+ j1 = jiffies;
+ WARN_ONCE(time_after(j1, j + acq_writer_lim),
+ "%s: Lock acquisition took %lu jiffies.\n",
+ __func__, j1 - j);
+ }
+ lwsp->n_lock_acquired++;
- lwsp->n_lock_acquired++;
- cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = false;
- WRITE_ONCE(last_lock_release, jiffies);
- cxt.cur_ops->writeunlock();
+ cxt.cur_ops->write_delay(&rand);
+
+ lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
+ cxt.cur_ops->writeunlock(tid);
+ }
+ if (cxt.cur_ops->nested_unlock)
+ cxt.cur_ops->nested_unlock(tid, lockset_mask);
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
@@ -660,6 +914,7 @@ static int lock_torture_writer(void *arg)
static int lock_torture_reader(void *arg)
{
struct lock_stress_stats *lrsp = arg;
+ int tid = lrsp - cxt.lrsa;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_reader task started");
@@ -669,15 +924,15 @@ static int lock_torture_reader(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->readlock();
- lock_is_read_held = true;
+ cxt.cur_ops->readlock(tid);
+ atomic_inc(&lock_is_read_held);
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = false;
- cxt.cur_ops->readunlock();
+ atomic_dec(&lock_is_read_held);
+ cxt.cur_ops->readunlock(tid);
stutter_wait("lock_torture_reader");
} while (!torture_must_stop());
@@ -691,20 +946,22 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
+ long cur;
bool fail = false;
int i, n_stress;
- long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
+ long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
for (i = 0; i < n_stress; i++) {
- if (statp[i].n_lock_fail)
+ if (data_race(statp[i].n_lock_fail))
fail = true;
- sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_acquired)
- max = statp[i].n_lock_acquired;
- if (min > statp[i].n_lock_acquired)
- min = statp[i].n_lock_acquired;
+ cur = data_race(statp[i].n_lock_acquired);
+ sum += cur;
+ if (max < cur)
+ max = cur;
+ if (min > cur)
+ min = cur;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
@@ -776,16 +1033,69 @@ static int lock_torture_stats(void *arg)
return 0;
}
+
static inline void
lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
+ static cpumask_t cpumask_all;
+ cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all;
+ cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all;
+
+ cpumask_setall(&cpumask_all);
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
- verbose, shuffle_interval, stutter, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp),
+ call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
+ cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
+ rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
+ verbose, writer_fifo);
+}
+
+// If requested, maintain call_rcu() chains to keep a grace period always
+// in flight. These increase the probability of getting an RCU CPU stall
+// warning and associated diagnostics when a locking primitive stalls.
+
+static void call_rcu_chain_cb(struct rcu_head *rhp)
+{
+ struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh);
+
+ if (!smp_load_acquire(&crcp->crc_stop)) {
+ (void)start_poll_synchronize_rcu(); // Start one grace period...
+ call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another.
+ }
+}
+
+// Start the requested number of call_rcu() chains.
+static int call_rcu_chain_init(void)
+{
+ int i;
+
+ if (call_rcu_chains <= 0)
+ return 0;
+ call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL);
+ if (!call_rcu_chain_list)
+ return -ENOMEM;
+ for (i = 0; i < call_rcu_chains; i++) {
+ call_rcu_chain_list[i].crc_stop = false;
+ call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb);
+ }
+ return 0;
+}
+
+// Stop all of the call_rcu() chains.
+static void call_rcu_chain_cleanup(void)
+{
+ int i;
+
+ if (!call_rcu_chain_list)
+ return;
+ for (i = 0; i < call_rcu_chains; i++)
+ smp_store_release(&call_rcu_chain_list[i].crc_stop, true);
+ rcu_barrier();
+ kfree(call_rcu_chain_list);
+ call_rcu_chain_list = NULL;
}
static void lock_torture_cleanup(void)
@@ -807,8 +1117,7 @@ static void lock_torture_cleanup(void)
if (writer_tasks) {
for (i = 0; i < cxt.nrealwriters_stress; i++)
- torture_stop_kthread(lock_torture_writer,
- writer_tasks[i]);
+ torture_stop_kthread(lock_torture_writer, writer_tasks[i]);
kfree(writer_tasks);
writer_tasks = NULL;
}
@@ -839,6 +1148,8 @@ static void lock_torture_cleanup(void)
kfree(cxt.lrsa);
cxt.lrsa = NULL;
+ call_rcu_chain_cleanup();
+
end:
if (cxt.init_called) {
if (cxt.cur_ops->exit)
@@ -855,6 +1166,7 @@ static int __init lock_torture_init(void)
static struct lock_torture_ops *torture_ops[] = {
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
+ &raw_spin_lock_ops, &raw_spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
@@ -892,16 +1204,16 @@ static int __init lock_torture_init(void)
goto unwind;
}
- if (cxt.cur_ops->init) {
- cxt.cur_ops->init();
- cxt.init_called = true;
- }
-
if (nwriters_stress >= 0)
cxt.nrealwriters_stress = nwriters_stress;
else
cxt.nrealwriters_stress = 2 * num_online_cpus();
+ if (cxt.cur_ops->init) {
+ cxt.cur_ops->init();
+ cxt.init_called = true;
+ }
+
#ifdef CONFIG_DEBUG_MUTEXES
if (str_has_prefix(torture_type, "mutex"))
cxt.debug_lock = true;
@@ -949,7 +1261,6 @@ static int __init lock_torture_init(void)
}
if (nreaders_stress) {
- lock_is_read_held = false;
cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
sizeof(*cxt.lrsa),
GFP_KERNEL);
@@ -968,29 +1279,33 @@ static int __init lock_torture_init(void)
}
}
+ firsterr = call_rcu_chain_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+
lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
/* Prepare torture context. */
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
onoff_interval * HZ, NULL);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shutdown_secs > 0) {
firsterr = torture_shutdown_init(shutdown_secs,
lock_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter > 0) {
firsterr = torture_stutter_init(stutter, stutter);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -999,18 +1314,22 @@ static int __init lock_torture_init(void)
sizeof(writer_tasks[0]),
GFP_KERNEL);
if (writer_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ TOROUT_ERRSTRING("writer_tasks: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
}
+ /* cap nested_locks to MAX_NESTED_LOCKS */
+ if (nested_locks > MAX_NESTED_LOCKS)
+ nested_locks = MAX_NESTED_LOCKS;
+
if (cxt.cur_ops->readlock) {
reader_tasks = kcalloc(cxt.nrealreaders_stress,
sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ TOROUT_ERRSTRING("reader_tasks: Out of memory");
kfree(writer_tasks);
writer_tasks = NULL;
firsterr = -ENOMEM;
@@ -1032,10 +1351,13 @@ static int __init lock_torture_init(void)
goto create_reader;
/* Create writer. */
- firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
- writer_tasks[i]);
- if (firsterr)
+ firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i],
+ writer_tasks[i],
+ writer_fifo ? sched_set_fifo : NULL);
+ if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_writers))
+ torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers);
create_reader:
if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -1043,13 +1365,15 @@ static int __init lock_torture_init(void)
/* Create reader. */
firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
reader_tasks[j]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_readers))
+ torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers);
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 5e10153b4d3c..85251d8771d9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -7,7 +7,7 @@
* The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
* with the desirable properties of being fair, and with each cpu trying
* to acquire the lock spinning on a local variable.
- * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * It avoids expensive cache bounces that common test-and-set spin-lock
* implementations incur.
*/
#ifndef __LINUX_MCS_SPINLOCK_H
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index a7276aaf2abc..bc8abb8549d2 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -1,6 +1,4 @@
/*
- * kernel/mutex-debug.c
- *
* Debugging code for mutexes
*
* Started by Ingo Molnar:
@@ -22,7 +20,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mutex-debug.h"
+#include "mutex.h"
/*
* Must be called with lock->wait_lock held.
@@ -32,6 +30,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
+ waiter->ww_ctx = MUTEX_POISON_WW_CTX;
}
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
@@ -57,7 +56,7 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
task->blocked_on = waiter;
}
-void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
@@ -65,7 +64,7 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
task->blocked_on = NULL;
- list_del_init(&waiter->list);
+ INIT_LIST_HEAD(&waiter->list);
waiter->task = NULL;
}
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
deleted file mode 100644
index 1edd3f45a4ec..000000000000
--- a/kernel/locking/mutex-debug.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Mutexes: blocking mutual exclusion locks
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal declarations,
- * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
- * More details are in kernel/mutex-debug.c.
- */
-
-/*
- * This must be called with lock->wait_lock held.
- */
-extern void debug_mutex_lock_common(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_wake_waiter(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
-extern void debug_mutex_add_waiter(struct mutex *lock,
- struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 5352ce50a97e..cbae8c0b89ab 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -30,17 +30,23 @@
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+
+#ifndef CONFIG_PREEMPT_RT
+#include "mutex.h"
+
#ifdef CONFIG_DEBUG_MUTEXES
-# include "mutex-debug.h"
+# define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond)
#else
-# include "mutex.h"
+# define MUTEX_WARN_ON(cond)
#endif
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
atomic_long_set(&lock->owner, 0);
- spin_lock_init(&lock->wait_lock);
+ raw_spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
@@ -86,70 +92,64 @@ bool mutex_is_locked(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_is_locked);
-__must_check enum mutex_trylock_recursive_enum
-mutex_trylock_recursive(struct mutex *lock)
-{
- if (unlikely(__mutex_owner(lock) == current))
- return MUTEX_TRYLOCK_RECURSIVE;
-
- return mutex_trylock(lock);
-}
-EXPORT_SYMBOL(mutex_trylock_recursive);
-
static inline unsigned long __owner_flags(unsigned long owner)
{
return owner & MUTEX_FLAGS;
}
/*
- * Trylock variant that retuns the owning task on failure.
+ * Returns: __mutex_owner(lock) on failure or NULL on success.
*/
-static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
+static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff)
{
unsigned long owner, curr = (unsigned long)current;
owner = atomic_long_read(&lock->owner);
for (;;) { /* must loop, can race against a flag */
- unsigned long old, flags = __owner_flags(owner);
+ unsigned long flags = __owner_flags(owner);
unsigned long task = owner & ~MUTEX_FLAGS;
if (task) {
- if (likely(task != curr))
- break;
-
- if (likely(!(flags & MUTEX_FLAG_PICKUP)))
+ if (flags & MUTEX_FLAG_PICKUP) {
+ if (task != curr)
+ break;
+ flags &= ~MUTEX_FLAG_PICKUP;
+ } else if (handoff) {
+ if (flags & MUTEX_FLAG_HANDOFF)
+ break;
+ flags |= MUTEX_FLAG_HANDOFF;
+ } else {
break;
-
- flags &= ~MUTEX_FLAG_PICKUP;
+ }
} else {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP));
+ task = curr;
}
- /*
- * We set the HANDOFF bit, we must make sure it doesn't live
- * past the point where we acquire it. This would be possible
- * if we (accidentally) set the bit on an unlocked mutex.
- */
- flags &= ~MUTEX_FLAG_HANDOFF;
-
- old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
- if (old == owner)
- return NULL;
-
- owner = old;
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) {
+ if (task == curr)
+ return NULL;
+ break;
+ }
}
return __owner_task(owner);
}
/*
+ * Trylock or set HANDOFF
+ */
+static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff)
+{
+ return !__mutex_trylock_common(lock, handoff);
+}
+
+/*
* Actual trylock that will work on any unlocked state.
*/
static inline bool __mutex_trylock(struct mutex *lock)
{
- return !__mutex_trylock_or_owner(lock);
+ return !__mutex_trylock_common(lock, false);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -178,10 +178,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
{
unsigned long curr = (unsigned long)current;
- if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
- return true;
-
- return false;
+ return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
}
#endif
@@ -204,7 +201,7 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait
* Add @waiter to a given location in the lock wait_list and set the
* FLAG_WAITERS flag if it's the first waiter.
*/
-static void __sched
+static void
__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct list_head *list)
{
@@ -215,9 +212,19 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
}
+static void
+__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ list_del(&waiter->list);
+ if (likely(list_empty(&lock->wait_list)))
+ __mutex_clear_flag(lock, MUTEX_FLAGS);
+
+ debug_mutex_remove_waiter(lock, waiter, current);
+}
+
/*
* Give up ownership to a specific task, when @task = NULL, this is equivalent
- * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves
+ * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves
* WAITERS. Provides RELEASE semantics like a regular unlock, the
* __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
*/
@@ -226,23 +233,18 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
unsigned long owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old, new;
+ unsigned long new;
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
new = (owner & MUTEX_FLAG_WAITERS);
new |= (unsigned long)task;
if (task)
new |= MUTEX_FLAG_PICKUP;
- old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
- if (old == owner)
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new))
break;
-
- owner = old;
}
}
@@ -286,218 +288,18 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
-/*
- * Wait-Die:
- * The newer transactions are killed when:
- * It (the new transaction) makes a request for a lock being held
- * by an older transaction.
- *
- * Wound-Wait:
- * The newer transactions are wounded when:
- * An older transaction makes a request for a lock being held by
- * the newer transaction.
- */
-
-/*
- * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
- * it.
- */
-static __always_inline void
-ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
- /*
- * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
- * but released with a normal mutex_unlock in this call.
- *
- * This should never happen, always use ww_mutex_unlock.
- */
- DEBUG_LOCKS_WARN_ON(ww->ctx);
-
- /*
- * Not quite done after calling ww_acquire_done() ?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
- if (ww_ctx->contending_lock) {
- /*
- * After -EDEADLK you tried to
- * acquire a different ww_mutex? Bad!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
- /*
- * You called ww_mutex_lock after receiving -EDEADLK,
- * but 'forgot' to unlock everything else first?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
- ww_ctx->contending_lock = NULL;
- }
-
- /*
- * Naughty, using a different class will lead to undefined behavior!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
- ww_ctx->acquired++;
- ww->ctx = ww_ctx;
-}
+#include "ww_mutex.h"
-/*
- * Determine if context @a is 'after' context @b. IOW, @a is a younger
- * transaction than @b and depending on algorithm either needs to wait for
- * @b or die.
- */
-static inline bool __sched
-__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
-{
-
- return (signed long)(a->stamp - b->stamp) > 0;
-}
-
-/*
- * Wait-Die; wake a younger waiter context (when locks held) such that it can
- * die.
- *
- * Among waiters with context, only the first one can have other locks acquired
- * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
- * __ww_mutex_check_kill() wake any but the earliest context.
- */
-static bool __sched
-__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ww_ctx)
-{
- if (!ww_ctx->is_wait_die)
- return false;
-
- if (waiter->ww_ctx->acquired > 0 &&
- __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) {
- debug_mutex_wake_waiter(lock, waiter);
- wake_up_process(waiter->task);
- }
-
- return true;
-}
-
-/*
- * Wound-Wait; wound a younger @hold_ctx if it holds the lock.
- *
- * Wound the lock holder if there are waiters with older transactions than
- * the lock holders. Even if multiple waiters may wound the lock holder,
- * it's sufficient that only one does.
- */
-static bool __ww_mutex_wound(struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx,
- struct ww_acquire_ctx *hold_ctx)
-{
- struct task_struct *owner = __mutex_owner(lock);
-
- lockdep_assert_held(&lock->wait_lock);
-
- /*
- * Possible through __ww_mutex_add_waiter() when we race with
- * ww_mutex_set_context_fastpath(). In that case we'll get here again
- * through __ww_mutex_check_waiters().
- */
- if (!hold_ctx)
- return false;
-
- /*
- * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
- * it cannot go away because we'll have FLAG_WAITERS set and hold
- * wait_lock.
- */
- if (!owner)
- return false;
-
- if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) {
- hold_ctx->wounded = 1;
-
- /*
- * wake_up_process() paired with set_current_state()
- * inserts sufficient barriers to make sure @owner either sees
- * it's wounded in __ww_mutex_check_kill() or has a
- * wakeup pending to re-read the wounded state.
- */
- if (owner != current)
- wake_up_process(owner);
-
- return true;
- }
-
- return false;
-}
-
-/*
- * We just acquired @lock under @ww_ctx, if there are later contexts waiting
- * behind us on the wait-list, check if they need to die, or wound us.
- *
- * See __ww_mutex_add_waiter() for the list-order construction; basically the
- * list is ordered by stamp, smallest (oldest) first.
- *
- * This relies on never mixing wait-die/wound-wait on the same wait-list;
- * which is currently ensured by that being a ww_class property.
- *
- * The current task must not be on the wait list.
- */
-static void __sched
-__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
-
- lockdep_assert_held(&lock->wait_lock);
-
- list_for_each_entry(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- if (__ww_mutex_die(lock, cur, ww_ctx) ||
- __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
- break;
- }
-}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
- * and wake up any waiters so they can recheck.
+ * Trylock variant that returns the owning task on failure.
*/
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
{
- ww_mutex_lock_acquired(lock, ctx);
-
- /*
- * The lock->ctx update should be visible on all cores before
- * the WAITERS check is done, otherwise contended waiters might be
- * missed. The contended waiters will either see ww_ctx == NULL
- * and keep spinning, or it will acquire wait_lock, add itself
- * to waiter list and sleep.
- */
- smp_mb(); /* See comments above and below. */
-
- /*
- * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
- * MB MB
- * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
- *
- * The memory barrier above pairs with the memory barrier in
- * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
- * and/or !empty list.
- */
- if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS)))
- return;
-
- /*
- * Uh oh, we raced in fastpath, check if any of the waiters need to
- * die or wound us.
- */
- spin_lock(&lock->base.wait_lock);
- __ww_mutex_check_waiters(&lock->base, ctx);
- spin_unlock(&lock->base.wait_lock);
+ return __mutex_trylock_common(lock, false);
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
static inline
bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
struct mutex_waiter *waiter)
@@ -552,21 +354,23 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
{
bool ret = true;
- rcu_read_lock();
+ lockdep_assert_preemption_disabled();
+
while (__mutex_owner(lock) == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * checking lock->owner still matches owner. And we already
+ * disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the
+ * task_strcut structure won't go away during the spinning
+ * period
*/
barrier();
/*
* Use vcpu_is_preempted to detect lock holder preemption issue.
*/
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
+ if (!owner_on_cpu(owner) || need_resched()) {
ret = false;
break;
}
@@ -578,7 +382,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
cpu_relax();
}
- rcu_read_unlock();
return ret;
}
@@ -591,19 +394,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
struct task_struct *owner;
int retval = 1;
+ lockdep_assert_preemption_disabled();
+
if (need_resched())
return 0;
- rcu_read_lock();
- owner = __mutex_owner(lock);
-
/*
- * As lock holder preemption issue, we both skip spinning if task is not
- * on cpu or its cpu is preempted
+ * We already disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the task_strcut
+ * structure won't go away during the spinning period.
*/
+ owner = __mutex_owner(lock);
if (owner)
- retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
- rcu_read_unlock();
+ retval = owner_on_cpu(owner);
/*
* If lock->owner is not set, the mutex has been released. Return true
@@ -636,7 +439,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
*/
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
if (!waiter) {
/*
@@ -712,7 +515,7 @@ fail:
#else
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
return false;
}
@@ -729,6 +532,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
* This function must not be used in interrupt context. Unlocking
* of a not locked mutex is not allowed.
*
+ * The caller must ensure that the mutex stays alive until this function has
+ * returned - mutex_unlock() can NOT directly be used to release an object such
+ * that another concurrent task can free it.
+ * Mutexes are different from spinlocks & refcounts in this aspect.
+ *
* This function is similar to (but not equivalent to) up().
*/
void __sched mutex_unlock(struct mutex *lock)
@@ -754,192 +562,32 @@ EXPORT_SYMBOL(mutex_unlock);
*/
void __sched ww_mutex_unlock(struct ww_mutex *lock)
{
- /*
- * The unlocking fastpath is the 0->1 transition from 'locked'
- * into 'unlocked' state:
- */
- if (lock->ctx) {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
-#endif
- if (lock->ctx->acquired > 0)
- lock->ctx->acquired--;
- lock->ctx = NULL;
- }
-
+ __ww_mutex_unlock(lock);
mutex_unlock(&lock->base);
}
EXPORT_SYMBOL(ww_mutex_unlock);
-
-static __always_inline int __sched
-__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
-{
- if (ww_ctx->acquired > 0) {
-#ifdef CONFIG_DEBUG_MUTEXES
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
- ww_ctx->contending_lock = ww;
-#endif
- return -EDEADLK;
- }
-
- return 0;
-}
-
-
-/*
- * Check the wound condition for the current lock acquire.
- *
- * Wound-Wait: If we're wounded, kill ourself.
- *
- * Wait-Die: If we're trying to acquire a lock already held by an older
- * context, kill ourselves.
- *
- * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
- * look at waiters before us in the wait-list.
- */
-static inline int __sched
-__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ctx)
-{
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
- struct mutex_waiter *cur;
-
- if (ctx->acquired == 0)
- return 0;
-
- if (!ctx->is_wait_die) {
- if (ctx->wounded)
- return __ww_mutex_kill(lock, ctx);
-
- return 0;
- }
-
- if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
- return __ww_mutex_kill(lock, ctx);
-
- /*
- * If there is a waiter in front of us that has a context, then its
- * stamp is earlier than ours and we must kill ourself.
- */
- cur = waiter;
- list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- return __ww_mutex_kill(lock, ctx);
- }
-
- return 0;
-}
-
-/*
- * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
- * first. Such that older contexts are preferred to acquire the lock over
- * younger contexts.
- *
- * Waiters without context are interspersed in FIFO order.
- *
- * Furthermore, for Wait-Die kill ourself immediately when possible (there are
- * older contexts already waiting) to avoid unnecessary waiting and for
- * Wound-Wait ensure we wound the owning context when it is younger.
- */
-static inline int __sched
-__ww_mutex_add_waiter(struct mutex_waiter *waiter,
- struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx)
-{
- struct mutex_waiter *cur;
- struct list_head *pos;
- bool is_wait_die;
-
- if (!ww_ctx) {
- __mutex_add_waiter(lock, waiter, &lock->wait_list);
- return 0;
- }
-
- is_wait_die = ww_ctx->is_wait_die;
-
- /*
- * Add the waiter before the first waiter with a higher stamp.
- * Waiters without a context are skipped to avoid starving
- * them. Wait-Die waiters may die here. Wound-Wait waiters
- * never die here, but they are sorted in stamp order and
- * may wound the lock holder.
- */
- pos = &lock->wait_list;
- list_for_each_entry_reverse(cur, &lock->wait_list, list) {
- if (!cur->ww_ctx)
- continue;
-
- if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
- /*
- * Wait-Die: if we find an older context waiting, there
- * is no point in queueing behind it, as we'd have to
- * die the moment it would acquire the lock.
- */
- if (is_wait_die) {
- int ret = __ww_mutex_kill(lock, ww_ctx);
-
- if (ret)
- return ret;
- }
-
- break;
- }
-
- pos = &cur->list;
-
- /* Wait-Die: ensure younger waiters die. */
- __ww_mutex_die(lock, cur, ww_ctx);
- }
-
- __mutex_add_waiter(lock, waiter, pos);
-
- /*
- * Wound-Wait: if we're blocking on a mutex owned by a younger context,
- * wound that such that we might proceed.
- */
- if (!is_wait_die) {
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-
- /*
- * See ww_mutex_set_context_fastpath(). Orders setting
- * MUTEX_FLAG_WAITERS vs the ww->ctx load,
- * such that either we or the fastpath will wound @ww->ctx.
- */
- smp_mb();
- __ww_mutex_wound(lock, ww_ctx, ww->ctx);
- }
-
- return 0;
-}
-
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
static __always_inline int __sched
-__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
struct mutex_waiter waiter;
- bool first = false;
struct ww_mutex *ww;
int ret;
+ if (!use_ww_ctx)
+ ww_ctx = NULL;
+
might_sleep();
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif
+ MUTEX_WARN_ON(lock->magic != lock);
ww = container_of(lock, struct ww_mutex, base);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
@@ -950,44 +598,48 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
if (ww_ctx->acquired == 0)
ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
}
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
if (__mutex_trylock(lock) ||
- mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
+ mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
+ trace_contention_end(lock, 0);
preempt_enable();
return 0;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
/*
* After waiting to acquire the wait_lock, try again.
*/
if (__mutex_trylock(lock)) {
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
__ww_mutex_check_waiters(lock, ww_ctx);
goto skip_wait;
}
debug_mutex_lock_common(lock, &waiter);
+ waiter.task = current;
+ if (use_ww_ctx)
+ waiter.ww_ctx = ww_ctx;
lock_contended(&lock->dep_map, ip);
if (!use_ww_ctx) {
/* add waiting tasks to the end of the waitqueue (FIFO): */
__mutex_add_waiter(lock, &waiter, &lock->wait_list);
-
-
-#ifdef CONFIG_DEBUG_MUTEXES
- waiter.ww_ctx = MUTEX_POISON_WW_CTX;
-#endif
} else {
/*
* Add in stamp order, waking up waiters that must kill
@@ -996,14 +648,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
if (ret)
goto err_early_kill;
-
- waiter.ww_ctx = ww_ctx;
}
- waiter.task = current;
-
set_current_state(state);
+ trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
+ bool first;
+
/*
* Once we hold wait_lock, we're serialized against
* mutex_unlock() handing the lock off to us, do a trylock
@@ -1023,24 +674,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto err;
}
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
}
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
schedule_preempt_disabled();
- /*
- * ww_mutex needs to always recheck its position since its waiter
- * list is not FIFO ordered.
- */
- if ((use_ww_ctx && ww_ctx) || !first) {
- first = __mutex_waiter_is_first(lock, &waiter);
- if (first)
- __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
- }
+ first = __mutex_waiter_is_first(lock, &waiter);
set_current_state(state);
/*
@@ -1048,17 +691,23 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* state back to RUNNING and fall through the next schedule(),
* or we must see its unlock and acquire.
*/
- if (__mutex_trylock(lock) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+ if (__mutex_trylock_or_handoff(lock, first))
break;
- spin_lock(&lock->wait_lock);
+ if (first) {
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ break;
+ trace_contention_begin(lock, LCB_F_MUTEX);
+ }
+
+ raw_spin_lock(&lock->wait_lock);
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
acquired:
__set_current_state(TASK_RUNNING);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
/*
* Wound-Wait; we stole the lock (!first_waiter), check the
* waiters as anyone might want to wound us.
@@ -1068,28 +717,28 @@ acquired:
__ww_mutex_check_waiters(lock, ww_ctx);
}
- mutex_remove_waiter(lock, &waiter, current);
- if (likely(list_empty(&lock->wait_list)))
- __mutex_clear_flag(lock, MUTEX_FLAGS);
+ __mutex_remove_waiter(lock, &waiter);
debug_mutex_free_waiter(&waiter);
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
+ trace_contention_end(lock, 0);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
preempt_enable();
return 0;
err:
__set_current_state(TASK_RUNNING);
- mutex_remove_waiter(lock, &waiter, current);
+ __mutex_remove_waiter(lock, &waiter);
err_early_kill:
- spin_unlock(&lock->wait_lock);
+ trace_contention_end(lock, ret);
+ raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
preempt_enable();
@@ -1097,19 +746,56 @@ err_early_kill:
}
static int __sched
-__mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip)
{
return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
}
static int __sched
-__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
- struct lockdep_map *nest_lock, unsigned long ip,
- struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
+ unsigned long ip, struct ww_acquire_ctx *ww_ctx)
+{
+ return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
+}
+
+/**
+ * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context
+ * @ww: mutex to lock
+ * @ww_ctx: optional w/w acquire context
+ *
+ * Trylocks a mutex with the optional acquire context; no deadlock detection is
+ * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
+ *
+ * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is
+ * specified, -EALREADY handling may happen in calls to ww_mutex_trylock.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
{
- return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true);
+ if (!ww_ctx)
+ return mutex_trylock(&ww->base);
+
+ MUTEX_WARN_ON(ww->base.magic != &ww->base);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__mutex_trylock(&ww->base)) {
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
}
+EXPORT_SYMBOL(ww_mutex_trylock);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
@@ -1188,8 +874,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -1204,8 +889,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE,
- 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
- ctx);
+ 0, _RET_IP_, ctx);
if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
@@ -1236,29 +920,21 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
*/
owner = atomic_long_read(&lock->owner);
for (;;) {
- unsigned long old;
-
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
- DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
-#endif
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
if (owner & MUTEX_FLAG_HANDOFF)
break;
- old = atomic_long_cmpxchg_release(&lock->owner, owner,
- __owner_flags(owner));
- if (old == owner) {
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) {
if (owner & MUTEX_FLAG_WAITERS)
break;
return;
}
-
- owner = old;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
@@ -1275,7 +951,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
wake_up_q(&wake_q);
}
@@ -1379,7 +1055,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock)
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
@@ -1387,7 +1063,7 @@ static noinline int __sched
__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
{
- return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL,
+ return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0,
_RET_IP_, ctx);
}
@@ -1411,9 +1087,7 @@ int __sched mutex_trylock(struct mutex *lock)
{
bool locked;
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif
+ MUTEX_WARN_ON(lock->magic != lock);
locked = __mutex_trylock(lock);
if (locked)
@@ -1454,7 +1128,11 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
}
EXPORT_SYMBOL(ww_mutex_lock_interruptible);
-#endif
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+#endif /* !CONFIG_PREEMPT_RT */
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 1c2287d3fa71..0b2a79c4013b 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -5,21 +5,41 @@
* started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal prototypes, for the
- * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
-#define mutex_remove_waiter(lock, waiter, task) \
- __list_del((waiter)->list.prev, (waiter)->list.next)
-
-#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
-#define debug_mutex_free_waiter(waiter) do { } while (0)
-#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
-#define debug_mutex_unlock(lock) do { } while (0)
-#define debug_mutex_init(lock, name, key) do { } while (0)
+/*
+ * This is the control structure for tasks blocked on mutex, which resides
+ * on the blocked task's kernel stack:
+ */
+struct mutex_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ struct ww_acquire_ctx *ww_ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+ void *magic;
+#endif
+};
-static inline void
-debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
-{
-}
+#ifdef CONFIG_DEBUG_MUTEXES
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
+#else /* CONFIG_DEBUG_MUTEXES */
+# define debug_mutex_lock_common(lock, waiter) do { } while (0)
+# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
+# define debug_mutex_free_waiter(waiter) do { } while (0)
+# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_unlock(lock) do { } while (0)
+# define debug_mutex_init(lock, name, key) do { } while (0)
+#endif /* !CONFIG_DEBUG_MUTEXES */
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 1de006ed3aa8..75a6f6133866 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -11,6 +11,13 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # + 1 value */
+};
+
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
/*
@@ -37,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
*/
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
- struct optimistic_spin_node *prev)
+ int old_cpu)
{
- struct optimistic_spin_node *next = NULL;
int curr = encode_cpu(smp_processor_id());
- int old;
-
- /*
- * If there is a prev node in queue, then the 'old' value will be
- * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
- * we're currently last in queue, then the queue will then become empty.
- */
- old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
for (;;) {
if (atomic_read(&lock->tail) == curr &&
- atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+ atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
* unlock()/unqueue().
*/
- break;
+ return NULL;
}
/*
@@ -76,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock,
* wait for a new @node->next from its Step-C.
*/
if (node->next) {
+ struct optimistic_spin_node *next;
+
next = xchg(&node->next, NULL);
if (next)
- break;
+ return next;
}
cpu_relax();
}
-
- return next;
}
bool osq_lock(struct optimistic_spin_queue *lock)
@@ -135,7 +138,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
*/
/*
- * Wait to acquire the lock or cancelation. Note that need_resched()
+ * Wait to acquire the lock or cancellation. Note that need_resched()
* will come with an IPI, which will wake smp_cond_load_relaxed() if it
* is implemented with a monitor-wait. vcpu_is_preempted() relies on
* polling, be careful.
@@ -164,7 +167,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
/*
* We can only fail the cmpxchg() racing against an unlock(),
- * in which case we should observe @node->locked becomming
+ * in which case we should observe @node->locked becoming
* true.
*/
if (smp_load_acquire(&node->locked))
@@ -186,7 +189,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* back to @prev.
*/
- next = osq_wait_next(lock, node, prev);
+ next = osq_wait_next(lock, node, prev->cpu);
if (!next)
return false;
@@ -226,7 +229,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
return;
}
- next = osq_wait_next(lock, node, NULL);
+ next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
if (next)
WRITE_ONCE(next->locked, 1);
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 70a32a576f3f..6083883c4fe0 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,7 +7,9 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/errno.h>
+#include <trace/events/lock.h>
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *key)
@@ -162,7 +164,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
__set_current_state(TASK_RUNNING);
}
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
if (__percpu_down_read_trylock(sem))
return true;
@@ -170,9 +172,11 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
if (try)
return false;
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
preempt_enable();
percpu_rwsem_wait(sem, /* .reader = */ true);
preempt_disable();
+ trace_contention_end(sem, 0);
return true;
}
@@ -188,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read);
__sum; \
})
+bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
+{
+ return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
+}
+EXPORT_SYMBOL_GPL(percpu_is_read_locked);
+
/*
* Return true if the modular sum of the sem->read_count per-CPU variable is
* zero. If this sum is zero, then it is stable due to the fact that if any
@@ -211,8 +221,10 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *sem)
+void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ bool contended = false;
+
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
@@ -223,8 +235,11 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
* Try set sem->block; this provides writer-writer exclusion.
* Having sem->block set makes new readers block.
*/
- if (!__percpu_down_write_trylock(sem))
+ if (!__percpu_down_write_trylock(sem)) {
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
percpu_rwsem_wait(sem, /* .reader = */ false);
+ contended = true;
+ }
/* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
@@ -236,6 +251,8 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
/* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+ if (contended)
+ trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fe9ca92faa2a..d2ef312a8611 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,13 +12,13 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
-#include <asm/qrwlock.h>
+#include <trace/events/lock.h>
/**
- * queued_read_lock_slowpath - acquire read lock of a queue rwlock
- * @lock: Pointer to queue rwlock structure
+ * queued_read_lock_slowpath - acquire read lock of a queued rwlock
+ * @lock: Pointer to queued rwlock structure
*/
-void queued_read_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -35,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
}
atomic_sub(_QR_BIAS, &lock->cnts);
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
+
/*
* Put the reader into the wait queue
*/
@@ -52,32 +54,39 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
* Signal the next one in queue to become queue head
*/
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queued_write_lock_slowpath - acquire write lock of a queue rwlock
- * @lock : Pointer to queue rwlock structure
+ * queued_write_lock_slowpath - acquire write lock of a queued rwlock
+ * @lock : Pointer to queued rwlock structure
*/
-void queued_write_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
{
+ int cnts;
+
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
/* Try to acquire the lock directly if no reader is present */
- if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
+ if (!(cnts = atomic_read(&lock->cnts)) &&
+ atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED))
goto unlock;
/* Set the waiting flag to notify readers that a writer is pending */
- atomic_add(_QW_WAITING, &lock->cnts);
+ atomic_or(_QW_WAITING, &lock->cnts);
/* When no more readers or writers, set the locked flag */
do {
- atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
- } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) != _QW_WAITING);
+ cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
+ } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..ebe6b8ec7cb3 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
+#include <trace/events/lock.h>
/*
* Include queued spinlock statistics code
@@ -312,7 +313,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
* contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
* queue : ^--' :
*/
-void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
struct mcs_spinlock *prev, *next, *node;
u32 old, tail;
@@ -370,7 +371,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
/*
* We're pending, wait for the owner to go away.
*
- * 0,1,1 -> 0,1,0
+ * 0,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
@@ -379,7 +380,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* barriers.
*/
if (val & _Q_LOCKED_MASK)
- atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->locked, !VAL);
/*
* take ownership and clear the pending bit.
@@ -401,6 +402,8 @@ pv_queue:
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
+ trace_contention_begin(lock, LCB_F_SPIN);
+
/*
* 4 nodes are allocated based on the assumption that there will
* not be nested NMIs taking spinlocks. That may not be true in
@@ -554,6 +557,8 @@ locked:
pv_kick_node(lock, next);
release:
+ trace_contention_end(lock, 0);
+
/*
* release the node
*/
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e84d21aa0722..ae2b12f68b90 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -294,8 +294,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
+ bool __maybe_unused wait_early;
int loop;
- bool wait_early;
for (;;) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
@@ -486,10 +486,20 @@ gotlock:
}
/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
+ */
+#include <asm/qspinlock_paravirt.h>
+
+/*
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
*/
-__visible void
+__visible __lockfunc void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct pv_node *node;
@@ -533,18 +543,8 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
pv_kick(node->cpu);
}
-/*
- * Include the architecture specific callee-save thunk of the
- * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
- * function close to each other sharing consecutive instruction cachelines.
- * Alternatively, architecture specific version of __pv_queued_spin_unlock()
- * can be defined.
- */
-#include <asm/qspinlock_paravirt.h>
-
#ifndef __pv_queued_spin_unlock
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
u8 locked;
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
deleted file mode 100644
index 36e69100e8e0..000000000000
--- a/kernel/locking/rtmutex-debug.c
+++ /dev/null
@@ -1,182 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This code is based on the rt.c implementation in the preempt-rt tree.
- * Portions of said code are
- *
- * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
- * Copyright (C) 2006 Esben Nielsen
- * Copyright (C) 2006 Kihon Technologies Inc.,
- * Steven Rostedt <rostedt@goodmis.org>
- *
- * See rt.c in preempt-rt for proper credits and further information
- */
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/debug.h>
-#include <linux/delay.h>
-#include <linux/export.h>
-#include <linux/spinlock.h>
-#include <linux/kallsyms.h>
-#include <linux/syscalls.h>
-#include <linux/interrupt.h>
-#include <linux/rbtree.h>
-#include <linux/fs.h>
-#include <linux/debug_locks.h>
-
-#include "rtmutex_common.h"
-
-static void printk_task(struct task_struct *p)
-{
- if (p)
- printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
- else
- printk("<none>");
-}
-
-static void printk_lock(struct rt_mutex *lock, int print_owner)
-{
- if (lock->name)
- printk(" [%p] {%s}\n",
- lock, lock->name);
- else
- printk(" [%p] {%s:%d}\n",
- lock, lock->file, lock->line);
-
- if (print_owner && rt_mutex_owner(lock)) {
- printk(".. ->owner: %p\n", lock->owner);
- printk(".. held by: ");
- printk_task(rt_mutex_owner(lock));
- printk("\n");
- }
-}
-
-void rt_mutex_debug_task_free(struct task_struct *task)
-{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
- DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
-}
-
-/*
- * We fill out the fields in the waiter to store the information about
- * the deadlock. We print when we return. act_waiter can be NULL in
- * case of a remove waiter operation.
- */
-void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *act_waiter,
- struct rt_mutex *lock)
-{
- struct task_struct *task;
-
- if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
- return;
-
- task = rt_mutex_owner(act_waiter->lock);
- if (task && task != current) {
- act_waiter->deadlock_task_pid = get_pid(task_pid(task));
- act_waiter->deadlock_lock = lock;
- }
-}
-
-void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
-{
- struct task_struct *task;
-
- if (!waiter->deadlock_lock || !debug_locks)
- return;
-
- rcu_read_lock();
- task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
- if (!task) {
- rcu_read_unlock();
- return;
- }
-
- if (!debug_locks_off()) {
- rcu_read_unlock();
- return;
- }
-
- pr_warn("\n");
- pr_warn("============================================\n");
- pr_warn("WARNING: circular locking deadlock detected!\n");
- pr_warn("%s\n", print_tainted());
- pr_warn("--------------------------------------------\n");
- printk("%s/%d is deadlocking current task %s/%d\n\n",
- task->comm, task_pid_nr(task),
- current->comm, task_pid_nr(current));
-
- printk("\n1) %s/%d is trying to acquire this lock:\n",
- current->comm, task_pid_nr(current));
- printk_lock(waiter->lock, 1);
-
- printk("\n2) %s/%d is blocked on this lock:\n",
- task->comm, task_pid_nr(task));
- printk_lock(waiter->deadlock_lock, 1);
-
- debug_show_held_locks(current);
- debug_show_held_locks(task);
-
- printk("\n%s/%d's [blocked] stackdump:\n\n",
- task->comm, task_pid_nr(task));
- show_stack(task, NULL, KERN_DEFAULT);
- printk("\n%s/%d's [current] stackdump:\n\n",
- current->comm, task_pid_nr(current));
- dump_stack();
- debug_show_all_locks();
- rcu_read_unlock();
-
- printk("[ turning off deadlock detection."
- "Please report this trace. ]\n\n");
-}
-
-void debug_rt_mutex_lock(struct rt_mutex *lock)
-{
-}
-
-void debug_rt_mutex_unlock(struct rt_mutex *lock)
-{
- DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
-}
-
-void
-debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
-{
-}
-
-void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
-{
- DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
-}
-
-void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
-{
- memset(waiter, 0x11, sizeof(*waiter));
- waiter->deadlock_task_pid = NULL;
-}
-
-void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
-{
- put_pid(waiter->deadlock_task_pid);
- memset(waiter, 0x22, sizeof(*waiter));
-}
-
-void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key)
-{
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lock->name = name;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
deleted file mode 100644
index fc549713bba3..000000000000
--- a/kernel/locking/rtmutex-debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This file contains macros used solely by rtmutex.c. Debug version.
- */
-
-extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
-extern void debug_rt_mutex_lock(struct rt_mutex *lock);
-extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
- struct task_struct *powner);
-extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *waiter,
- struct rt_mutex *lock);
-extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
-# define debug_rt_mutex_reset_waiter(w) \
- do { (w)->deadlock_lock = NULL; } while (0)
-
-static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
- enum rtmutex_chainwalk walk)
-{
- return (waiter != NULL);
-}
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- debug_rt_mutex_print_deadlock(w);
-}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2f8cd616d3b2..88d08eeb8bc0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,20 +8,60 @@
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
*
* See Documentation/locking/rt-mutex-design.rst for details.
*/
-#include <linux/spinlock.h>
-#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
-#include <linux/sched/deadline.h>
#include <linux/sched/wake_q.h>
-#include <linux/sched/debug.h>
-#include <linux/timer.h>
+#include <linux/ww_mutex.h>
+
+#include <trace/events/lock.h>
#include "rtmutex_common.h"
+#ifndef WW_RT
+# define build_ww_mutex() (false)
+# define ww_container_of(rtm) NULL
+
+static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline void ww_mutex_lock_acquired(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+#else
+# define build_ww_mutex() (true)
+# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base)
+# include "ww_mutex.h"
+#endif
+
/*
* lock->owner state tracking:
*
@@ -49,24 +89,41 @@
* set this bit before looking at the lock.
*/
-static void
-rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
+static __always_inline struct task_struct *
+rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
{
unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- WRITE_ONCE(lock->owner, (struct task_struct *)val);
+ return (struct task_struct *)val;
}
-static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+{
+ /*
+ * lock->wait_lock is held but explicit acquire semantics are needed
+ * for a new lock owner so WRITE_ONCE is insufficient.
+ */
+ xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
+}
+
+static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+{
+ /* lock->wait_lock is held so the unlock provides release semantics. */
+ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
+}
+
+static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}
-static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void
+fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -132,8 +189,21 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* still set.
*/
owner = READ_ONCE(*p);
- if (owner & RT_MUTEX_HAS_WAITERS)
- WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ if (owner & RT_MUTEX_HAS_WAITERS) {
+ /*
+ * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
+ * why xchg_acquire() is used for updating owner for
+ * locking and WRITE_ONCE() for unlocking.
+ *
+ * WRITE_ONCE() would work for the acquire case too, but
+ * in case that the lock acquisition failed it might
+ * force other lockers into the slow path unnecessarily.
+ */
+ if (acquire_lock)
+ xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
+ else
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ }
}
/*
@@ -141,22 +211,46 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* set up.
*/
#ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
-# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_acquire(&lock->owner, &old, new);
+}
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ return rt_mutex_cmpxchg_acquire(lock, NULL, current);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_release(&lock->owner, &old, new);
+}
/*
* Callers must hold the ->wait_lock -- which is the whole purpose as we force
* all future threads that attempt to [Rmw] the lock to the slowpath. As such
* relaxed semantics suffice.
*/
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
+ unsigned long *p = (unsigned long *) &lock->owner;
+ unsigned long owner, new;
+ owner = READ_ONCE(*p);
do {
- owner = *p;
- } while (cmpxchg_relaxed(p, owner,
- owner | RT_MUTEX_HAS_WAITERS) != owner);
+ new = owner | RT_MUTEX_HAS_WAITERS;
+ } while (!try_cmpxchg_relaxed(p, &owner, new));
+
+ /*
+ * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
+ * operations in the event of contention. Ensure the successful
+ * cmpxchg is visible.
+ */
+ smp_mb__after_atomic();
}
/*
@@ -165,8 +259,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
- unsigned long flags)
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
@@ -201,10 +295,36 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#else
-# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
-# define rt_mutex_cmpxchg_release(l,c,n) (0)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+
+}
+
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ /*
+ * With debug enabled rt_mutex_cmpxchg trylock() will always fail.
+ *
+ * Avoid unconditionally taking the slow path by using
+ * rt_mutex_slow_trylock() which is covered by the debug code and can
+ * acquire a non-contended rtmutex.
+ */
+ return rt_mutex_slowtrylock(lock);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+}
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
@@ -213,8 +333,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
- unsigned long flags)
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
@@ -223,15 +343,53 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#endif
+static __always_inline int __waiter_prio(struct task_struct *task)
+{
+ int prio = task->prio;
+
+ if (!rt_prio(prio))
+ return DEFAULT_PRIO;
+
+ return prio;
+}
+
+/*
+ * Update the waiter->tree copy of the sort keys.
+ */
+static __always_inline void
+waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
+
+ waiter->tree.prio = __waiter_prio(task);
+ waiter->tree.deadline = task->dl.deadline;
+}
+
+/*
+ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
+ */
+static __always_inline void
+waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert_held(&task->pi_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
+
+ waiter->pi_tree.prio = waiter->tree.prio;
+ waiter->pi_tree.deadline = waiter->tree.deadline;
+}
+
/*
- * Only use with rt_mutex_waiter_{less,equal}()
+ * Only use with rt_waiter_node_{less,equal}()
*/
+#define task_to_waiter_node(p) \
+ &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
-static inline int
-rt_mutex_waiter_less(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio < right->prio)
return 1;
@@ -248,9 +406,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
return 0;
}
-static inline int
-rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio != right->prio)
return 0;
@@ -267,76 +424,110 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
return 1;
}
-static void
-rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
+ struct rt_mutex_waiter *top_waiter)
{
- struct rb_node **link = &lock->waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
+ if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
+ return true;
+
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+ /*
+ * Note that RT tasks are excluded from same priority (lateral)
+ * steals to prevent the introduction of an unbounded latency.
+ */
+ if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
+ return false;
+
+ return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
+#else
+ return false;
+#endif
+}
+
+#define __node_2_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, tree.entry)
+
+static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct rt_mutex_waiter *aw = __node_2_waiter(a);
+ struct rt_mutex_waiter *bw = __node_2_waiter(b);
+
+ if (rt_waiter_node_less(&aw->tree, &bw->tree))
+ return 1;
+
+ if (!build_ww_mutex())
+ return 0;
+
+ if (rt_waiter_node_less(&bw->tree, &aw->tree))
+ return 0;
+
+ /* NOTE: relies on waiter->ww_ctx being set before insertion */
+ if (aw->ww_ctx) {
+ if (!bw->ww_ctx)
+ return 1;
+
+ return (signed long)(aw->ww_ctx->stamp -
+ bw->ww_ctx->stamp) < 0;
}
- rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
+ return 0;
}
-static void
-rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+static __always_inline void
+rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->tree_entry))
+ lockdep_assert_held(&lock->wait_lock);
+
+ rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
+}
+
+static __always_inline void
+rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (RB_EMPTY_NODE(&waiter->tree.entry))
return;
- rb_erase_cached(&waiter->tree_entry, &lock->waiters);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ rb_erase_cached(&waiter->tree.entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree.entry);
+}
+
+#define __node_2_rt_node(node) \
+ rb_entry((node), struct rt_waiter_node, entry)
+
+static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
}
-static void
+static __always_inline void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
+ lockdep_assert_held(&task->pi_lock);
- rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
+ rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
}
-static void
+static __always_inline void
rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+ lockdep_assert_held(&task->pi_lock);
+
+ if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
return;
- rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
}
-static void rt_mutex_adjust_prio(struct task_struct *p)
+static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
+ struct task_struct *p)
{
struct task_struct *pi_task = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+ lockdep_assert(rt_mutex_owner(lock) == p);
lockdep_assert_held(&p->pi_lock);
if (task_has_pi_waiters(p))
@@ -345,6 +536,42 @@ static void rt_mutex_adjust_prio(struct task_struct *p)
rt_mutex_setprio(p, pi_task);
}
+/* RT mutex specific wake_q wrappers */
+static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
+ struct task_struct *task,
+ unsigned int wake_state)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(wqh->rtlock_task);
+ get_task_struct(task);
+ wqh->rtlock_task = task;
+ } else {
+ wake_q_add(&wqh->head, task);
+ }
+}
+
+static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
+ struct rt_mutex_waiter *w)
+{
+ rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
+}
+
+static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+ wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+ put_task_struct(wqh->rtlock_task);
+ wqh->rtlock_task = NULL;
+ }
+
+ if (!wake_q_empty(&wqh->head))
+ wake_up_q(&wqh->head);
+
+ /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
+ preempt_enable();
+}
+
/*
* Deadlock detection is conditional:
*
@@ -358,25 +585,16 @@ static void rt_mutex_adjust_prio(struct task_struct *p)
* deadlock detection is disabled independent of the detect argument
* and the config settings.
*/
-static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
- enum rtmutex_chainwalk chwalk)
+static __always_inline bool
+rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
{
- /*
- * This is just a wrapper function for the following call,
- * because debug_rt_mutex_detect_deadlock() smells like a magic
- * debug feature and I wanted to keep the cond function in the
- * main source file along with the comments instead of having
- * two of the same in the headers.
- */
- return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ return waiter != NULL;
+ return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
-/*
- * Max number of times we'll walk the boosting chain:
- */
-int max_lock_depth = 1024;
-
-static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p)
{
return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
}
@@ -405,9 +623,14 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* Chain walk basics and protection scope
*
* [R] refcount on task
- * [P] task->pi_lock held
+ * [Pn] task->pi_lock held
* [L] rtmutex->wait_lock held
*
+ * Normal locking order:
+ *
+ * rtmutex->wait_lock
+ * task->pi_lock
+ *
* Step Description Protected by
* function arguments:
* @task [R]
@@ -422,39 +645,44 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* again:
* loop_sanity_check();
* retry:
- * [1] lock(task->pi_lock); [R] acquire [P]
- * [2] waiter = task->pi_blocked_on; [P]
- * [3] check_exit_conditions_1(); [P]
- * [4] lock = waiter->lock; [P]
- * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
- * unlock(task->pi_lock); release [P]
+ * [1] lock(task->pi_lock); [R] acquire [P1]
+ * [2] waiter = task->pi_blocked_on; [P1]
+ * [3] check_exit_conditions_1(); [P1]
+ * [4] lock = waiter->lock; [P1]
+ * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L]
+ * unlock(task->pi_lock); release [P1]
* goto retry;
* }
- * [6] check_exit_conditions_2(); [P] + [L]
- * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
- * [8] unlock(task->pi_lock); release [P]
+ * [6] check_exit_conditions_2(); [P1] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P1] + [L]
+ * [8] unlock(task->pi_lock); release [P1]
* put_task_struct(task); release [R]
* [9] check_exit_conditions_3(); [L]
* [10] task = owner(lock); [L]
* get_task_struct(task); [L] acquire [R]
- * lock(task->pi_lock); [L] acquire [P]
- * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
- * [12] check_exit_conditions_4(); [P] + [L]
- * [13] unlock(task->pi_lock); release [P]
+ * lock(task->pi_lock); [L] acquire [P2]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
+ * [12] check_exit_conditions_4(); [P2] + [L]
+ * [13] unlock(task->pi_lock); release [P2]
* unlock(lock->wait_lock); release [L]
* goto again;
+ *
+ * Where P1 is the blocking task and P2 is the lock owner; going up one step
+ * the owner becomes the next blocked task etc..
+ *
+*
*/
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- enum rtmutex_chainwalk chwalk,
- struct rt_mutex *orig_lock,
- struct rt_mutex *next_lock,
- struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task)
+static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_base *orig_lock,
+ struct rt_mutex_base *next_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
{
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
struct rt_mutex_waiter *prerequeue_top_waiter;
int ret = 0, depth = 0;
- struct rt_mutex *lock;
+ struct rt_mutex_base *lock;
bool detect_deadlock;
bool requeue = true;
@@ -537,6 +765,31 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
+ * There could be 'spurious' loops in the lock graph due to ww_mutex,
+ * consider:
+ *
+ * P1: A, ww_A, ww_B
+ * P2: ww_B, ww_A
+ * P3: A
+ *
+ * P3 should not return -EDEADLK because it gets trapped in the cycle
+ * created by P1 and P2 (which will resolve -- and runs into
+ * max_lock_depth above). Therefore disable detect_deadlock such that
+ * the below termination condition can trigger once all relevant tasks
+ * are boosted.
+ *
+ * Even when we start with ww_mutex we can disable deadlock detection,
+ * since we would supress a ww_mutex induced deadlock at [6] anyway.
+ * Supressing it here however is not sufficient since we might still
+ * hit [6] due to adjustment driven iteration.
+ *
+ * NOTE: if someone were to create a deadlock between 2 ww_classes we'd
+ * utterly fail to report it; lockdep should.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
+ detect_deadlock = false;
+
+ /*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
* mode!
@@ -565,7 +818,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -573,13 +826,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
- * [4] Get the next lock
+ * [4] Get the next lock; per holding task->pi_lock we can't unblock
+ * and guarantee @lock's existence.
*/
lock = waiter->lock;
/*
* [5] We need to trylock here as we are holding task->pi_lock,
* which is the reverse lock order versus the other rtmutex
* operations.
+ *
+ * Per the above, holding task->pi_lock guarantees lock exists, so
+ * inverting this lock order is infeasible from a life-time
+ * perspective.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
raw_spin_unlock_irq(&task->pi_lock);
@@ -597,9 +855,21 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* walk, we detected a deadlock.
*/
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
- raw_spin_unlock(&lock->wait_lock);
ret = -EDEADLK;
+
+ /*
+ * When the deadlock is due to ww_mutex; also see above. Don't
+ * report the deadlock and instead let the ww_mutex wound/die
+ * logic pick which of the contending threads gets -EDEADLK.
+ *
+ * NOTE: assumes the cycle only contains a single ww_class; any
+ * other configuration and we fail to report; also, see
+ * lockdep.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
+ ret = 0;
+
+ raw_spin_unlock(&lock->wait_lock);
goto out_unlock_pi;
}
@@ -671,18 +941,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* or
*
* DL CBS enforcement advancing the effective deadline.
- *
- * Even though pi_waiters also uses these fields, and that tree is only
- * updated in [11], we can do this here, since we hold [L], which
- * serializes all pi_waiters access and rb_erase() does not care about
- * the values of the node being removed.
*/
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
rt_mutex_enqueue(lock, waiter);
- /* [8] Release the task */
+ /*
+ * [8] Release the (blocking) task in preparation for
+ * taking the owner task in [10].
+ *
+ * Since we hold lock->waiter_lock, task cannot unblock, even if we
+ * release task->pi_lock.
+ */
raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
@@ -699,13 +969,19 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_process(rt_mutex_top_waiter(lock)->task);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
- /* [10] Grab the next task, i.e. the owner of @lock */
+ /*
+ * [10] Grab the next task, i.e. the owner of @lock
+ *
+ * Per holding lock->wait_lock and checking for !owner above, there
+ * must be an owner and it cannot go away.
+ */
task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
@@ -718,13 +994,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* and adjust the priority of the owner.
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else if (prerequeue_top_waiter == waiter) {
/*
* The waiter was the top waiter on the lock, but is
- * no longer the top prority waiter. Replace waiter in
+ * no longer the top priority waiter. Replace waiter in
* the owner tasks pi waiters tree with the new top
* (highest priority) waiter and adjust the priority
* of the owner.
@@ -734,8 +1011,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -802,8 +1080,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* @waiter: The waiter that is queued to the lock's wait tree if the
* callsite called task_blocked_on_lock(), otherwise NULL
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
- struct rt_mutex_waiter *waiter)
+static int __sched
+try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
{
lockdep_assert_held(&lock->wait_lock);
@@ -838,19 +1117,21 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* trylock attempt.
*/
if (waiter) {
- /*
- * If waiter is not the highest priority waiter of
- * @lock, give up.
- */
- if (waiter != rt_mutex_top_waiter(lock))
- return 0;
+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
/*
- * We can acquire the lock. Remove the waiter from the
- * lock waiters tree.
+ * If waiter is the highest priority waiter of @lock,
+ * or allowed to steal it, take it over.
*/
- rt_mutex_dequeue(lock, waiter);
-
+ if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters tree.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
+ return 0;
+ }
} else {
/*
* If the lock has waiters already we check whether @task is
@@ -861,13 +1142,9 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- /*
- * If @task->prio is greater than or equal to
- * the top waiter priority (kernel view),
- * @task lost.
- */
- if (!rt_mutex_waiter_less(task_to_waiter(task),
- rt_mutex_top_waiter(lock)))
+ /* Check whether the trylock can steal it. */
+ if (!rt_mutex_steal(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -904,9 +1181,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
raw_spin_unlock(&task->pi_lock);
takeit:
- /* We got the lock. */
- debug_rt_mutex_lock(lock);
-
/*
* This either preserves the RT_MUTEX_HAS_WAITERS bit if there
* are still waiters or clears it.
@@ -923,14 +1197,15 @@ takeit:
*
* This must be called with lock->wait_lock held and interrupts disabled
*/
-static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task,
- enum rtmutex_chainwalk chwalk)
+static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ struct ww_acquire_ctx *ww_ctx,
+ enum rtmutex_chainwalk chwalk)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- struct rt_mutex *next_lock;
+ struct rt_mutex_base *next_lock;
int chain_walk = 0, res;
lockdep_assert_held(&lock->wait_lock);
@@ -943,15 +1218,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* the other will detect the deadlock and return -EDEADLOCK,
* which is wrong, as the other waiter is not in a deadlock
* situation.
+ *
+ * Except for ww_mutex, in that case the chain walk must already deal
+ * with spurious cycles, see the comments at [3] and [6].
*/
- if (owner == task)
+ if (owner == task && !(build_ww_mutex() && ww_ctx))
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
waiter->task = task;
waiter->lock = lock;
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);
+ waiter_clone_prio(waiter, task);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -962,6 +1240,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&task->pi_lock);
+ if (build_ww_mutex() && ww_ctx) {
+ struct rt_mutex *rtm;
+
+ /* Check whether the waiter should back out immediately */
+ rtm = container_of(lock, struct rt_mutex, rtmutex);
+ res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
+ if (res) {
+ raw_spin_lock(&task->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ task->pi_blocked_on = NULL;
+ raw_spin_unlock(&task->pi_lock);
+ return res;
+ }
+ }
+
if (!owner)
return 0;
@@ -970,7 +1263,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1012,11 +1305,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*
* Called with lock->wait_lock held and interrupts disabled.
*/
-static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
- struct rt_mutex *lock)
+static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
+ struct rt_mutex_base *lock)
{
struct rt_mutex_waiter *waiter;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1029,7 +1324,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
- rt_mutex_adjust_prio(current);
+ rt_mutex_adjust_prio(lock, current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1049,244 +1344,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* deboost but before waking our donor task, hence the preempt_disable()
* before unlock.
*
- * Pairs with preempt_enable() in rt_mutex_postunlock();
+ * Pairs with preempt_enable() in rt_mutex_wake_up_q();
*/
preempt_disable();
- wake_q_add(wake_q, waiter->task);
+ rt_mutex_wake_q_add(wqh, waiter);
raw_spin_unlock(&current->pi_lock);
}
-/*
- * Remove a waiter from a lock and give up
- *
- * Must be called with lock->wait_lock held and interrupts disabled. I must
- * have just failed to try_to_take_rt_mutex().
- */
-static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
-{
- bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
- struct task_struct *owner = rt_mutex_owner(lock);
- struct rt_mutex *next_lock;
-
- lockdep_assert_held(&lock->wait_lock);
-
- raw_spin_lock(&current->pi_lock);
- rt_mutex_dequeue(lock, waiter);
- current->pi_blocked_on = NULL;
- raw_spin_unlock(&current->pi_lock);
-
- /*
- * Only update priority if the waiter was the highest priority
- * waiter of the lock and there is an owner to update.
- */
- if (!owner || !is_top_waiter)
- return;
-
- raw_spin_lock(&owner->pi_lock);
-
- rt_mutex_dequeue_pi(owner, waiter);
-
- if (rt_mutex_has_waiters(lock))
- rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
-
- rt_mutex_adjust_prio(owner);
-
- /* Store the lock on which owner is blocked or NULL */
- next_lock = task_blocked_on_lock(owner);
-
- raw_spin_unlock(&owner->pi_lock);
-
- /*
- * Don't walk the chain, if the owner task is not blocked
- * itself.
- */
- if (!next_lock)
- return;
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
- next_lock, NULL, current);
-
- raw_spin_lock_irq(&lock->wait_lock);
-}
-
-/*
- * Recheck the pi chain, in case we got a priority setting
- *
- * Called from sched_setscheduler
- */
-void rt_mutex_adjust_pi(struct task_struct *task)
-{
- struct rt_mutex_waiter *waiter;
- struct rt_mutex *next_lock;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
-
- waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- return;
- }
- next_lock = waiter->lock;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(task);
-
- rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
- next_lock, NULL, task);
-}
-
-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
-{
- debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
- waiter->task = NULL;
-}
-
-/**
- * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
- * @lock: the rt_mutex to take
- * @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
- * @timeout: the pre-initialized and started timer, or NULL for none
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Must be called with lock->wait_lock held and interrupts disabled
- */
-static int __sched
-__rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
-{
- int ret = 0;
-
- for (;;) {
- /* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock, current, waiter))
- break;
-
- /*
- * TASK_INTERRUPTIBLE checks for signals and
- * timeout. Ignored otherwise.
- */
- if (likely(state == TASK_INTERRUPTIBLE)) {
- /* Signal pending? */
- if (signal_pending(current))
- ret = -EINTR;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
- if (ret)
- break;
- }
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
- debug_rt_mutex_print_deadlock(waiter);
-
- schedule();
-
- raw_spin_lock_irq(&lock->wait_lock);
- set_current_state(state);
- }
-
- __set_current_state(TASK_RUNNING);
- return ret;
-}
-
-static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
- struct rt_mutex_waiter *w)
-{
- /*
- * If the result is not -EDEADLOCK or the caller requested
- * deadlock detection, nothing to do here.
- */
- if (res != -EDEADLOCK || detect_deadlock)
- return;
-
- /*
- * Yell lowdly and stop the task right here.
- */
- rt_mutex_print_deadlock(w);
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
- }
-}
-
-/*
- * Slow path lock function:
- */
-static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk)
-{
- struct rt_mutex_waiter waiter;
- unsigned long flags;
- int ret = 0;
-
- rt_mutex_init_waiter(&waiter);
-
- /*
- * Technically we could use raw_spin_[un]lock_irq() here, but this can
- * be called in early boot if the cmpxchg() fast path is disabled
- * (debug, no architecture support). In this case we will acquire the
- * rtmutex with lock->wait_lock held. But we cannot unconditionally
- * enable interrupts in that early boot case. So we need to use the
- * irqsave/restore variants.
- */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
- /* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return 0;
- }
-
- set_current_state(state);
-
- /* Setup the timer, when timeout != NULL */
- if (unlikely(timeout))
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
-
- if (likely(!ret))
- /* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
-
- if (unlikely(ret)) {
- __set_current_state(TASK_RUNNING);
- remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
- }
-
- /*
- * try_to_take_rt_mutex() sets the waiter bit
- * unconditionally. We might have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-
- /* Remove pending timer: */
- if (unlikely(timeout))
- hrtimer_cancel(&timeout->timer);
-
- debug_rt_mutex_free_waiter(&waiter);
-
- return ret;
-}
-
-static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
int ret = try_to_take_rt_mutex(lock, current, NULL);
@@ -1294,7 +1359,7 @@ static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
* try_to_take_rt_mutex() sets the lock waiters bit
* unconditionally. Clean this up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
return ret;
}
@@ -1302,7 +1367,7 @@ static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
/*
* Slow path try-lock function:
*/
-static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
{
unsigned long flags;
int ret;
@@ -1328,14 +1393,20 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
return ret;
}
+static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(lock);
+}
+
/*
* Slow path to release a rt-mutex.
- *
- * Return whether the current task needs to call rt_mutex_postunlock().
*/
-static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
+static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
{
+ DEFINE_RT_WAKE_Q(wqh);
unsigned long flags;
/* irqsave required to support early boot calls */
@@ -1377,7 +1448,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
if (unlock_rt_mutex_safe(lock, flags) == true)
- return false;
+ return;
/* Relock the rtmutex and try again */
raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
@@ -1388,533 +1459,407 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*
* Queue the next waiter for wakeup once we release the wait_lock.
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ mark_wakeup_next_waiter(&wqh, lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return true; /* call rt_mutex_postunlock() */
+ rt_mutex_wake_up_q(&wqh);
}
-/*
- * debug aware fast / slowpath lock,trylock,unlock
- *
- * The atomic acquire/release ops are compiled away, when either the
- * architecture does not support cmpxchg or when debugging is enabled.
- */
-static inline int
-rt_mutex_fastlock(struct rt_mutex *lock, int state,
- int (*slowfn)(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock)
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 0;
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+ rt_mutex_slowunlock(lock);
}
-static inline int
-rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk,
- int (*slowfn)(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+#ifdef CONFIG_SMP
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
- if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 0;
+ bool res = true;
- return slowfn(lock, state, timeout, chwalk);
+ rcu_read_lock();
+ for (;;) {
+ /* If owner changed, trylock again. */
+ if (owner != rt_mutex_owner(lock))
+ break;
+ /*
+ * Ensure that @owner is dereferenced after checking that
+ * the lock owner still matches @owner. If that fails,
+ * @owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+ /*
+ * Stop spinning when:
+ * - the lock owner has been scheduled out
+ * - current is not longer the top waiter
+ * - current is requested to reschedule (redundant
+ * for CONFIG_PREEMPT_RCU=y)
+ * - the VCPU on which owner runs is preempted
+ */
+ if (!owner_on_cpu(owner) || need_resched() ||
+ !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
+ res = false;
+ break;
+ }
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return res;
}
-
-static inline int
-rt_mutex_fasttrylock(struct rt_mutex *lock,
- int (*slowfn)(struct rt_mutex *lock))
+#else
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 1;
-
- return slowfn(lock);
+ return false;
}
+#endif
+#ifdef RT_MUTEX_BUILD_MUTEX
/*
- * Performs the wakeup of the the top-waiter and re-enables preemption.
+ * Functions required for:
+ * - rtmutex, futex on all kernels
+ * - mutex and rwsem substitutions on RT kernels
*/
-void rt_mutex_postunlock(struct wake_q_head *wake_q)
+
+/*
+ * Remove a waiter from a lock and give up
+ *
+ * Must be called with lock->wait_lock held and interrupts disabled. It must
+ * have just failed to try_to_take_rt_mutex().
+ */
+static void __sched remove_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
{
- wake_up_q(wake_q);
+ bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex_base *next_lock;
- /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
- preempt_enable();
-}
+ lockdep_assert_held(&lock->wait_lock);
-static inline void
-rt_mutex_fastunlock(struct rt_mutex *lock,
- bool (*slowfn)(struct rt_mutex *lock,
- struct wake_q_head *wqh))
-{
- DEFINE_WAKE_Q(wake_q);
+ raw_spin_lock(&current->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ current->pi_blocked_on = NULL;
+ raw_spin_unlock(&current->pi_lock);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ /*
+ * Only update priority if the waiter was the highest priority
+ * waiter of the lock and there is an owner to update.
+ */
+ if (!owner || !is_top_waiter)
return;
- if (slowfn(lock, &wake_q))
- rt_mutex_postunlock(&wake_q);
-}
-
-static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
-{
- might_sleep();
+ raw_spin_lock(&owner->pi_lock);
- mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
-}
+ rt_mutex_dequeue_pi(owner, waiter);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-/**
- * rt_mutex_lock_nested - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- * @subclass: the lockdep subclass
- */
-void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
-{
- __rt_mutex_lock(lock, subclass);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
-#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+ rt_mutex_adjust_prio(lock, owner);
-/**
- * rt_mutex_lock - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- */
-void __sched rt_mutex_lock(struct rt_mutex *lock)
-{
- __rt_mutex_lock(lock, 0);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock);
-#endif
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
-/**
- * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
- *
- * @lock: the rt_mutex to be locked
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- */
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
-{
- int ret;
+ raw_spin_unlock(&owner->pi_lock);
- might_sleep();
+ /*
+ * Don't walk the chain, if the owner task is not blocked
+ * itself.
+ */
+ if (!next_lock)
+ return;
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+ raw_spin_unlock_irq(&lock->wait_lock);
-/*
- * Futex variant, must not use fastpath.
- */
-int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return rt_mutex_slowtrylock(lock);
-}
+ rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+ next_lock, NULL, current);
-int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return __rt_mutex_slowtrylock(lock);
+ raw_spin_lock_irq(&lock->wait_lock);
}
/**
- * rt_mutex_timed_lock - lock a rt_mutex interruptible
- * the timeout structure is provided
- * by the caller
- *
- * @lock: the rt_mutex to be locked
- * @timeout: timeout structure or NULL (no timeout)
+ * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+ * @ww_ctx: WW mutex context pointer
+ * @state: the state the task should block in (TASK_INTERRUPTIBLE
+ * or TASK_UNINTERRUPTIBLE)
+ * @timeout: the pre-initialized and started timer, or NULL for none
+ * @waiter: the pre-initialized rt_mutex_waiter
*
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- * -ETIMEDOUT when the timeout expired
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
-int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
+static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter)
{
- int ret;
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct task_struct *owner;
+ int ret = 0;
- might_sleep();
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock, current, waiter))
+ break;
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_MIN_CHAINWALK,
- rt_mutex_slowlock);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
+ if (timeout && !timeout->task) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ ret = -EINTR;
+ break;
+ }
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+ if (build_ww_mutex() && ww_ctx) {
+ ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx);
+ if (ret)
+ break;
+ }
-/**
- * rt_mutex_trylock - try to lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- *
- * This function can only be called in thread context. It's safe to
- * call it from atomic regions, but not from hard interrupt or soft
- * interrupt context.
- *
- * Returns 1 on success and 0 on contention
- */
-int __sched rt_mutex_trylock(struct rt_mutex *lock)
-{
- int ret;
+ if (waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq(&lock->wait_lock);
- if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
- return 0;
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
+ rt_mutex_schedule();
- ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
- if (ret)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(state);
+ }
+ __set_current_state(TASK_RUNNING);
return ret;
}
-EXPORT_SYMBOL_GPL(rt_mutex_trylock);
-/**
- * rt_mutex_unlock - unlock a rt_mutex
- *
- * @lock: the rt_mutex to be unlocked
- */
-void __sched rt_mutex_unlock(struct rt_mutex *lock)
+static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_waiter *w)
{
- mutex_release(&lock->dep_map, _RET_IP_);
- rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+ /*
+ * If the result is not -EDEADLOCK or the caller requested
+ * deadlock detection, nothing to do here.
+ */
+ if (res != -EDEADLOCK || detect_deadlock)
+ return;
+
+ if (build_ww_mutex() && w->ww_ctx)
+ return;
+
+ /*
+ * Yell loudly and stop the task right here.
+ */
+ WARN(1, "rtmutex deadlock detected\n");
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ rt_mutex_schedule();
+ }
}
-EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * Futex variant, that since futex variants do not use the fast-path, can be
- * simple and will not need to retry.
+ * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
+ * @chwalk: Indicator whether full or partial chainwalk is requested
+ * @waiter: Initializer waiter for blocking
*/
-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
+static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter)
{
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct ww_mutex *ww = ww_container_of(rtm);
+ int ret;
+
lockdep_assert_held(&lock->wait_lock);
- debug_rt_mutex_unlock(lock);
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ if (build_ww_mutex() && ww_ctx) {
+ __ww_mutex_check_waiters(rtm, ww_ctx);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ return 0;
+ }
+
+ set_current_state(state);
+
+ trace_contention_begin(lock, LCB_F_RT);
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- return false; /* done */
+ ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
+ if (likely(!ret))
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
+
+ if (likely(!ret)) {
+ /* acquired the lock */
+ if (build_ww_mutex() && ww_ctx) {
+ if (!ww_ctx->is_wait_die)
+ __ww_mutex_check_waiters(rtm, ww_ctx);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ } else {
+ __set_current_state(TASK_RUNNING);
+ remove_waiter(lock, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, waiter);
}
/*
- * We've already deboosted, mark_wakeup_next_waiter() will
- * retain preempt_disabled when we drop the wait_lock, to
- * avoid inversion prior to the wakeup. preempt_disable()
- * therein pairs with rt_mutex_postunlock().
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ fixup_rt_mutex_waiters(lock, true);
+
+ trace_contention_end(lock, ret);
- return true; /* call postunlock() */
+ return ret;
}
-void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
- DEFINE_WAKE_Q(wake_q);
- unsigned long flags;
- bool postunlock;
+ struct rt_mutex_waiter waiter;
+ int ret;
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ rt_mutex_init_waiter(&waiter);
+ waiter.ww_ctx = ww_ctx;
- if (postunlock)
- rt_mutex_postunlock(&wake_q);
-}
+ ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
+ &waiter);
-/**
- * rt_mutex_destroy - mark a mutex unusable
- * @lock: the mutex to be destroyed
- *
- * This function marks the mutex uninitialized, and any subsequent
- * use of the mutex is forbidden. The mutex must not be locked when
- * this function is called.
- */
-void rt_mutex_destroy(struct rt_mutex *lock)
-{
- WARN_ON(rt_mutex_is_locked(lock));
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- lock->magic = NULL;
-#endif
+ debug_rt_mutex_free_waiter(&waiter);
+ return ret;
}
-EXPORT_SYMBOL_GPL(rt_mutex_destroy);
-/**
- * __rt_mutex_init - initialize the rt lock
- *
- * @lock: the rt lock to be initialized
- *
- * Initialize the rt lock to unlocked state.
- *
- * Initializing of a locked rt lock is not allowed
+/*
+ * rt_mutex_slowlock - Locking slowpath invoked when fast path fails
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
*/
-void __rt_mutex_init(struct rt_mutex *lock, const char *name,
- struct lock_class_key *key)
+static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
- lock->owner = NULL;
- raw_spin_lock_init(&lock->wait_lock);
- lock->waiters = RB_ROOT_CACHED;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * Do all pre-schedule work here, before we queue a waiter and invoke
+ * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+ * otherwise recurse back into task_blocks_on_rt_mutex() through
+ * rtlock_slowlock() and will then enqueue a second waiter for this
+ * same task and things get really confusing real fast.
+ */
+ rt_mutex_pre_schedule();
- if (name && key)
- debug_rt_mutex_init(lock, name, key);
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ rt_mutex_post_schedule();
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(__rt_mutex_init);
-/**
- * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
- * proxy owner
- *
- * @lock: the rt_mutex to be locked
- * @proxy_owner:the task to set as owner
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This initializes the rtmutex and
- * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
- * possible at this point because the pi_state which contains the rtmutex
- * is not yet visible to other tasks.
- */
-void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
+static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
+ unsigned int state)
{
- __rt_mutex_init(lock, NULL, NULL);
- debug_rt_mutex_proxy_lock(lock, proxy_owner);
- rt_mutex_set_owner(lock, proxy_owner);
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (likely(rt_mutex_try_acquire(lock)))
+ return 0;
+
+ return rt_mutex_slowlock(lock, NULL, state);
}
+#endif /* RT_MUTEX_BUILD_MUTEX */
-/**
- * rt_mutex_proxy_unlock - release a lock on behalf of owner
- *
- * @lock: the rt_mutex to be locked
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This merrily cleans up the rtmutex
- * (debugging) state. Concurrent operations on this rt_mutex are not
- * possible because it belongs to the pi_state which is about to be freed
- * and it is not longer visible to other tasks.
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+/*
+ * Functions required for spin/rw_lock substitution on RT kernels
*/
-void rt_mutex_proxy_unlock(struct rt_mutex *lock)
-{
- debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
-}
/**
- * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: does _NOT_ remove the @waiter on failure; must either call
- * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
+ * rtlock_slowlock_locked - Slow path lock acquisition for RT locks
+ * @lock: The underlying RT mutex
*/
-int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
+static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
{
- int ret;
+ struct rt_mutex_waiter waiter;
+ struct task_struct *owner;
lockdep_assert_held(&lock->wait_lock);
- if (try_to_take_rt_mutex(lock, task, NULL))
- return 1;
+ if (try_to_take_rt_mutex(lock, current, NULL))
+ return;
- /* We enforce deadlock detection for futexes */
- ret = task_blocks_on_rt_mutex(lock, waiter, task,
- RT_MUTEX_FULL_CHAINWALK);
+ rt_mutex_init_rtlock_waiter(&waiter);
- if (ret && !rt_mutex_owner(lock)) {
- /*
- * Reset the return value. We might have
- * returned with -EDEADLK and the owner
- * released the lock while we were walking the
- * pi chain. Let the waiter sort it out.
- */
- ret = 0;
- }
+ /* Save current state and set state to TASK_RTLOCK_WAIT */
+ current_save_and_set_rtlock_wait_state();
- debug_rt_mutex_print_deadlock(waiter);
+ trace_contention_begin(lock, LCB_F_RT);
- return ret;
-}
+ task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
- * on failure.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
-{
- int ret;
-
- raw_spin_lock_irq(&lock->wait_lock);
- ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
- if (unlikely(ret))
- remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
+ for (;;) {
+ /* Try to acquire the lock again */
+ if (try_to_take_rt_mutex(lock, current, &waiter))
+ break;
- return ret;
-}
+ if (&waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq(&lock->wait_lock);
-/**
- * rt_mutex_next_owner - return the next owner of the lock
- *
- * @lock: the rt lock query
- *
- * Returns the next owner of the lock or NULL
- *
- * Caller has to serialize against other accessors to the lock
- * itself.
- *
- * Special API call for PI-futex support
- */
-struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
-{
- if (!rt_mutex_has_waiters(lock))
- return NULL;
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
+ schedule_rtlock();
- return rt_mutex_top_waiter(lock)->task;
-}
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(TASK_RTLOCK_WAIT);
+ }
-/**
- * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
- * @lock: the rt_mutex we were woken on
- * @to: the timeout, null if none. hrtimer should already have
- * been started.
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Wait for the the lock acquisition started on our behalf by
- * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
- * rt_mutex_cleanup_proxy_lock().
- *
- * Returns:
- * 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT
- *
- * Special API call for PI-futex support
- */
-int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter)
-{
- int ret;
+ /* Restore the task state */
+ current_restore_rtlock_saved_state();
- raw_spin_lock_irq(&lock->wait_lock);
- /* sleep on the mutex */
- set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
/*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally.
+ * We might have to fix that up:
*/
- fixup_rt_mutex_waiters(lock);
- raw_spin_unlock_irq(&lock->wait_lock);
+ fixup_rt_mutex_waiters(lock, true);
+ debug_rt_mutex_free_waiter(&waiter);
- return ret;
+ trace_contention_end(lock, 0);
}
-/**
- * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
- * @lock: the rt_mutex we were woken on
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
- * rt_mutex_wait_proxy_lock().
- *
- * Unless we acquired the lock; we're still enqueued on the wait-list and can
- * in fact still be granted ownership until we're removed. Therefore we can
- * find we are in fact the owner and must disregard the
- * rt_mutex_wait_proxy_lock() failure.
- *
- * Returns:
- * true - did the cleanup, we done.
- * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
- * caller should disregards its return value.
- *
- * Special API call for PI-futex support
- */
-bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
{
- bool cleanup = false;
-
- raw_spin_lock_irq(&lock->wait_lock);
- /*
- * Do an unconditional try-lock, this deals with the lock stealing
- * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
- * sets a NULL owner.
- *
- * We're not interested in the return value, because the subsequent
- * test on rt_mutex_owner() will infer that. If the trylock succeeded,
- * we will own the lock and it will have removed the waiter. If we
- * failed the trylock, we're still not owner and we need to remove
- * ourselves.
- */
- try_to_take_rt_mutex(lock, current, waiter);
- /*
- * Unless we're the owner; we're still enqueued on the wait_list.
- * So check if we became owner, if not, take us off the wait_list.
- */
- if (rt_mutex_owner(lock) != current) {
- remove_waiter(lock, waiter);
- cleanup = true;
- }
- /*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irq(&lock->wait_lock);
+ unsigned long flags;
- return cleanup;
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ rtlock_slowlock_locked(lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
}
+
+#endif /* RT_MUTEX_BUILD_SPINLOCKS */
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
deleted file mode 100644
index 732f96abf462..000000000000
--- a/kernel/locking/rtmutex.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This file contains macros used solely by rtmutex.c.
- * Non-debug version.
- */
-
-#define rt_mutex_deadlock_check(l) (0)
-#define debug_rt_mutex_init_waiter(w) do { } while (0)
-#define debug_rt_mutex_free_waiter(w) do { } while (0)
-#define debug_rt_mutex_lock(l) do { } while (0)
-#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
-#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
-#define debug_rt_mutex_unlock(l) do { } while (0)
-#define debug_rt_mutex_init(m, n, k) do { } while (0)
-#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
-#define debug_rt_mutex_print_deadlock(w) do { } while (0)
-#define debug_rt_mutex_reset_waiter(w) do { } while (0)
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- WARN(1, "rtmutex deadlock detected\n");
-}
-
-static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
- enum rtmutex_chainwalk walk)
-{
- return walk == RT_MUTEX_FULL_CHAINWALK;
-}
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
new file mode 100644
index 000000000000..a6974d044593
--- /dev/null
+++ b/kernel/locking/rtmutex_api.c
@@ -0,0 +1,612 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
+ unsigned int state,
+ struct lockdep_map *nest_lock,
+ unsigned int subclass)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+
+void rt_mutex_base_init(struct rt_mutex_base *rtb)
+{
+ __rt_mutex_base_init(rtb);
+}
+EXPORT_SYMBOL(rt_mutex_base_init);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rt_mutex_lock_nested - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ * @subclass: the lockdep subclass
+ */
+void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+
+void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
+}
+EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+#endif
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * This function can only be called in thread context. It's safe to call it
+ * from atomic regions, but not from hard or soft interrupt context.
+ *
+ * Returns:
+ * 1 on success
+ * 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/*
+ * Futex variants, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return rt_mutex_slowtrylock(lock);
+}
+
+int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
+/**
+ * __rt_mutex_futex_unlock - Futex variant, that since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wqh: The wake queue head from which to get the next lock waiter
+ */
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ /*
+ * We've already deboosted, mark_wakeup_next_waiter() will
+ * retain preempt_disabled when we drop the wait_lock, to
+ * avoid inversion prior to the wakeup. preempt_disable()
+ * therein pairs with rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wqh, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock)
+{
+ DEFINE_RT_WAKE_Q(wqh);
+ unsigned long flags;
+ bool postunlock;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ postunlock = __rt_mutex_futex_unlock(lock, &wqh);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+}
+
+/**
+ * __rt_mutex_init - initialize the rt_mutex
+ *
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
+ *
+ * Initialize the rt_mutex to unlocked state.
+ *
+ * Initializing of a locked rt_mutex is not allowed
+ */
+void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ __rt_mutex_base_init(&lock->rtmutex);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner:the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This initializes the rtmutex and
+ * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
+ * possible at this point because the pi_state which contains the rtmutex
+ * is not yet visible to other tasks.
+ */
+void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner)
+{
+ static struct lock_class_key pi_futex_key;
+
+ __rt_mutex_base_init(lock);
+ /*
+ * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping'
+ * and rtmutex based. That causes a lockdep false positive, because
+ * some of the futex functions invoke spin_unlock(&hb->lock) with
+ * the wait_lock of the rtmutex associated to the pi_futex held.
+ * spin_unlock() in turn takes wait_lock of the rtmutex on which
+ * the spinlock is based, which makes lockdep notice a lock
+ * recursion. Give the futex/rtmutex wait_lock a separate key.
+ */
+ lockdep_set_class(&lock->wait_lock, &pi_futex_key);
+ rt_mutex_set_owner(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This just cleans up the rtmutex
+ * (debugging) state. Concurrent operations on this rt_mutex are not
+ * possible because it belongs to the pi_state which is about to be freed
+ * and it is not longer visible to other tasks.
+ */
+void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_clear_owner(lock);
+}
+
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL,
+ RT_MUTEX_FULL_CHAINWALK);
+
+ if (ret && !rt_mutex_owner(lock)) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+ * Special API call for PI-futex support
+ */
+int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, true);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ cleanup = true;
+ }
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, false);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void __sched rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ struct rt_mutex_base *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+ next_lock = waiter->lock;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
+}
+
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh)
+{
+ rt_mutex_wake_up_q(wqh);
+}
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+/* Mutexes */
+void __mutex_rt_init(struct mutex *mutex, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
+ lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(__mutex_rt_init);
+
+static __always_inline int __mutex_lock_common(struct mutex *lock,
+ unsigned int state,
+ unsigned int subclass,
+ struct lockdep_map *nest_lock,
+ unsigned long ip)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, ip);
+ else
+ lock_acquired(&lock->dep_map, ip);
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched _mutex_lock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+int __sched mutex_lock_killable_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+
+void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
+#else /* CONFIG_DEBUG_LOCK_ALLOC */
+
+void __sched mutex_lock(struct mutex *lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock);
+
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+void __sched mutex_lock_io(struct mutex *lock)
+{
+ int token = io_schedule_prepare();
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(mutex_lock_io);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+int __sched mutex_trylock(struct mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(mutex_trylock);
+
+void __sched mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL(mutex_unlock);
+
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index ca6fb489007b..1162e07cdaea 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -13,50 +13,127 @@
#ifndef __KERNEL_RTMUTEX_COMMON_H
#define __KERNEL_RTMUTEX_COMMON_H
+#include <linux/debug_locks.h>
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+
+/*
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry: rbtree node to enqueue into the waiters tree
+ * @prio: Priority of the waiter
+ * @deadline: Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
+ */
+struct rt_waiter_node {
+ struct rb_node entry;
+ int prio;
+ u64 deadline;
+};
+
/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
- * @tree_entry: pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @tree: node to enqueue into the mutex waiters tree
+ * @pi_tree: node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
+ * @lock: Pointer to the rt_mutex on which the waiter blocks
+ * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
+ * @ww_ctx: WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
*/
struct rt_mutex_waiter {
- struct rb_node tree_entry;
- struct rb_node pi_tree_entry;
+ struct rt_waiter_node tree;
+ struct rt_waiter_node pi_tree;
struct task_struct *task;
- struct rt_mutex *lock;
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- unsigned long ip;
- struct pid *deadlock_task_pid;
- struct rt_mutex *deadlock_lock;
-#endif
- int prio;
- u64 deadline;
+ struct rt_mutex_base *lock;
+ unsigned int wake_state;
+ struct ww_acquire_ctx *ww_ctx;
+};
+
+/**
+ * rt_wake_q_head - Wrapper around regular wake_q_head to support
+ * "sleeping" spinlocks on RT
+ * @head: The regular wake_q_head for sleeping lock variants
+ * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups
+ */
+struct rt_wake_q_head {
+ struct wake_q_head head;
+ struct task_struct *rtlock_task;
};
+#define DEFINE_RT_WAKE_Q(name) \
+ struct rt_wake_q_head name = { \
+ .head = WAKE_Q_HEAD_INITIALIZER(name.head), \
+ .rtlock_task = NULL, \
+ }
+
/*
- * Various helpers to access the waiters-tree:
+ * PI-futex support (proxy locking functions, etc.):
*/
+extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter);
-#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh);
-static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh);
+
+/*
+ * Must be guarded because this header is included from rcu/tree_plugin.h
+ * unconditionally.
+ */
+#ifdef CONFIG_RT_MUTEXES
+static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
{
return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
-static inline struct rt_mutex_waiter *
-rt_mutex_top_waiter(struct rt_mutex *lock)
+/*
+ * Lockless speculative check whether @waiter is still the top waiter on
+ * @lock. This is solely comparing pointers and not derefencing the
+ * leftmost entry which might be about to vanish.
+ */
+static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
+}
+
+static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
struct rt_mutex_waiter *w = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+
if (leftmost) {
- w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
BUG_ON(w->lock != lock);
}
return w;
@@ -67,45 +144,17 @@ static inline int task_has_pi_waiters(struct task_struct *p)
return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
}
-static inline struct rt_mutex_waiter *
-task_top_pi_waiter(struct task_struct *p)
-{
- return rb_entry(p->pi_waiters.rb_leftmost,
- struct rt_mutex_waiter, pi_tree_entry);
-}
-
-#else
-
-static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
-{
- return false;
-}
-
-static inline struct rt_mutex_waiter *
-rt_mutex_top_waiter(struct rt_mutex *lock)
+static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
{
- return NULL;
-}
+ lockdep_assert_held(&p->pi_lock);
-static inline int task_has_pi_waiters(struct task_struct *p)
-{
- return false;
+ return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
+ pi_tree.entry);
}
-static inline struct rt_mutex_waiter *
-task_top_pi_waiter(struct task_struct *p)
-{
- return NULL;
-}
-
-#endif
-
-/*
- * lock->owner state tracking:
- */
#define RT_MUTEX_HAS_WAITERS 1UL
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
{
unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
@@ -127,39 +176,59 @@ enum rtmutex_chainwalk {
RT_MUTEX_FULL_CHAINWALK,
};
-/*
- * PI-futex support (proxy locking functions, etc.):
- */
-extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
-extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter);
+static inline void __rt_mutex_base_init(struct rt_mutex_base *lock)
+{
+ raw_spin_lock_init(&lock->wait_lock);
+ lock->waiters = RB_ROOT_CACHED;
+ lock->owner = NULL;
+}
-extern int rt_mutex_futex_trylock(struct rt_mutex *l);
-extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
+/* Debug functions */
+static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
+}
-extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
-extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
+static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
+}
-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
+static inline void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ memset(waiter, 0x11, sizeof(*waiter));
+}
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-# include "rtmutex-debug.h"
-#else
-# include "rtmutex.h"
-#endif
+static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ memset(waiter, 0x22, sizeof(*waiter));
+}
+
+static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
+ RB_CLEAR_NODE(&waiter->tree.entry);
+ waiter->wake_state = TASK_NORMAL;
+ waiter->task = NULL;
+}
+
+static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter)
+{
+ rt_mutex_init_waiter(waiter);
+ waiter->wake_state = TASK_RTLOCK_WAIT;
+}
+
+#else /* CONFIG_RT_MUTEXES */
+/* Used in rcu/tree_plugin.h */
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
+{
+ return NULL;
+}
+#endif /* !CONFIG_RT_MUTEXES */
#endif
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
new file mode 100644
index 000000000000..34a59569db6b
--- /dev/null
+++ b/kernel/locking/rwbase_rt.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * RT-specific reader/writer semaphores and reader/writer locks
+ *
+ * down_write/write_lock()
+ * 1) Lock rtmutex
+ * 2) Remove the reader BIAS to force readers into the slow path
+ * 3) Wait until all readers have left the critical section
+ * 4) Mark it write locked
+ *
+ * up_write/write_unlock()
+ * 1) Remove the write locked marker
+ * 2) Set the reader BIAS, so readers can use the fast path again
+ * 3) Unlock rtmutex, to release blocked readers
+ *
+ * down_read/read_lock()
+ * 1) Try fast path acquisition (reader BIAS is set)
+ * 2) Take tmutex::wait_lock, which protects the writelocked flag
+ * 3) If !writelocked, acquire it for read
+ * 4) If writelocked, block on tmutex
+ * 5) unlock rtmutex, goto 1)
+ *
+ * up_read/read_unlock()
+ * 1) Try fast path release (reader count != 1)
+ * 2) Wake the writer waiting in down_write()/write_lock() #3
+ *
+ * down_read/read_lock()#3 has the consequence, that rw semaphores and rw
+ * locks on RT are not writer fair, but writers, which should be avoided in
+ * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL
+ * inheritance mechanism.
+ *
+ * It's possible to make the rw primitives writer fair by keeping a list of
+ * active readers. A blocked writer would force all newly incoming readers
+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
+ * for one reader after the other. We can't use multi-reader inheritance
+ * because there is no way to support that with SCHED_DEADLINE.
+ * Implementing the one by one reader boosting/handover mechanism is a
+ * major surgery for a very dubious value.
+ *
+ * The risk of writer starvation is there, but the pathological use cases
+ * which trigger it are not necessarily the typical RT workloads.
+ *
+ * Fast-path orderings:
+ * The lock/unlock of readers can run in fast paths: lock and unlock are only
+ * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE
+ * semantics of rwbase_rt. Atomic ops should thus provide _acquire()
+ * and _release() (or stronger).
+ *
+ * Common code shared between RT rw_semaphore and rwlock
+ */
+
+static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb)
+{
+ int r;
+
+ /*
+ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
+ * set.
+ */
+ for (r = atomic_read(&rwb->readers); r < 0;) {
+ if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1)))
+ return 1;
+ }
+ return 0;
+}
+
+static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ int ret;
+
+ rwbase_pre_schedule();
+ raw_spin_lock_irq(&rtm->wait_lock);
+
+ /*
+ * Call into the slow lock path with the rtmutex->wait_lock
+ * held, so this can't result in the following race:
+ *
+ * Reader1 Reader2 Writer
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * down_read()
+ * unlock(m->wait_lock)
+ * up_read()
+ * wake(Writer)
+ * lock(m->wait_lock)
+ * sem->writelocked=true
+ * unlock(m->wait_lock)
+ *
+ * up_write()
+ * sem->writelocked=false
+ * rtmutex_unlock(m)
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * rtmutex_lock(m)
+ *
+ * That would put Reader1 behind the writer waiting on
+ * Reader2 to call up_read(), which might be unbound.
+ */
+
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
+ /*
+ * For rwlocks this returns 0 unconditionally, so the below
+ * !ret conditionals are optimized out.
+ */
+ ret = rwbase_rtmutex_slowlock_locked(rtm, state);
+
+ /*
+ * On success the rtmutex is held, so there can't be a writer
+ * active. Increment the reader count and immediately drop the
+ * rtmutex again.
+ *
+ * rtmutex->wait_lock has to be unlocked in any case of course.
+ */
+ if (!ret)
+ atomic_inc(&rwb->readers);
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ if (!ret)
+ rwbase_rtmutex_unlock(rtm);
+
+ trace_contention_end(rwb, ret);
+ rwbase_post_schedule();
+ return ret;
+}
+
+static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (rwbase_read_trylock(rwb))
+ return 0;
+
+ return __rwbase_read_lock(rwb, state);
+}
+
+static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ struct task_struct *owner;
+ DEFINE_RT_WAKE_Q(wqh);
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+ /*
+ * Wake the writer, i.e. the rtmutex owner. It might release the
+ * rtmutex concurrently in the fast path (due to a signal), but to
+ * clean up rwb->readers it needs to acquire rtm->wait_lock. The
+ * worst case which can happen is a spurious wakeup.
+ */
+ owner = rt_mutex_owner(rtm);
+ if (owner)
+ rt_mutex_wake_q_add_task(&wqh, owner, state);
+
+ /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */
+ preempt_disable();
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ rt_mutex_wake_up_q(&wqh);
+}
+
+static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ /*
+ * rwb->readers can only hit 0 when a writer is waiting for the
+ * active readers to leave the critical section.
+ *
+ * dec_and_test() is fully ordered, provides RELEASE.
+ */
+ if (unlikely(atomic_dec_and_test(&rwb->readers)))
+ __rwbase_read_unlock(rwb, state);
+}
+
+static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
+ unsigned long flags)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+
+ /*
+ * _release() is needed in case that reader is in fast path, pairing
+ * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock().
+ */
+ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers);
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_rtmutex_unlock(rtm);
+}
+
+static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ __rwbase_write_unlock(rwb, WRITER_BIAS, flags);
+}
+
+static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ /* Release it and account current as reader */
+ __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
+}
+
+static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ /* Can do without CAS because we're serialized by wait_lock. */
+ lockdep_assert_held(&rwb->rtmutex.wait_lock);
+
+ /*
+ * _acquire is needed in case the reader is in the fast path, pairing
+ * with rwbase_read_unlock(), provides ACQUIRE.
+ */
+ if (!atomic_read_acquire(&rwb->readers)) {
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ /* Take the rtmutex as a first step */
+ if (rwbase_rtmutex_lock_state(rtm, state))
+ return -EINTR;
+
+ /* Force readers into slow path */
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ rwbase_pre_schedule();
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb))
+ goto out_unlock;
+
+ rwbase_set_and_save_current_state(state);
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
+ for (;;) {
+ /* Optimized out for rwlocks */
+ if (rwbase_signal_pending_state(state, current)) {
+ rwbase_restore_current_state();
+ __rwbase_write_unlock(rwb, 0, flags);
+ rwbase_post_schedule();
+ trace_contention_end(rwb, -EINTR);
+ return -EINTR;
+ }
+
+ if (__rwbase_write_trylock(rwb))
+ break;
+
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_schedule();
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+
+ set_current_state(state);
+ }
+ rwbase_restore_current_state();
+ trace_contention_end(rwb, 0);
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_post_schedule();
+ return 0;
+}
+
+static inline int rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ if (!rwbase_rtmutex_trylock(rtm))
+ return 0;
+
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb)) {
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 1;
+ }
+ __rwbase_write_unlock(rwb, 0, flags);
+ return 0;
+}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index ba67600c7b2c..c6d17aee4209 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,13 +27,15 @@
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <trace/events/lock.h>
+#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"
/*
* The least significant 2 bits of the owner value has the following
* meanings when set.
- * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
+ * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
* - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
*
* When the rwsem is reader-owned and a spinning writer has timed out,
@@ -55,7 +57,6 @@
*
* A fast path reader optimistic lock stealing is supported when the rwsem
* is previously owned by a writer and the following conditions are met:
- * - OSQ is empty
* - rwsem is not currently writer owned
* - the handoff isn't set.
*/
@@ -105,9 +106,9 @@
* atomic_long_cmpxchg() will be used to obtain writer lock.
*
* There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers -- set, clear
+ * 2) rwsem_try_write_lock() for writers -- set, clear
+ * 3) rwsem_del_waiter() -- clear
*
* For all the above cases, wait_lock will be held. A writer must also
* be the first one in the wait_list to be eligible for setting the handoff
@@ -132,14 +133,19 @@
* the owner value concurrently without lock. Read from owner, however,
* may not need READ_ONCE() as long as the pointer value is only used
* for comparison and isn't being dereferenced.
+ *
+ * Both rwsem_{set,clear}_owner() functions should be in the same
+ * preempt disable section as the atomic op that changes sem->count.
*/
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, (long)current);
}
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
+ lockdep_assert_preemption_disabled();
atomic_long_set(&sem->owner, 0);
}
@@ -334,6 +340,7 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
+ bool handoff_set;
};
#define rwsem_first_waiter(sem) \
list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -344,12 +351,6 @@ enum rwsem_wake_type {
RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
};
-enum writer_wait_state {
- WRITER_NOT_FIRST, /* Writer is not first in wait list */
- WRITER_FIRST, /* Writer is first in wait list */
- WRITER_HANDOFF /* Writer is first & handoff needed */
-};
-
/*
* The typical HZ value is either 250 or 1000. So set the minimum waiting
* time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -365,6 +366,34 @@ enum writer_wait_state {
*/
#define MAX_READERS_WAKEUP 0x100
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_add_tail(&waiter->list, &sem->wait_list);
+ /* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ *
+ * Return: true if wait_list isn't empty and false otherwise
+ */
+static inline bool
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_del(&waiter->list);
+ if (likely(!list_empty(&sem->wait_list)))
+ return true;
+
+ atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+ return false;
+}
+
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -376,6 +405,8 @@ enum writer_wait_state {
* preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
* - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
*/
static void rwsem_mark_wake(struct rw_semaphore *sem,
enum rwsem_wake_type wake_type,
@@ -431,10 +462,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* to give up the lock), request a HANDOFF to
* force the issue.
*/
- if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
- time_after(jiffies, waiter->timeout)) {
- adjustment -= RWSEM_FLAG_HANDOFF;
- lockevent_inc(rwsem_rlock_handoff);
+ if (time_after(jiffies, waiter->timeout)) {
+ if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+ waiter->handoff_set = true;
}
atomic_long_add(-adjustment, &sem->count);
@@ -484,24 +517,31 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
/*
* Limit # of readers that can be woken up per wakeup call.
*/
- if (woken >= MAX_READERS_WAKEUP)
+ if (unlikely(woken >= MAX_READERS_WAKEUP))
break;
}
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
+
+ oldcount = atomic_long_read(&sem->count);
if (list_empty(&sem->wait_list)) {
- /* hit end of list above */
+ /*
+ * Combined with list_move_tail() above, this implies
+ * rwsem_del_waiter().
+ */
adjustment -= RWSEM_FLAG_WAITERS;
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ } else if (woken) {
+ /*
+ * When we've woken a reader, we no longer need to force
+ * writers to give up the lock and we can clear HANDOFF.
+ */
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
}
- /*
- * When we've woken a reader, we no longer need to force writers
- * to give up the lock and we can clear HANDOFF.
- */
- if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
- adjustment -= RWSEM_FLAG_HANDOFF;
-
if (adjustment)
atomic_long_add(adjustment, &sem->count);
@@ -528,16 +568,43 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
}
/*
+ * Remove a waiter and try to wake up other waiters in the wait queue
+ * This function is called from the out_nolock path of both the reader and
+ * writer slowpaths with wait_lock held. It releases the wait_lock and
+ * optionally wake up waiters before it returns.
+ */
+static inline void
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&sem->wait_lock)
+{
+ bool first = rwsem_first_waiter(sem) == waiter;
+
+ wake_q_init(wake_q);
+
+ /*
+ * If the wait_list isn't empty and the waiter to be deleted is
+ * the first waiter, we wake up the remaining waiters as they may
+ * be eligible to acquire or spin on the lock.
+ */
+ if (rwsem_del_waiter(sem, waiter) && first)
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ if (!wake_q_empty(wake_q))
+ wake_up_q(wake_q);
+}
+
+/*
* This function must be called with the sem->wait_lock held to prevent
* race conditions between checking the rwsem wait list and setting the
* sem->count accordingly.
*
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
*/
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
- enum writer_wait_state wstate)
+ struct rwsem_waiter *waiter)
{
+ struct rwsem_waiter *first = rwsem_first_waiter(sem);
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -546,13 +613,26 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
do {
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
- if (has_handoff && wstate == WRITER_NOT_FIRST)
- return false;
+ if (has_handoff) {
+ /*
+ * Honor handoff bit and yield only when the first
+ * waiter is the one that set it. Otherwisee, we
+ * still try to acquire the rwsem.
+ */
+ if (first->handoff_set && (waiter != first))
+ return false;
+ }
new = count;
if (count & RWSEM_LOCK_MASK) {
- if (has_handoff || (wstate != WRITER_HANDOFF))
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
+ if (has_handoff || (!rt_task(waiter->task) &&
+ !time_after(jiffies, waiter->timeout)))
return false;
new |= RWSEM_FLAG_HANDOFF;
@@ -566,16 +646,43 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
/*
- * We have either acquired the lock with handoff bit cleared or
- * set the handoff bit.
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
*/
- if (new & RWSEM_FLAG_HANDOFF)
+ if (new & RWSEM_FLAG_HANDOFF) {
+ first->handoff_set = true;
+ lockevent_inc(rwsem_wlock_handoff);
return false;
+ }
+ /*
+ * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+ * success.
+ */
+ list_del(&waiter->list);
rwsem_set_owner(sem);
return true;
}
+/*
+ * The rwsem_spin_on_owner() function returns the following 4 values
+ * depending on the lock owner state.
+ * OWNER_NULL : owner is currently NULL
+ * OWNER_WRITER: when owner changes and is a writer
+ * OWNER_READER: when owner changes and the new owner may be a reader.
+ * OWNER_NONSPINNABLE:
+ * when optimistic spinning has to stop because either the
+ * owner stops running, is unknown, or its timeslice has
+ * been used up.
+ */
+enum owner_state {
+ OWNER_NULL = 1 << 0,
+ OWNER_WRITER = 1 << 1,
+ OWNER_READER = 1 << 2,
+ OWNER_NONSPINNABLE = 1 << 3,
+};
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
* Try to acquire write lock before the writer has been put on wait queue.
@@ -595,15 +702,6 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
return false;
}
-static inline bool owner_on_cpu(struct task_struct *owner)
-{
- /*
- * As lock holder preemption issue, we both skip spinning if
- * task is not on cpu or its cpu is preempted
- */
- return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-}
-
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
@@ -615,8 +713,10 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
return false;
}
- preempt_disable();
- rcu_read_lock();
+ /*
+ * Disable preemption is equal to the RCU read-side crital section,
+ * thus the task_strcut structure won't go away.
+ */
owner = rwsem_owner_flags(sem, &flags);
/*
* Don't check the read-owner as the entry may be stale.
@@ -624,30 +724,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
- rcu_read_unlock();
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
}
-/*
- * The rwsem_spin_on_owner() function returns the folowing 4 values
- * depending on the lock owner state.
- * OWNER_NULL : owner is currently NULL
- * OWNER_WRITER: when owner changes and is a writer
- * OWNER_READER: when owner changes and the new owner may be a reader.
- * OWNER_NONSPINNABLE:
- * when optimistic spinning has to stop because either the
- * owner stops running, is unknown, or its timeslice has
- * been used up.
- */
-enum owner_state {
- OWNER_NULL = 1 << 0,
- OWNER_WRITER = 1 << 1,
- OWNER_READER = 1 << 2,
- OWNER_NONSPINNABLE = 1 << 3,
-};
#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
static inline enum owner_state
@@ -669,12 +750,13 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
unsigned long flags, new_flags;
enum owner_state state;
+ lockdep_assert_preemption_disabled();
+
owner = rwsem_owner_flags(sem, &flags);
state = rwsem_owner_state(owner, flags);
if (state != OWNER_WRITER)
return state;
- rcu_read_lock();
for (;;) {
/*
* When a waiting writer set the handoff flag, it may spin
@@ -692,7 +774,9 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
* Ensure we emit the owner->on_cpu, dereference _after_
* checking sem->owner still matches owner, if that fails,
* owner might point to free()d memory, if it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * our spinning context already disabled preemption which is
+ * equal to RCU read-side crital section ensures the memory
+ * stays valid.
*/
barrier();
@@ -703,7 +787,6 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
cpu_relax();
}
- rcu_read_unlock();
return state;
}
@@ -740,8 +823,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
int loop = 0;
u64 rspin_threshold = 0;
- preempt_disable();
-
/* sem->wait_lock should not be held when doing optimistic spinning */
if (!osq_lock(&sem->osq))
goto done;
@@ -819,7 +900,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
* we try to get it. The new owner may be a spinnable
* writer.
*
- * To take advantage of two scenarios listed agove, the RT
+ * To take advantage of two scenarios listed above, the RT
* task is made to retry one more time to see if it can
* acquire the lock or continue spinning on the new owning
* writer. Of course, if the time lag is long enough or the
@@ -849,7 +930,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
}
osq_unlock(&sem->osq);
done:
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
@@ -860,7 +940,7 @@ done:
*/
static inline void clear_nonspinnable(struct rw_semaphore *sem)
{
- if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
+ if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
}
@@ -877,30 +957,53 @@ static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
-static inline int
+static inline enum owner_state
rwsem_spin_on_owner(struct rw_semaphore *sem)
{
- return 0;
+ return OWNER_NONSPINNABLE;
}
-#define OWNER_NULL 1
#endif
/*
+ * Prepare to wake up waiter(s) in the wait queue by putting them into the
+ * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
+ * reader-owned, wake up read lock waiters in queue front or wake up any
+ * front waiter otherwise.
+
+ * This is being called from both reader and writer slow paths.
+ */
+static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
+ struct wake_q_head *wake_q)
+{
+ enum rwsem_wake_type wake_type;
+
+ if (count & RWSEM_WRITER_MASK)
+ return;
+
+ if (count & RWSEM_READER_MASK) {
+ wake_type = RWSEM_WAKE_READERS;
+ } else {
+ wake_type = RWSEM_WAKE_ANY;
+ clear_nonspinnable(sem);
+ }
+ rwsem_mark_wake(sem, wake_type, wake_q);
+}
+
+/*
* Wait for the read lock to be granted
*/
static struct rw_semaphore __sched *
-rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state)
+rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
{
long adjustment = -RWSEM_READER_BIAS;
long rcnt = (count >> RWSEM_READER_SHIFT);
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
- bool wake = false;
/*
* To prevent a constant stream of readers from starving a sleeping
- * waiter, don't attempt optimistic lock stealing if the lock is
- * currently owned by readers.
+ * writer, don't attempt optimistic lock stealing if the lock is
+ * very likely owned by readers.
*/
if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
(rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
@@ -932,17 +1035,17 @@ queue:
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list)) {
/*
* In case the wait queue is empty and the lock isn't owned
- * by a writer or has the handoff bit set, this reader can
- * exit the slowpath and return immediately as its
- * RWSEM_READER_BIAS has already been set in the count.
+ * by a writer, this reader can exit the slowpath and return
+ * immediately as its RWSEM_READER_BIAS has already been set
+ * in the count.
*/
- if (!(atomic_long_read(&sem->count) &
- (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
/* Provide lock ACQUIRE */
smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
@@ -952,27 +1055,18 @@ queue:
}
adjustment += RWSEM_FLAG_WAITERS;
}
- list_add_tail(&waiter.list, &sem->wait_list);
+ rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
- /*
- * If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (!(count & RWSEM_LOCK_MASK)) {
- clear_nonspinnable(sem);
- wake = true;
- }
- if (wake || (!(count & RWSEM_WRITER_MASK) &&
- (adjustment & RWSEM_FLAG_WAITERS)))
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
+ rwsem_cond_wake_waiter(sem, count, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
+
+ trace_contention_begin(sem, LCB_F_READ);
/* wait to be given the lock */
for (;;) {
@@ -989,36 +1083,30 @@ queue:
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
}
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
+ trace_contention_end(sem, 0);
return sem;
out_nolock:
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list)) {
- atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
- &sem->count);
- }
- raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
/*
* Wait until we successfully acquire the write lock
*/
-static struct rw_semaphore *
+static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
- long count;
- enum writer_wait_state wstate;
struct rwsem_waiter waiter;
- struct rw_semaphore *ret = sem;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
@@ -1034,33 +1122,15 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
-
- /* account for this before adding a new element to the list */
- wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
- list_add_tail(&waiter.list, &sem->wait_list);
+ rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock */
- if (wstate == WRITER_NOT_FIRST) {
- count = atomic_long_read(&sem->count);
-
- /*
- * If there were already threads queued before us and:
- * 1) there are no no active locks, wake the front
- * queued process(es) as the handoff bit might be set.
- * 2) there are no active writers and some readers, the lock
- * must be read owned; so we try to wake any read lock
- * waiters that were queued ahead of us.
- */
- if (count & RWSEM_WRITER_MASK)
- goto wait;
-
- rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
- ? RWSEM_WAKE_READERS
- : RWSEM_WAKE_ANY, &wake_q);
-
+ if (rwsem_first_waiter(sem) != &waiter) {
+ rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
+ &wake_q);
if (!wake_q_empty(&wake_q)) {
/*
* We want to minimize wait_lock hold time especially
@@ -1068,24 +1138,27 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
*/
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
- wake_q_init(&wake_q); /* Used again, reinit */
raw_spin_lock_irq(&sem->wait_lock);
}
} else {
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
}
-wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
+ trace_contention_begin(sem, LCB_F_WRITE);
+
for (;;) {
- if (rwsem_try_write_lock(sem, wstate)) {
+ if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
break;
}
raw_spin_unlock_irq(&sem->wait_lock);
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
/*
* After setting the handoff bit and failing to acquire
* the lock, attempt to spin on owner to accelerate lock
@@ -1094,70 +1167,32 @@ wait:
* In this case, we attempt to acquire the lock again
* without sleeping.
*/
- if (wstate == WRITER_HANDOFF &&
- rwsem_spin_on_owner(sem) == OWNER_NULL)
- goto trylock_again;
-
- /* Block until there are no active lockers. */
- for (;;) {
- if (signal_pending_state(state, current))
- goto out_nolock;
+ if (waiter.handoff_set) {
+ enum owner_state owner_state;
- schedule();
- lockevent_inc(rwsem_sleep_writer);
- set_current_state(state);
- /*
- * If HANDOFF bit is set, unconditionally do
- * a trylock.
- */
- if (wstate == WRITER_HANDOFF)
- break;
-
- if ((wstate == WRITER_NOT_FIRST) &&
- (rwsem_first_waiter(sem) == &waiter))
- wstate = WRITER_FIRST;
-
- count = atomic_long_read(&sem->count);
- if (!(count & RWSEM_LOCK_MASK))
- break;
-
- /*
- * The setting of the handoff bit is deferred
- * until rwsem_try_write_lock() is called.
- */
- if ((wstate == WRITER_FIRST) && (rt_task(current) ||
- time_after(jiffies, waiter.timeout))) {
- wstate = WRITER_HANDOFF;
- lockevent_inc(rwsem_wlock_handoff);
- break;
- }
+ owner_state = rwsem_spin_on_owner(sem);
+ if (owner_state == OWNER_NULL)
+ goto trylock_again;
}
+
+ schedule_preempt_disabled();
+ lockevent_inc(rwsem_sleep_writer);
+ set_current_state(state);
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
- list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
-
- return ret;
+ trace_contention_end(sem, 0);
+ return sem;
out_nolock:
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&sem->wait_lock);
- list_del(&waiter.list);
-
- if (unlikely(wstate == WRITER_HANDOFF))
- atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
-
- if (list_empty(&sem->wait_list))
- atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
- else
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
lockevent_inc(rwsem_wlock_fail);
-
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
@@ -1165,7 +1200,7 @@ out_nolock:
* handle waking up a waiter on the semaphore
* - up_read/up_write has decremented the active part of count if we come here
*/
-static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
+static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
DEFINE_WAKE_Q(wake_q);
@@ -1205,51 +1240,58 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
/*
* lock for reading
*/
-static inline int __down_read_common(struct rw_semaphore *sem, int state)
+static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
long count;
+ preempt_disable();
if (!rwsem_read_trylock(sem, &count)) {
- if (IS_ERR(rwsem_down_read_slowpath(sem, count, state)))
- return -EINTR;
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
}
- return 0;
+out:
+ preempt_enable();
+ return ret;
}
-static inline void __down_read(struct rw_semaphore *sem)
+static __always_inline void __down_read(struct rw_semaphore *sem)
{
__down_read_common(sem, TASK_UNINTERRUPTIBLE);
}
-static inline int __down_read_interruptible(struct rw_semaphore *sem)
+static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
{
return __down_read_common(sem, TASK_INTERRUPTIBLE);
}
-static inline int __down_read_killable(struct rw_semaphore *sem)
+static __always_inline int __down_read_killable(struct rw_semaphore *sem)
{
return __down_read_common(sem, TASK_KILLABLE);
}
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ int ret = 0;
long tmp;
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- /*
- * Optimize for the case when the rwsem is not locked at all.
- */
- tmp = RWSEM_UNLOCKED_VALUE;
- do {
+ preempt_disable();
+ tmp = atomic_long_read(&sem->count);
+ while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- tmp + RWSEM_READER_BIAS)) {
+ tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
- return 1;
+ ret = 1;
+ break;
}
- } while (!(tmp & RWSEM_READ_FAILED_MASK));
- return 0;
+ }
+ preempt_enable();
+ return ret;
}
/*
@@ -1257,12 +1299,15 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
*/
static inline int __down_write_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
+
+ preempt_disable();
if (unlikely(!rwsem_write_trylock(sem))) {
if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
- return -EINTR;
+ ret = -EINTR;
}
-
- return 0;
+ preempt_enable();
+ return ret;
}
static inline void __down_write(struct rw_semaphore *sem)
@@ -1277,8 +1322,14 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
+ int ret;
+
+ preempt_disable();
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- return rwsem_write_trylock(sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
+
+ return ret;
}
/*
@@ -1291,14 +1342,16 @@ static inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ preempt_disable();
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
RWSEM_FLAG_WAITERS)) {
clear_nonspinnable(sem);
- rwsem_wake(sem, tmp);
+ rwsem_wake(sem);
}
+ preempt_enable();
}
/*
@@ -1316,10 +1369,12 @@ static inline void __up_write(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+ preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
- rwsem_wake(sem, tmp);
+ rwsem_wake(sem);
+ preempt_enable();
}
/*
@@ -1337,13 +1392,131 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
* write side. As such, rely on RELEASE semantics.
*/
DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
tmp = atomic_long_fetch_add_release(
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
rwsem_set_reader_owned(sem);
if (tmp & RWSEM_FLAG_WAITERS)
rwsem_downgrade_wake(sem);
+ preempt_enable();
+}
+
+#else /* !CONFIG_PREEMPT_RT */
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+#define rwbase_set_and_save_current_state(state) \
+ set_current_state(state)
+
+#define rwbase_restore_current_state() \
+ __set_current_state(TASK_RUNNING)
+
+#define rwbase_rtmutex_lock_state(rtm, state) \
+ __rt_mutex_lock(rtm, state)
+
+#define rwbase_rtmutex_slowlock_locked(rtm, state) \
+ __rt_mutex_slowlock_locked(rtm, NULL, state)
+
+#define rwbase_rtmutex_unlock(rtm) \
+ __rt_mutex_unlock(rtm)
+
+#define rwbase_rtmutex_trylock(rtm) \
+ __rt_mutex_trylock(rtm)
+
+#define rwbase_signal_pending_state(state, current) \
+ signal_pending_state(state, current)
+
+#define rwbase_pre_schedule() \
+ rt_mutex_pre_schedule()
+
+#define rwbase_schedule() \
+ rt_mutex_schedule()
+
+#define rwbase_post_schedule() \
+ rt_mutex_post_schedule()
+
+#include "rwbase_rt.c"
+
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+ init_rwbase_rt(&(sem)->rwbase);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_read_trylock(&sem->rwbase);
+}
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
}
+static inline void __sched __down_write(struct rw_semaphore *sem)
+{
+ rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_write_trylock(&sem->rwbase);
+}
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ rwbase_write_unlock(&sem->rwbase);
+}
+
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ rwbase_write_downgrade(&sem->rwbase);
+}
+
+/* Debug stubs for the common API */
+#define DEBUG_RWSEMS_WARN_ON(c, sem)
+
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+}
+
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+ int count = atomic_read(&sem->rwbase.readers);
+
+ return count < 0 && count != READER_BIAS;
+}
+
+#endif /* CONFIG_PREEMPT_RT */
+
/*
* lock for reading
*/
@@ -1506,6 +1679,12 @@ void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
__rwsem_set_reader_owned(sem, NULL);
}
EXPORT_SYMBOL(down_read_non_owner);
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/kernel/locking/rwsem.h
+++ /dev/null
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index d9dd94defc0a..34bfae72f295 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -32,6 +32,7 @@
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
+#include <trace/events/lock.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
@@ -50,10 +51,11 @@ static noinline void __up(struct semaphore *sem);
* Use of this function is deprecated, please use down_interruptible() or
* down_killable() instead.
*/
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
{
unsigned long flags;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -72,11 +74,12 @@ EXPORT_SYMBOL(down);
* If the sleep is interrupted by a signal, this function will return -EINTR.
* If the semaphore is successfully acquired, this function returns 0.
*/
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -98,11 +101,12 @@ EXPORT_SYMBOL(down_interruptible);
* -EINTR. If the semaphore is successfully acquired, this function returns
* 0.
*/
-int down_killable(struct semaphore *sem)
+int __sched down_killable(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -119,7 +123,7 @@ EXPORT_SYMBOL(down_killable);
* @sem: the semaphore to be acquired
*
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
- * been acquired successfully or 1 if it it cannot be acquired.
+ * been acquired successfully or 1 if it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
* mutex_trylock! Be careful about this when converting code.
@@ -127,7 +131,7 @@ EXPORT_SYMBOL(down_killable);
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
-int down_trylock(struct semaphore *sem)
+int __sched down_trylock(struct semaphore *sem)
{
unsigned long flags;
int count;
@@ -152,11 +156,12 @@ EXPORT_SYMBOL(down_trylock);
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long timeout)
+int __sched down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
@@ -175,7 +180,7 @@ EXPORT_SYMBOL(down_timeout);
* Release the semaphore. Unlike mutexes, up() may be called from any
* context and even by tasks which have never called down().
*/
-void up(struct semaphore *sem)
+void __sched up(struct semaphore *sem)
{
unsigned long flags;
@@ -201,7 +206,7 @@ struct semaphore_waiter {
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
@@ -232,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
return -EINTR;
}
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ int ret;
+
+ trace_contention_begin(sem, 0);
+ ret = ___down_common(sem, state, timeout);
+ trace_contention_end(sem, ret);
+
+ return ret;
+}
+
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 0ff08380f531..8475a0794f8c 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -58,10 +58,10 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
/*
* We build the __lock_function inlines here. They are too large for
* inlining all over the place, but here is only one user per function
- * which embedds them into the calling _lock_function below.
+ * which embeds them into the calling _lock_function below.
*
* This could be a long-held lock. We both prepare to spin for a long
- * time (making _this_ CPU preemptable if possible), and we also signal
+ * time (making _this_ CPU preemptible if possible), and we also signal
* towards that other CPU that it should break the lock ASAP.
*/
#define BUILD_LOCK_OPS(op, locktype) \
@@ -124,13 +124,16 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, raw_spinlock);
+
+#ifndef CONFIG_PREEMPT_RT
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
+#endif
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK
-int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
{
return __raw_spin_trylock(lock);
}
@@ -138,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock);
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
-int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
{
return __raw_spin_trylock_bh(lock);
}
@@ -146,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK
-void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
{
__raw_spin_lock(lock);
}
@@ -154,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
{
return __raw_spin_lock_irqsave(lock);
}
@@ -162,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
-void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
{
__raw_spin_lock_irq(lock);
}
@@ -170,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_BH
-void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
{
__raw_spin_lock_bh(lock);
}
@@ -178,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh);
#endif
#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
-void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
{
__raw_spin_unlock(lock);
}
@@ -186,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
-void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_unlock_irqrestore(lock, flags);
}
@@ -194,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
-void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
{
__raw_spin_unlock_irq(lock);
}
@@ -202,15 +205,17 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
-void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
{
__raw_spin_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_spin_unlock_bh);
#endif
+#ifndef CONFIG_PREEMPT_RT
+
#ifndef CONFIG_INLINE_READ_TRYLOCK
-int __lockfunc _raw_read_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
return __raw_read_trylock(lock);
}
@@ -218,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK
-void __lockfunc _raw_read_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock(rwlock_t *lock)
{
__raw_read_lock(lock);
}
@@ -226,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
{
return __raw_read_lock_irqsave(lock);
}
@@ -234,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQ
-void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
{
__raw_read_lock_irq(lock);
}
@@ -242,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_BH
-void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
{
__raw_read_lock_bh(lock);
}
@@ -250,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK
-void __lockfunc _raw_read_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock(rwlock_t *lock)
{
__raw_read_unlock(lock);
}
@@ -258,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
-void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_read_unlock_irqrestore(lock, flags);
}
@@ -266,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
-void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
{
__raw_read_unlock_irq(lock);
}
@@ -274,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_BH
-void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
{
__raw_read_unlock_bh(lock);
}
@@ -282,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_TRYLOCK
-int __lockfunc _raw_write_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_write_trylock(rwlock_t *lock)
{
return __raw_write_trylock(lock);
}
@@ -290,15 +295,25 @@ EXPORT_SYMBOL(_raw_write_trylock);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK
-void __lockfunc _raw_write_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock(rwlock_t *lock)
{
__raw_write_lock(lock);
}
EXPORT_SYMBOL(_raw_write_lock);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock)))
+#endif
+
+void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass)
+{
+ __raw_write_lock_nested(lock, subclass);
+}
+EXPORT_SYMBOL(_raw_write_lock_nested);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
{
return __raw_write_lock_irqsave(lock);
}
@@ -306,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
-void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
{
__raw_write_lock_irq(lock);
}
@@ -314,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_BH
-void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
{
__raw_write_lock_bh(lock);
}
@@ -322,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK
-void __lockfunc _raw_write_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock(rwlock_t *lock)
{
__raw_write_unlock(lock);
}
@@ -330,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
-void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_write_unlock_irqrestore(lock, flags);
}
@@ -338,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
-void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
{
__raw_write_unlock_irq(lock);
}
@@ -346,13 +361,15 @@ EXPORT_SYMBOL(_raw_write_unlock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
-void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
{
__raw_write_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_write_unlock_bh);
#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
@@ -371,8 +388,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
local_irq_save(flags);
preempt_disable();
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
- do_raw_spin_lock_flags, &flags);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
return flags;
}
EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index b9d93087ee66..87b03d2e41db 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,6 +12,7 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/pid.h>
void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
struct lock_class_key *key, short inner)
@@ -31,6 +32,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__raw_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -48,6 +50,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
}
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
@@ -139,6 +142,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
arch_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -228,3 +232,5 @@ void do_raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
arch_write_unlock(&lock->raw_lock);
}
+
+#endif /* !CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
new file mode 100644
index 000000000000..38e292454fcc
--- /dev/null
+++ b/kernel/locking/spinlock_rt.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PREEMPT_RT substitution for spin/rw_locks
+ *
+ * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to
+ * resemble the non RT semantics:
+ *
+ * - Contrary to plain rtmutexes, spinlocks and rwlocks are state
+ * preserving. The task state is saved before blocking on the underlying
+ * rtmutex, and restored when the lock has been acquired. Regular wakeups
+ * during that time are redirected to the saved state so no wake up is
+ * missed.
+ *
+ * - Non RT spin/rwlocks disable preemption and eventually interrupts.
+ * Disabling preemption has the side effect of disabling migration and
+ * preventing RCU grace periods.
+ *
+ * The RT substitutions explicitly disable migration and take
+ * rcu_read_lock() across the lock held section.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_SPINLOCKS
+#include "rtmutex.c"
+
+/*
+ * __might_resched() skips the state check as rtlocks are state
+ * preserving. Take RCU nesting into account as spin/read/write_lock() can
+ * legitimately nest into an RCU read side critical section.
+ */
+#define RTLOCK_RESCHED_OFFSETS \
+ (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT)
+
+#define rtlock_might_resched() \
+ __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS)
+
+static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
+{
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+}
+
+static __always_inline void __rt_spin_lock(spinlock_t *lock)
+{
+ rtlock_might_resched();
+ rtlock_lock(&lock->lock);
+ rcu_read_lock();
+ migrate_disable();
+}
+
+void __sched rt_spin_lock(spinlock_t *lock)
+{
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nest_lock);
+#endif
+
+void __sched rt_spin_unlock(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+
+ if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL)))
+ rt_mutex_slowunlock(&lock->lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __sched rt_spin_lock_unlock(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_unlock);
+
+static __always_inline int __rt_spin_trylock(spinlock_t *lock)
+{
+ int ret = 1;
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current)))
+ ret = rt_mutex_slowtrylock(&lock->lock);
+
+ if (ret) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+
+int __sched rt_spin_trylock(spinlock_t *lock)
+{
+ return __rt_spin_trylock(lock);
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __sched rt_spin_trylock_bh(spinlock_t *lock)
+{
+ int ret;
+
+ local_bh_disable();
+ ret = __rt_spin_trylock(lock);
+ if (!ret)
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_spin_lock_init(spinlock_t *lock, const char *name,
+ struct lock_class_key *key, bool percpu)
+{
+ u8 type = percpu ? LD_LOCK_PERCPU : LD_LOCK_NORMAL;
+
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG,
+ LD_WAIT_INV, type);
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+#endif
+
+/*
+ * RT-specific reader/writer locks
+ */
+#define rwbase_set_and_save_current_state(state) \
+ current_save_and_set_rtlock_wait_state()
+
+#define rwbase_restore_current_state() \
+ current_restore_rtlock_saved_state()
+
+static __always_inline int
+rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+ return 0;
+}
+
+static __always_inline int
+rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
+{
+ rtlock_slowlock_locked(rtm);
+ return 0;
+}
+
+static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(rtm);
+}
+
+static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(rtm);
+}
+
+#define rwbase_signal_pending_state(state, current) (0)
+
+#define rwbase_pre_schedule()
+
+#define rwbase_schedule() \
+ schedule_rtlock()
+
+#define rwbase_post_schedule()
+
+#include "rwbase_rt.c"
+/*
+ * The common functions which get wrapped into the rwlock API.
+ */
+int __sched rt_read_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_read_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+int __sched rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_write_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+void __sched rt_read_lock(rwlock_t *rwlock)
+{
+ rtlock_might_resched();
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_read_lock);
+
+void __sched rt_write_lock(rwlock_t *rwlock)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock_nested);
+#endif
+
+void __sched rt_read_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+ rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+void __sched rt_write_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ rcu_read_unlock();
+ migrate_enable();
+ rwbase_write_unlock(&rwlock->rwbase);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_rwlock_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+ lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+#endif
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 3e82f449b4ff..78719e1ef1b1 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -9,13 +9,22 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/module.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/slab.h>
#include <linux/ww_mutex.h>
static DEFINE_WD_CLASS(ww_class);
struct workqueue_struct *wq;
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+#define ww_acquire_init_noinject(a, b) do { \
+ ww_acquire_init((a), (b)); \
+ (a)->deadlock_inject_countdown = ~0U; \
+ } while (0)
+#else
+#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b))
+#endif
+
struct test_mutex {
struct work_struct work;
struct ww_mutex mutex;
@@ -36,7 +45,7 @@ static void test_mutex_work(struct work_struct *work)
wait_for_completion(&mtx->go);
if (mtx->flags & TEST_MTX_TRY) {
- while (!ww_mutex_trylock(&mtx->mutex))
+ while (!ww_mutex_trylock(&mtx->mutex, NULL))
cond_resched();
} else {
ww_mutex_lock(&mtx->mutex, NULL);
@@ -109,19 +118,39 @@ static int test_mutex(void)
return 0;
}
-static int test_aa(void)
+static int test_aa(bool trylock)
{
struct ww_mutex mutex;
struct ww_acquire_ctx ctx;
int ret;
+ const char *from = trylock ? "trylock" : "lock";
ww_mutex_init(&mutex, &ww_class);
ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&mutex, &ctx);
+ if (!trylock) {
+ ret = ww_mutex_lock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial lock failed!\n", __func__);
+ goto out;
+ }
+ } else {
+ ret = !ww_mutex_trylock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial trylock failed!\n", __func__);
+ goto out;
+ }
+ }
- if (ww_mutex_trylock(&mutex)) {
- pr_err("%s: trylocked itself!\n", __func__);
+ if (ww_mutex_trylock(&mutex, NULL)) {
+ pr_err("%s: trylocked itself without context from %s!\n", __func__, from);
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ww_mutex_trylock(&mutex, &ctx)) {
+ pr_err("%s: trylocked itself with context from %s!\n", __func__, from);
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
@@ -129,17 +158,17 @@ static int test_aa(void)
ret = ww_mutex_lock(&mutex, &ctx);
if (ret != -EALREADY) {
- pr_err("%s: missed deadlock for recursing, ret=%d\n",
- __func__, ret);
+ pr_err("%s: missed deadlock for recursing, ret=%d from %s\n",
+ __func__, ret, from);
if (!ret)
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
}
+ ww_mutex_unlock(&mutex);
ret = 0;
out:
- ww_mutex_unlock(&mutex);
ww_acquire_fini(&ctx);
return ret;
}
@@ -150,7 +179,7 @@ struct test_abba {
struct ww_mutex b_mutex;
struct completion a_ready;
struct completion b_ready;
- bool resolve;
+ bool resolve, trylock;
int result;
};
@@ -160,8 +189,13 @@ static void test_abba_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err;
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba->b_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!abba->trylock)
+ ww_mutex_lock(&abba->b_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx);
complete(&abba->b_ready);
wait_for_completion(&abba->a_ready);
@@ -181,7 +215,7 @@ static void test_abba_work(struct work_struct *work)
abba->result = err;
}
-static int test_abba(bool resolve)
+static int test_abba(bool trylock, bool resolve)
{
struct test_abba abba;
struct ww_acquire_ctx ctx;
@@ -192,12 +226,18 @@ static int test_abba(bool resolve)
INIT_WORK_ONSTACK(&abba.work, test_abba_work);
init_completion(&abba.a_ready);
init_completion(&abba.b_ready);
+ abba.trylock = trylock;
abba.resolve = resolve;
schedule_work(&abba.work);
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba.a_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!trylock)
+ ww_mutex_lock(&abba.a_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx);
complete(&abba.a_ready);
wait_for_completion(&abba.b_ready);
@@ -249,7 +289,7 @@ static void test_cycle_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err, erra = 0;
- ww_acquire_init(&ctx, &ww_class);
+ ww_acquire_init_noinject(&ctx, &ww_class);
ww_mutex_lock(&cycle->a_mutex, &ctx);
complete(cycle->a_signal);
@@ -346,6 +386,19 @@ struct stress {
int nlocks;
};
+struct rnd_state rng;
+DEFINE_SPINLOCK(rng_lock);
+
+static inline u32 prandom_u32_below(u32 ceil)
+{
+ u32 ret;
+
+ spin_lock(&rng_lock);
+ ret = prandom_u32_state(&rng) % ceil;
+ spin_unlock(&rng_lock);
+ return ret;
+}
+
static int *get_random_order(int count)
{
int *order;
@@ -359,7 +412,7 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = get_random_int() % (n + 1);
+ r = prandom_u32_below(n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
@@ -412,21 +465,21 @@ retry:
ww_mutex_unlock(&locks[order[n]]);
if (err == -EDEADLK) {
- ww_mutex_lock_slow(&locks[order[contended]], &ctx);
- goto retry;
+ if (!time_after(jiffies, stress->timeout)) {
+ ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+ goto retry;
+ }
}
+ ww_acquire_fini(&ctx);
if (err) {
pr_err_once("stress (%s) failed with %d\n",
__func__, err);
break;
}
-
- ww_acquire_fini(&ctx);
} while (!time_after(jiffies, stress->timeout));
kfree(order);
- kfree(stress);
}
struct reorder_lock {
@@ -491,14 +544,13 @@ out:
list_for_each_entry_safe(ll, ln, &locks, link)
kfree(ll);
kfree(order);
- kfree(stress);
}
static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks);
int err;
do {
@@ -512,8 +564,6 @@ static void stress_one_work(struct work_struct *work)
break;
}
} while (!time_after(jiffies, stress->timeout));
-
- kfree(stress);
}
#define STRESS_INORDER BIT(0)
@@ -524,15 +574,24 @@ static void stress_one_work(struct work_struct *work)
static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
- int n;
+ struct stress *stress_array;
+ int n, count;
locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
if (!locks)
return -ENOMEM;
+ stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+ GFP_KERNEL);
+ if (!stress_array) {
+ kfree(locks);
+ return -ENOMEM;
+ }
+
for (n = 0; n < nlocks; n++)
ww_mutex_init(&locks[n], &ww_class);
+ count = 0;
for (n = 0; nthreads; n++) {
struct stress *stress;
void (*fn)(struct work_struct *work);
@@ -556,9 +615,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
if (!fn)
continue;
- stress = kmalloc(sizeof(*stress), GFP_KERNEL);
- if (!stress)
- break;
+ stress = &stress_array[count++];
INIT_WORK(&stress->work, fn);
stress->locks = locks;
@@ -573,6 +630,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
for (n = 0; n < nlocks; n++)
ww_mutex_destroy(&locks[n]);
+ kfree(stress_array);
kfree(locks);
return 0;
@@ -581,7 +639,11 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
static int __init test_ww_mutex_init(void)
{
int ncpus = num_online_cpus();
- int ret;
+ int ret, i;
+
+ printk(KERN_INFO "Beginning ww mutex selftests\n");
+
+ prandom_seed_state(&rng, get_random_u64());
wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
if (!wq)
@@ -591,17 +653,19 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = test_aa();
+ ret = test_aa(false);
if (ret)
return ret;
- ret = test_abba(false);
+ ret = test_aa(true);
if (ret)
return ret;
- ret = test_abba(true);
- if (ret)
- return ret;
+ for (i = 0; i < 4; i++) {
+ ret = test_abba(i & 1, i & 2);
+ if (ret)
+ return ret;
+ }
ret = test_cycle(ncpus);
if (ret)
@@ -615,10 +679,11 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
+ printk(KERN_INFO "All ww mutex selftests passed\n");
return 0;
}
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
new file mode 100644
index 000000000000..3ad2cc4823e5
--- /dev/null
+++ b/kernel/locking/ww_mutex.h
@@ -0,0 +1,569 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef WW_RT
+
+#define MUTEX mutex
+#define MUTEX_WAITER mutex_waiter
+
+static inline struct mutex_waiter *
+__ww_waiter_first(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_first_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_next_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_prev_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_last(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_last_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline void
+__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos)
+{
+ struct list_head *p = &lock->wait_list;
+ if (pos)
+ p = &pos->list;
+ __mutex_add_waiter(lock, waiter, p);
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct mutex *lock)
+{
+ return __mutex_owner(lock);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct mutex *lock)
+{
+ return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS;
+}
+
+static inline void lock_wait_lock(struct mutex *lock)
+{
+ raw_spin_lock(&lock->wait_lock);
+}
+
+static inline void unlock_wait_lock(struct mutex *lock)
+{
+ raw_spin_unlock(&lock->wait_lock);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
+{
+ lockdep_assert_held(&lock->wait_lock);
+}
+
+#else /* WW_RT */
+
+#define MUTEX rt_mutex
+#define MUTEX_WAITER rt_mutex_waiter
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_first(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_next(&w->tree.entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_prev(&w->tree.entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_last(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline void
+__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos)
+{
+ /* RT unconditionally adds the waiter first and then removes it on error */
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct rt_mutex *lock)
+{
+ return rt_mutex_owner(&lock->rtmutex);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return rt_mutex_has_waiters(&lock->rtmutex);
+}
+
+static inline void lock_wait_lock(struct rt_mutex *lock)
+{
+ raw_spin_lock(&lock->rtmutex.wait_lock);
+}
+
+static inline void unlock_wait_lock(struct rt_mutex *lock)
+{
+ raw_spin_unlock(&lock->rtmutex.wait_lock);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock)
+{
+ lockdep_assert_held(&lock->rtmutex.wait_lock);
+}
+
+#endif /* WW_RT */
+
+/*
+ * Wait-Die:
+ * The newer transactions are killed when:
+ * It (the new transaction) makes a request for a lock being held
+ * by an older transaction.
+ *
+ * Wound-Wait:
+ * The newer transactions are wounded when:
+ * An older transaction makes a request for a lock being held by
+ * the newer transaction.
+ */
+
+/*
+ * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
+ * it.
+ */
+static __always_inline void
+ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef DEBUG_WW_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+ ww->ctx = ww_ctx;
+}
+
+/*
+ * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task
+ * or, when of equal priority, a younger transaction than @b.
+ *
+ * Depending on the algorithm, @a will either need to wait for @b, or die.
+ */
+static inline bool
+__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
+{
+/*
+ * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI,
+ * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and
+ * isn't affected by this.
+ */
+#ifdef WW_RT
+ /* kernel prio; less is more */
+ int a_prio = a->task->prio;
+ int b_prio = b->task->prio;
+
+ if (rt_prio(a_prio) || rt_prio(b_prio)) {
+
+ if (a_prio > b_prio)
+ return true;
+
+ if (a_prio < b_prio)
+ return false;
+
+ /* equal static prio */
+
+ if (dl_prio(a_prio)) {
+ if (dl_time_before(b->task->dl.deadline,
+ a->task->dl.deadline))
+ return true;
+
+ if (dl_time_before(a->task->dl.deadline,
+ b->task->dl.deadline))
+ return false;
+ }
+
+ /* equal prio */
+ }
+#endif
+
+ /* FIFO order tie break -- bigger is younger */
+ return (signed long)(a->stamp - b->stamp) > 0;
+}
+
+/*
+ * Wait-Die; wake a lesser waiter context (when locks held) such that it can
+ * die.
+ *
+ * Among waiters with context, only the first one can have other locks acquired
+ * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
+ * __ww_mutex_check_kill() wake any but the earliest context.
+ */
+static bool
+__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ if (!ww_ctx->is_wait_die)
+ return false;
+
+ if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) {
+#ifndef WW_RT
+ debug_mutex_wake_waiter(lock, waiter);
+#endif
+ wake_up_process(waiter->task);
+ }
+
+ return true;
+}
+
+/*
+ * Wound-Wait; wound a lesser @hold_ctx if it holds the lock.
+ *
+ * Wound the lock holder if there are waiters with more important transactions
+ * than the lock holders. Even if multiple waiters may wound the lock holder,
+ * it's sufficient that only one does.
+ */
+static bool __ww_mutex_wound(struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct ww_acquire_ctx *hold_ctx)
+{
+ struct task_struct *owner = __ww_mutex_owner(lock);
+
+ lockdep_assert_wait_lock_held(lock);
+
+ /*
+ * Possible through __ww_mutex_add_waiter() when we race with
+ * ww_mutex_set_context_fastpath(). In that case we'll get here again
+ * through __ww_mutex_check_waiters().
+ */
+ if (!hold_ctx)
+ return false;
+
+ /*
+ * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
+ * it cannot go away because we'll have FLAG_WAITERS set and hold
+ * wait_lock.
+ */
+ if (!owner)
+ return false;
+
+ if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) {
+ hold_ctx->wounded = 1;
+
+ /*
+ * wake_up_process() paired with set_current_state()
+ * inserts sufficient barriers to make sure @owner either sees
+ * it's wounded in __ww_mutex_check_kill() or has a
+ * wakeup pending to re-read the wounded state.
+ */
+ if (owner != current)
+ wake_up_process(owner);
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We just acquired @lock under @ww_ctx, if there are more important contexts
+ * waiting behind us on the wait-list, check if they need to die, or wound us.
+ *
+ * See __ww_mutex_add_waiter() for the list-order construction; basically the
+ * list is ordered by stamp, smallest (oldest) first.
+ *
+ * This relies on never mixing wait-die/wound-wait on the same wait-list;
+ * which is currently ensured by that being a ww_class property.
+ *
+ * The current task must not be on the wait list.
+ */
+static void
+__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct MUTEX_WAITER *cur;
+
+ lockdep_assert_wait_lock_held(lock);
+
+ for (cur = __ww_waiter_first(lock); cur;
+ cur = __ww_waiter_next(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_mutex_die(lock, cur, ww_ctx) ||
+ __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
+ break;
+ }
+}
+
+/*
+ * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
+ * and wake up any waiters so they can recheck.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ ww_mutex_lock_acquired(lock, ctx);
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the WAITERS check is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* See comments above and below. */
+
+ /*
+ * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
+ * MB MB
+ * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
+ *
+ * The memory barrier above pairs with the memory barrier in
+ * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
+ * and/or !empty list.
+ */
+ if (likely(!__ww_mutex_has_waiters(&lock->base)))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, check if any of the waiters need to
+ * die or wound us.
+ */
+ lock_wait_lock(&lock->base);
+ __ww_mutex_check_waiters(&lock->base, ctx);
+ unlock_wait_lock(&lock->base);
+}
+
+static __always_inline int
+__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ if (ww_ctx->acquired > 0) {
+#ifdef DEBUG_WW_MUTEXES
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+ ww_ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+/*
+ * Check the wound condition for the current lock acquire.
+ *
+ * Wound-Wait: If we're wounded, kill ourself.
+ *
+ * Wait-Die: If we're trying to acquire a lock already held by an older
+ * context, kill ourselves.
+ *
+ * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
+ * look at waiters before us in the wait-list.
+ */
+static inline int
+__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+ struct MUTEX_WAITER *cur;
+
+ if (ctx->acquired == 0)
+ return 0;
+
+ if (!ctx->is_wait_die) {
+ if (ctx->wounded)
+ return __ww_mutex_kill(lock, ctx);
+
+ return 0;
+ }
+
+ if (hold_ctx && __ww_ctx_less(ctx, hold_ctx))
+ return __ww_mutex_kill(lock, ctx);
+
+ /*
+ * If there is a waiter in front of us that has a context, then its
+ * stamp is earlier than ours and we must kill ourself.
+ */
+ for (cur = __ww_waiter_prev(lock, waiter); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ return __ww_mutex_kill(lock, ctx);
+ }
+
+ return 0;
+}
+
+/*
+ * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
+ * first. Such that older contexts are preferred to acquire the lock over
+ * younger contexts.
+ *
+ * Waiters without context are interspersed in FIFO order.
+ *
+ * Furthermore, for Wait-Die kill ourself immediately when possible (there are
+ * older contexts already waiting) to avoid unnecessary waiting and for
+ * Wound-Wait ensure we wound the owning context when it is younger.
+ */
+static inline int
+__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
+ struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ struct MUTEX_WAITER *cur, *pos = NULL;
+ bool is_wait_die;
+
+ if (!ww_ctx) {
+ __ww_waiter_add(lock, waiter, NULL);
+ return 0;
+ }
+
+ is_wait_die = ww_ctx->is_wait_die;
+
+ /*
+ * Add the waiter before the first waiter with a higher stamp.
+ * Waiters without a context are skipped to avoid starving
+ * them. Wait-Die waiters may die here. Wound-Wait waiters
+ * never die here, but they are sorted in stamp order and
+ * may wound the lock holder.
+ */
+ for (cur = __ww_waiter_last(lock); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) {
+ /*
+ * Wait-Die: if we find an older context waiting, there
+ * is no point in queueing behind it, as we'd have to
+ * die the moment it would acquire the lock.
+ */
+ if (is_wait_die) {
+ int ret = __ww_mutex_kill(lock, ww_ctx);
+
+ if (ret)
+ return ret;
+ }
+
+ break;
+ }
+
+ pos = cur;
+
+ /* Wait-Die: ensure younger waiters die. */
+ __ww_mutex_die(lock, cur, ww_ctx);
+ }
+
+ __ww_waiter_add(lock, waiter, pos);
+
+ /*
+ * Wound-Wait: if we're blocking on a mutex owned by a younger context,
+ * wound that such that we might proceed.
+ */
+ if (!is_wait_die) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+
+ /*
+ * See ww_mutex_set_context_fastpath(). Orders setting
+ * MUTEX_FLAG_WAITERS vs the ww->ctx load,
+ * such that either we or the fastpath will wound @ww->ctx.
+ */
+ smp_mb();
+ __ww_mutex_wound(lock, ww_ctx, ww->ctx);
+ }
+
+ return 0;
+}
+
+static inline void __ww_mutex_unlock(struct ww_mutex *lock)
+{
+ if (lock->ctx) {
+#ifdef DEBUG_WW_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+}
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
new file mode 100644
index 000000000000..c7196de838ed
--- /dev/null
+++ b/kernel/locking/ww_rt_mutex.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#define WW_RT
+#include "rtmutex.c"
+
+int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ if (!ww_ctx)
+ return rt_mutex_trylock(rtm);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__rt_mutex_trylock(&rtm->rtmutex)) {
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ww_mutex_trylock);
+
+static int __sched
+__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ unsigned int state, unsigned long ip)
+{
+ struct lockdep_map __maybe_unused *nest_lock = NULL;
+ struct rt_mutex *rtm = &lock->base;
+ int ret;
+
+ might_sleep();
+
+ if (ww_ctx) {
+ if (unlikely(ww_ctx == READ_ONCE(lock->ctx)))
+ return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
+ }
+ mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
+
+ if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
+ if (ww_ctx)
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ return 0;
+ }
+
+ ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state);
+
+ if (ret)
+ mutex_release(&rtm->dep_map, ip);
+ return ret;
+}
+
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock);
+
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
+
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ __ww_mutex_unlock(lock);
+
+ mutex_release(&rtm->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&rtm->rtmutex);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);