From 63dc47e956b464e0ed3282f6e70974eebf850180 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:04 -0800 Subject: locking/mutex: Checking the stamp is WW only Mark it so by renaming __mutex_lock_check_stamp(). Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1420573509-24774-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 454195194d4a..b042ea57bbea 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -469,7 +469,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) EXPORT_SYMBOL(ww_mutex_unlock); static inline int __sched -__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); @@ -557,7 +557,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } if (use_ww_ctx && ww_ctx->acquired > 0) { - ret = __mutex_lock_check_stamp(lock, ww_ctx); + ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; } -- cgit From e42f678a0237f84f0004fbaf0fad0b844751eadd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:05 -0800 Subject: locking/mutex: Move MCS related comments to proper location It serves much better if the comments are right before the osq_lock() call. Also delete a useless comment. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1420573509-24774-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b042ea57bbea..6db3d0dea6da 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -193,17 +193,6 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * In order to avoid a stampede of mutex spinners from acquiring the mutex - * more or less simultaneously, the spinners need to acquire a MCS lock - * first before spinning on the owner field. - * - */ - -/* - * Mutex spinning code migrated from kernel/sched/core.c - */ - static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { if (lock->owner != owner) @@ -307,6 +296,11 @@ static bool mutex_optimistic_spin(struct mutex *lock, if (!mutex_can_spin_on_owner(lock)) goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ if (!osq_lock(&lock->osq)) goto done; -- cgit From 4bd19084faa61a8c68586e74f03f5776179f65c2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:06 -0800 Subject: locking/mutex: Introduce ww_mutex_set_context_slowpath() ... which is equivalent to the fastpath counter part. This mainly allows getting some WW specific code out of generic mutex paths. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. 
McKenney" Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1420573509-24774-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 6db3d0dea6da..c67a60b61625 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, } /* - * after acquiring lock with fastpath or when we lost out in contested + * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. * * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, @@ -191,6 +191,30 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, spin_unlock_mutex(&lock->base.wait_lock, flags); } +/* + * After acquiring lock in the slowpath set ctx and wake up any + * waiters so they can recheck. + * + * Callers must hold the mutex wait_lock. + */ +static __always_inline void +ww_mutex_set_context_slowpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + lock->ctx = ctx; + + /* + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. + */ + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } +} #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) @@ -576,23 +600,7 @@ skip_wait: if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct mutex_waiter *cur; - - /* - * This branch gets optimized out for the common case, - * and is only important for ww_mutex_lock. - */ - ww_mutex_lock_acquired(ww, ww_ctx); - ww->ctx = ww_ctx; - - /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. - */ - list_for_each_entry(cur, &lock->wait_list, list) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } + ww_mutex_set_context_slowpath(ww, ww_ctx); } spin_unlock_mutex(&lock->wait_lock, flags); -- cgit From d84b6728c54dcf73bcef3e3f7cf6767e2d224e39 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:07 -0800 Subject: locking/mcs: Better differentiate between MCS variants We have two flavors of the MCS spinlock: standard and cancelable (OSQ). While each one is independent of the other, we currently mix and match them. This patch: - Moves the OSQ code out of mcs_spinlock.h (which only deals with the traditional version) into include/linux/osq_lock.h. No unnecessary code is added to the more global header file, anything locks that make use of OSQ must include it anyway. - Renames mcs_spinlock.c to osq_lock.c. This file only contains osq code. - Introduces a CONFIG_LOCK_SPIN_ON_OWNER in order to only build osq_lock if there is support for it. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: "Paul E. 
McKenney" Cc: Jason Low Cc: Linus Torvalds Cc: Mikulas Patocka Cc: Waiman Long Link: http://lkml.kernel.org/r/1420573509-24774-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 4 + kernel/locking/Makefile | 3 +- kernel/locking/mcs_spinlock.c | 208 ------------------------------------------ kernel/locking/mcs_spinlock.h | 16 ---- kernel/locking/osq_lock.c | 203 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 209 insertions(+), 225 deletions(-) delete mode 100644 kernel/locking/mcs_spinlock.c create mode 100644 kernel/locking/osq_lock.c (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..08561f1acd13 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER def_bool y depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW +config LOCK_SPIN_ON_OWNER + def_bool y + depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER + config ARCH_USE_QUEUE_RWLOCK bool diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..4ca8eb151975 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c deleted file mode 100644 index 9887a905a762..000000000000 --- a/kernel/locking/mcs_spinlock.c +++ /dev/null @@ -1,208 +0,0 @@ -#include -#include -#include "mcs_spinlock.h" - -#ifdef CONFIG_SMP - -/* - * An MCS like lock especially tailored for optimistic spinning for sleeping - * lock implementations (mutex, rwsem, etc). - * - * Using a single mcs node per CPU is safe because sleeping locks should not be - * called from interrupt context and we have preemption disabled while - * spinning. - */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); - -/* - * We use the value 0 to represent "no CPU", thus the encoded value - * will be the CPU number incremented by 1. - */ -static inline int encode_cpu(int cpu_nr) -{ - return cpu_nr + 1; -} - -static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) -{ - int cpu_nr = encoded_cpu_val - 1; - - return per_cpu_ptr(&osq_node, cpu_nr); -} - -/* - * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. - * Can return NULL in case we were the last queued and we updated @lock instead. - */ -static inline struct optimistic_spin_node * -osq_wait_next(struct optimistic_spin_queue *lock, - struct optimistic_spin_node *node, - struct optimistic_spin_node *prev) -{ - struct optimistic_spin_node *next = NULL; - int curr = encode_cpu(smp_processor_id()); - int old; - - /* - * If there is a prev node in queue, then the 'old' value will be - * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if - * we're currently last in queue, then the queue will then become empty. - */ - old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; - - for (;;) { - if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg(&lock->tail, curr, old) == curr) { - /* - * We were the last queued, we moved @lock back. 
@prev - * will now observe @lock and will complete its - * unlock()/unqueue(). - */ - break; - } - - /* - * We must xchg() the @node->next value, because if we were to - * leave it in, a concurrent unlock()/unqueue() from - * @node->next might complete Step-A and think its @prev is - * still valid. - * - * If the concurrent unlock()/unqueue() wins the race, we'll - * wait for either @lock to point to us, through its Step-B, or - * wait for a new @node->next from its Step-C. - */ - if (node->next) { - next = xchg(&node->next, NULL); - if (next) - break; - } - - cpu_relax_lowlatency(); - } - - return next; -} - -bool osq_lock(struct optimistic_spin_queue *lock) -{ - struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_node *prev, *next; - int curr = encode_cpu(smp_processor_id()); - int old; - - node->locked = 0; - node->next = NULL; - node->cpu = curr; - - old = atomic_xchg(&lock->tail, curr); - if (old == OSQ_UNLOCKED_VAL) - return true; - - prev = decode_cpu(old); - node->prev = prev; - ACCESS_ONCE(prev->next) = node; - - /* - * Normally @prev is untouchable after the above store; because at that - * moment unlock can proceed and wipe the node element from stack. - * - * However, since our nodes are static per-cpu storage, we're - * guaranteed their existence -- this allows us to apply - * cmpxchg in an attempt to undo our queueing. - */ - - while (!smp_load_acquire(&node->locked)) { - /* - * If we need to reschedule bail... so we can block. - */ - if (need_resched()) - goto unqueue; - - cpu_relax_lowlatency(); - } - return true; - -unqueue: - /* - * Step - A -- stabilize @prev - * - * Undo our @prev->next assignment; this will make @prev's - * unlock()/unqueue() wait for a next pointer since @lock points to us - * (or later). - */ - - for (;;) { - if (prev->next == node && - cmpxchg(&prev->next, node, NULL) == node) - break; - - /* - * We can only fail the cmpxchg() racing against an unlock(), - * in which case we should observe @node->locked becomming - * true. - */ - if (smp_load_acquire(&node->locked)) - return true; - - cpu_relax_lowlatency(); - - /* - * Or we race against a concurrent unqueue()'s step-B, in which - * case its step-C will write us a new @node->prev pointer. - */ - prev = ACCESS_ONCE(node->prev); - } - - /* - * Step - B -- stabilize @next - * - * Similar to unlock(), wait for @node->next or move @lock from @node - * back to @prev. - */ - - next = osq_wait_next(lock, node, prev); - if (!next) - return false; - - /* - * Step - C -- unlink - * - * @prev is stable because its still waiting for a new @prev->next - * pointer, @next is stable because our @node->next pointer is NULL and - * it will wait in Step-A. - */ - - ACCESS_ONCE(next->prev) = prev; - ACCESS_ONCE(prev->next) = next; - - return false; -} - -void osq_unlock(struct optimistic_spin_queue *lock) -{ - struct optimistic_spin_node *node, *next; - int curr = encode_cpu(smp_processor_id()); - - /* - * Fast path for the uncontended case. - */ - if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) - return; - - /* - * Second most likely case. 
- */ - node = this_cpu_ptr(&osq_node); - next = xchg(&node->next, NULL); - if (next) { - ACCESS_ONCE(next->locked) = 1; - return; - } - - next = osq_wait_next(lock, node, NULL); - if (next) - ACCESS_ONCE(next->locked) = 1; -} - -#endif - diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 4d60986fcbee..d1fe2ba5bac9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } -/* - * Cancellable version of the MCS lock above. - * - * Intended for adaptive spinning of sleeping locks: - * mutex_lock()/rwsem_down_{read,write}() etc. - */ - -struct optimistic_spin_node { - struct optimistic_spin_node *next, *prev; - int locked; /* 1 if lock acquired */ - int cpu; /* encoded CPU # value */ -}; - -extern bool osq_lock(struct optimistic_spin_queue *lock); -extern void osq_unlock(struct optimistic_spin_queue *lock); - #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c new file mode 100644 index 000000000000..ec83d4db8ec6 --- /dev/null +++ b/kernel/locking/osq_lock.c @@ -0,0 +1,203 @@ +#include +#include +#include + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); + +/* + * We use the value 0 to represent "no CPU", thus the encoded value + * will be the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + + return per_cpu_ptr(&osq_node, cpu_nr); +} + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_queue *lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev) +{ + struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(smp_processor_id()); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; + + for (;;) { + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg(&lock->tail, curr, old) == curr) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. 
+ */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + cpu_relax_lowlatency(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(smp_processor_id()); + int old; + + node->locked = 0; + node->next = NULL; + node->cpu = curr; + + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) + return true; + + prev = decode_cpu(old); + node->prev = prev; + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + cpu_relax_lowlatency(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + cpu_relax_lowlatency(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node, *next; + int curr = encode_cpu(smp_processor_id()); + + /* + * Fast path for the uncontended case. + */ + if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) + return; + + /* + * Second most likely case. + */ + node = this_cpu_ptr(&osq_node); + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} -- cgit From 036cc30c6b6af1cd42de6c34c4461f17da01cbf7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:09 -0800 Subject: locking/osq: No need for load/acquire when acquire-polling Both mutexes and rwsems took a performance hit when we switched over from the original mcs code to the cancelable variant (osq). The reason being the use of smp_load_acquire() when polling for node->locked. This is not needed as reordering is not an issue, as such, relax the barrier semantics. 
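To make the OSQ diff above easier to follow, here is a minimal userspace sketch of the MCS queueing idea it implements: each spinner queues a node of its own and polls only that node's locked flag, instead of hammering a shared lock word. The struct names, C11 atomics and layout are illustrative assumptions, not the kernel's osq_lock()/osq_unlock() API, and this standalone version keeps the acquire ordering on the polling load, whereas the kernel can relax it for the reasons this changelog goes on to explain.

/*
 * Minimal MCS queued-spinlock sketch (userspace analogue, C11 atomics).
 * Illustrative only -- names and layout are assumptions, not the kernel's.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
	_Atomic(struct mcs_node *) next;
	atomic_bool locked;		/* set true once we own the lock */
};

struct mcs_lock {
	_Atomic(struct mcs_node *) tail;	/* last queued node, or NULL */
};

static void mcs_lock_acquire(struct mcs_lock *lock, struct mcs_node *node)
{
	struct mcs_node *prev;

	atomic_store_explicit(&node->next, NULL, memory_order_relaxed);
	atomic_store_explicit(&node->locked, false, memory_order_relaxed);

	/* Queue ourselves at the tail; the exchange orders the node init. */
	prev = atomic_exchange(&lock->tail, node);
	if (!prev)
		return;			/* queue was empty: lock acquired */

	/* Publish our node to our predecessor. */
	atomic_store_explicit(&prev->next, node, memory_order_release);

	/*
	 * Spin on our own node. A standalone lock wants the acquire here;
	 * the kernel's OSQ relaxes it because the sleeping lock it guards
	 * supplies the ACQUIRE, as described in the changelog.
	 */
	while (!atomic_load_explicit(&node->locked, memory_order_acquire))
		;			/* cpu_relax() in the kernel */
}

static void mcs_lock_release(struct mcs_lock *lock, struct mcs_node *node)
{
	struct mcs_node *next = atomic_load_explicit(&node->next, memory_order_acquire);

	if (!next) {
		/* No known successor: try to reset the queue to empty. */
		struct mcs_node *expected = node;
		if (atomic_compare_exchange_strong(&lock->tail, &expected, NULL))
			return;
		/* Someone queued concurrently; wait for their next-link. */
		while (!(next = atomic_load_explicit(&node->next, memory_order_acquire)))
			;
	}
	atomic_store_explicit(&next->locked, true, memory_order_release);
}

Each thread passes a node it owns (per-CPU storage in the kernel) to both calls, which is what lets the unqueue/cancel logic in osq_lock() above reason about node lifetimes.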
Paul describes the scenario nicely: https://lkml.org/lkml/2013/11/19/405 - If we start polling before the insertion is complete, all that happens is that the first few polls have no chance of seeing a lock grant. - Ordering the polling against the initialization -- the above xchg() is already doing that for us. The smp_load_acquire() when unqueuing make sense. In addition, we don't need to worry about leaking the critical region as osq is only used internally. This impacts both regular and large levels of concurrency, ie on a 40 core system with a disk intensive workload: disk-1 804.83 ( 0.00%) 828.16 ( 2.90%) disk-61 8063.45 ( 0.00%) 18181.82 (125.48%) disk-121 7187.41 ( 0.00%) 20119.17 (179.92%) disk-181 6933.32 ( 0.00%) 20509.91 (195.82%) disk-241 6850.81 ( 0.00%) 20397.80 (197.74%) disk-301 6815.22 ( 0.00%) 20287.58 (197.68%) disk-361 7080.40 ( 0.00%) 20205.22 (185.37%) disk-421 7076.13 ( 0.00%) 19957.33 (182.04%) disk-481 7083.25 ( 0.00%) 19784.06 (179.31%) disk-541 7038.39 ( 0.00%) 19610.92 (178.63%) disk-601 7072.04 ( 0.00%) 19464.53 (175.23%) disk-661 7010.97 ( 0.00%) 19348.23 (175.97%) disk-721 7069.44 ( 0.00%) 19255.33 (172.37%) disk-781 7007.58 ( 0.00%) 19103.14 (172.61%) disk-841 6981.18 ( 0.00%) 18964.22 (171.65%) disk-901 6968.47 ( 0.00%) 18826.72 (170.17%) disk-961 6964.61 ( 0.00%) 18708.02 (168.62%) Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1420573509-24774-7-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/osq_lock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index ec83d4db8ec6..c112d00341b0 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) * cmpxchg in an attempt to undo our queueing. */ - while (!smp_load_acquire(&node->locked)) { + while (!ACCESS_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. */ -- cgit From 0f1ba9a2cea52896448ef4e14a4cb1880b8e5bee Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 7 Jan 2015 10:04:41 +0100 Subject: softirq/preempt: Add missing current->preempt_disable_ip update While debugging some "sleeping function called from invalid context" bug I realized that the debugging message "Preemption disabled at:" pointed to an incorrect function. In particular if the last function/action that disabled preemption was spin_lock_bh() then current->preempt_disable_ip won't be updated. The reason for this is that __local_bh_disable_ip() will increase preempt_count manually instead of calling preempt_count_add(), which would handle the update correctly. It look like the manual handling was done to work around some lockdep issue. So add the missing update of current->preempt_disable_ip to __local_bh_disable_ip() as well. Signed-off-by: Heiko Carstens Signed-off-by: Peter Zijlstra (Intel) Cc: Paul E. 
McKenney Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150107090441.GC4365@osiris Signed-off-by: Ingo Molnar --- kernel/softirq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 501baa9ac1be..b9ae81643f1d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) trace_softirqs_off(ip); raw_local_irq_restore(flags); - if (preempt_count() == cnt) + if (preempt_count() == cnt) { +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); +#endif trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + } } EXPORT_SYMBOL(__local_bh_disable_ip); #endif /* CONFIG_TRACE_IRQFLAGS */ -- cgit From 996636ddae5cab8883bd76b996cd4f2ea9a152be Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 16 Jan 2015 20:28:06 +0100 Subject: futex: Fix argument handling in futex_lock_pi() calls This patch fixes two separate buglets in calls to futex_lock_pi(): * Eliminate unused 'detect' argument * Change unused 'timeout' argument of FUTEX_TRYLOCK_PI to NULL The 'detect' argument of futex_lock_pi() seems never to have been used (when it was included with the initial PI mutex implementation in Linux 2.6.18, all checks against its value were disabled by ANDing against 0 (i.e., if (detect... && 0)), and with commit 778e9a9c3e7193ea9f434f382947155ffb59c755, any mention of this argument in futex_lock_pi() went way altogether. Its presence now serves only to confuse readers of the code, by giving the impression that the futex() FUTEX_LOCK_PI operation actually does use the 'val' argument. This patch removes the argument. The futex_lock_pi() call that corresponds to FUTEX_TRYLOCK_PI includes 'timeout' as one of its arguments. This misleads the reader into thinking that the FUTEX_TRYLOCK_PI operation does employ timeouts for some sensible purpose; but it does not. Indeed, it cannot, because the checks at the start of sys_futex() exclude FUTEX_TRYLOCK_PI from the set of operations that do copy_from_user() on the timeout argument. So, in the FUTEX_TRYLOCK_PI futex_lock_pi() call it would be simplest to change 'timeout' to 'NULL'. This patch does that. Signed-off-by: Michael Kerrisk Reviewed-by: Darren Hart Link: http://lkml.kernel.org/r/54B96646.8010200@gmail.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 63678b573d61..4eeb63de7e54 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) 
*/ -static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: - return futex_lock_pi(uaddr, flags, val, timeout, 0); + return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: - return futex_lock_pi(uaddr, flags, 0, timeout, 1); + return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, -- cgit From 51587bcf31d070119d37de6103543c807f5ccdb3 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 19 Jan 2015 17:39:21 -0800 Subject: locking/mutex: Explicitly mark task as running after wakeup By the time we wake up and get the lock after being asleep in the slowpath, we better be running. As good practice, be explicit about this and avoid any mischief. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1421717961.4903.11.camel@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c67a60b61625..57407062e209 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -587,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } + __set_task_state(task, TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current_thread_info()); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) -- cgit From de30ec47302c101c7badc8fe687641fd75e596e7 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sat, 17 Jan 2015 05:05:34 +0100 Subject: sched/completion: Remove unnecessary ->wait.lock serialization when reading completion state Signed-off-by: Nicholas Mc Guire Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1421467534-22834-1-git-send-email-der.herr@hofr.at Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 607f852b4d04..9d1fe32da232 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -288,13 +288,6 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; + return !!ACCESS_ONCE(x->done); } EXPORT_SYMBOL(completion_done); -- cgit From 7c34e3180a01c800a40bc8535654d5735802fc1b Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 23 Jan 2015 12:41:47 +0100 Subject: sched/completion: Add lock-free checking of the blocking case The "thread would block" case can be checked without grabbing ->wait.lock. [ If the check does not return early then grab the lock and recheck. A memory barrier is not needed as complete() and complete_all() imply a barrier. 
The ACCESS_ONCE() is needed for calls in a loop that, if inlined, could optimize out the re-fetching of x->done. ] Signed-off-by: Nicholas Mc Guire Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422013307-13200-1-git-send-email-der.herr@hofr.at Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 9d1fe32da232..7052d3fd4e7b 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; + /* + * Since x->done will need to be locked only + * in the non-blocking case, we check x->done + * first without taking the lock so we can + * return early in the blocking case. + */ + if (!ACCESS_ONCE(x->done)) + return 0; + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; -- cgit From 73105994c57d06e40a33ab5a716db04e898b4c05 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 25 Jan 2015 23:36:04 -0800 Subject: locking/rwsem: Use task->state helpers Call __set_task_state() instead of assigning the new state directly. These interfaces also aid CONFIG_DEBUG_ATOMIC_SLEEP environments, keeping track of who last changed the state. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Jason Low Cc: Michel Lespinasse Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422257769-14083-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-spinlock.c | 2 +- kernel/locking/rwsem-xadd.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2c93571162cb..2555ae15ec14 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem) set_task_state(tsk, TASK_UNINTERRUPTIBLE); } - tsk->state = TASK_RUNNING; + __set_task_state(tsk, TASK_RUNNING); out: ; } diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 7628c3fc37ca..2f7cc4076f50 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) schedule(); } - tsk->state = TASK_RUNNING; - + __set_task_state(tsk, TASK_RUNNING); return sem; } EXPORT_SYMBOL(rwsem_down_read_failed); -- cgit From afffc6c1805d98e08e778cddb644a666e78cfcfd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 1 Feb 2015 22:16:24 -0800 Subject: locking/rtmutex: Optimize setting task running after being blocked We explicitly mark the task running after returning from a __rt_mutex_slowlock() call, which does the actual sleeping via wait-wake-trylocking. As such, this patch does two things: (1) refactors the code so that setting current to TASK_RUNNING is done by __rt_mutex_slowlock(), and not by the callers. The downside to this is that it becomes a bit unclear when at what point we block. As such I've added a comment that the task blocks when calling __rt_mutex_slowlock() so readers can figure out when it is running again. (2) relaxes setting current's state through __set_current_state(), instead of it's more expensive barrier alternative. There was no need for the implied barrier as we're obviously not planning on blocking. 
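The completion changes above share one shape: check the state without the wait-queue lock, and only take the lock when we might actually consume a completion. A hedged userspace analogue of that pattern, using C11 atomics and a pthread mutex with invented names (this is not the kernel's completion API), might look like:

/*
 * Userspace analogue of try_wait_for_completion()'s lock-free fast path.
 * Illustrative names only.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct toy_completion {
	atomic_uint done;		/* outstanding completions */
	pthread_mutex_t lock;		/* serializes consuming 'done' */
};

static bool toy_try_wait(struct toy_completion *c)
{
	bool ret = true;

	/* Blocking case: bail out early without touching the lock. */
	if (!atomic_load(&c->done))
		return false;

	/* Recheck under the lock before consuming, since another
	 * waiter may have raced with us in the meantime. */
	pthread_mutex_lock(&c->lock);
	if (!atomic_load(&c->done))
		ret = false;
	else
		atomic_fetch_sub(&c->done, 1);
	pthread_mutex_unlock(&c->lock);

	return ret;
}

static bool toy_completion_done(struct toy_completion *c)
{
	/* A snapshot needs no serialization, as in the patches above. */
	return atomic_load(&c->done) != 0;
}

The early return handles the "thread would block" case without ever taking the lock, which is exactly the saving the two completion patches above are after.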
Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/1422857784.18096.1.camel@stgolabs.net
Signed-off-by: Ingo Molnar
---
 kernel/locking/rtmutex.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7c98873a3077..3059bc2f022d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
 		set_current_state(state);
 	}
 
+	__set_current_state(TASK_RUNNING);
 	return ret;
 }
 
@@ -1188,10 +1189,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
 
 	if (likely(!ret))
+		/* sleep on the mutex */
 		ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
 
-	set_current_state(TASK_RUNNING);
-
 	if (unlikely(ret)) {
 		remove_waiter(lock, &waiter);
 		rt_mutex_handle_deadlock(ret, chwalk, &waiter);
@@ -1626,10 +1626,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 
 	set_current_state(TASK_INTERRUPTIBLE);
 
+	/* sleep on the mutex */
 	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
 
-	set_current_state(TASK_RUNNING);
-
 	if (unlikely(ret))
 		remove_waiter(lock, waiter);
 
-- cgit
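Taken together, the mutex, rwsem and rtmutex patches above keep tidying the same slowpath shape: sleep in a loop while the lock is contended, then mark the task running exactly once after the loop. As a rough userspace sketch of that shape, here is a toy futex-based lock; the names and state encoding are assumptions chosen for illustration and this is not the kernel implementation.

/*
 * Toy futex-based mutex (Linux userspace sketch). The slowpath mirrors
 * the structure of __mutex_lock_common()/__rt_mutex_slowlock(): loop
 * { try to take it; otherwise sleep }, and only after the loop are we
 * "running" with the lock held. Names are ours.
 */
#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

struct toy_mutex {
	atomic_int state;	/* 0 = unlocked, 1 = locked, 2 = locked with waiters */
};

static void toy_futex(atomic_int *addr, int op, int val)
{
	syscall(SYS_futex, addr, op, val, NULL, NULL, 0);
}

static void toy_mutex_lock(struct toy_mutex *m)
{
	int unlocked = 0;

	/* Fastpath: uncontended acquire. */
	if (atomic_compare_exchange_strong(&m->state, &unlocked, 1))
		return;

	/* Slowpath: mark the lock contended and sleep until we get it. */
	while (atomic_exchange(&m->state, 2) != 0)
		toy_futex(&m->state, FUTEX_WAIT_PRIVATE, 2);	/* ~ schedule() */

	/* This is the analogue of the single __set_task_state(TASK_RUNNING)
	 * / __set_current_state(TASK_RUNNING) after the kernel's wait loop:
	 * the sleeping is over and we now own the lock. */
}

static void toy_mutex_unlock(struct toy_mutex *m)
{
	/* If anyone may have been sleeping, wake one waiter. */
	if (atomic_exchange(&m->state, 0) == 2)
		toy_futex(&m->state, FUTEX_WAKE_PRIVATE, 1);
}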