From 9642d18eee2cd169b60c6ac0f20bda745b5a3d1e Mon Sep 17 00:00:00 2001 From: Vatika Harlalka Date: Tue, 1 Sep 2015 16:50:59 +0200 Subject: nohz: Affine unpinned timers to housekeepers The problem addressed in this patch is about affining unpinned timers. Adaptive or Full Dynticks CPUs are currently disturbed by unnecessary jitter due to firing of such timers on them. This patch will affine timers to online CPUs which are not full dynticks in NOHZ_FULL configured systems. It should not introduce overhead in nohz full off case due to static keys. Signed-off-by: Vatika Harlalka Signed-off-by: Frederic Weisbecker Reviewed-by: Preeti U Murthy Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1441119060-2230-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b864ecee0e1..0902e4d72671 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -623,18 +623,21 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu)) + if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) return cpu; rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i)) { + if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { cpu = i; goto unlock; } } } + + if (!is_housekeeping_cpu(cpu)) + cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); return cpu; -- cgit From 7c8bb6cb95061b3143759459ed6c6b0c73bcfecb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 1 Sep 2015 16:51:00 +0200 Subject: nohz: Assert existing housekeepers when nohz full enabled The code ensures that when nohz full is running, at least the boot CPU serves as a housekeeper and it can't be later offlined. Let's assert this assumption to make sure that we have CPUs to handle unbound jobs like workqueues and timers while nohz full CPUs run undisturbed. Also improve the comments on housekeeper offlining prevention. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Vatika Harlalka Link: http://lkml.kernel.org/r/1441119060-2230-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3319e16f31e5..7c7ec4515983 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -290,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str) __setup("nohz_full=", tick_nohz_full_setup); static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: /* - * If we handle the timekeeping duty for full dynticks CPUs, - * we can't safely shutdown that CPU. + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return NOTIFY_BAD; @@ -370,6 +371,12 @@ void __init tick_nohz_init(void) cpu_notifier(tick_nohz_cpu_down_callback, 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); + + /* + * We need at least one CPU to handle housekeeping work such + * as timekeeping, unbound timers, workqueues, ... + */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif -- cgit From 21dd33b09c61597df603c654589adffd7955491a Mon Sep 17 00:00:00 2001 From: Lina Iyer Date: Wed, 2 Sep 2015 16:18:57 -0600 Subject: kernel/cpu_pm: fix cpu_cluster_pm_exit comment cpu_cluster_pm_exit() must be sent after cpu_cluster_pm_enter() has been sent for the cluster and before any cpu_pm_exit() notifications are sent for any CPU. Cc: Nicolas Pitre Acked-by: Kevin Hilman Signed-off-by: Lina Iyer Signed-off-by: Rafael J. Wysocki --- kernel/cpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 9656a3c36503..009cc9a17d95 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * low power state that may have caused some blocks in the same power domain * to reset. * - * Must be called after cpu_pm_exit has been called on all cpus in the power + * Must be called after cpu_cluster_pm_enter has been called for the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller * and its PM extensions, local CPU timers context save/restore which -- cgit From 43b3f02899f74ae9914a39547cc5492156f0027a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Sep 2015 17:25:23 +0200 Subject: locking/qspinlock/x86: Fix performance regression under unaccelerated VMs Dave ran into horrible performance on a VM without PARAVIRT_SPINLOCKS set and Linus noted that the test-and-set implementation was retarded. One should spin on the variable with a load, not a RMW. While there, remove 'queued' from the name, as the lock isn't queued at all, but a simple test-and-set. Suggested-by: Linus Torvalds Reported-by: Dave Chinner Tested-by: Dave Chinner Signed-off-by: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: stable@vger.kernel.org # v4.2+ Link: http://lkml.kernel.org/r/20150904152523.GR18673@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 337c8818541d..87e9ce6a63c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (pv_enabled()) goto queue; - if (virt_queued_spin_lock(lock)) + if (virt_spin_lock(lock)) return; /* -- cgit From 5473e0cc37c03c576adbda7591a6cc8e37c1bb7f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 28 Aug 2015 14:55:56 +0800 Subject: sched: 'Annotate' migrate_tasks() Kernel testing triggered this warning: | WARNING: CPU: 0 PID: 13 at kernel/sched/core.c:1156 do_set_cpus_allowed+0x7e/0x80() | Modules linked in: | CPU: 0 PID: 13 Comm: migration/0 Not tainted 4.2.0-rc1-00049-g25834c7 #2 | Call Trace: | dump_stack+0x4b/0x75 | warn_slowpath_common+0x8b/0xc0 | warn_slowpath_null+0x22/0x30 | do_set_cpus_allowed+0x7e/0x80 | cpuset_cpus_allowed_fallback+0x7c/0x170 | select_fallback_rq+0x221/0x280 | migration_call+0xe3/0x250 | notifier_call_chain+0x53/0x70 | __raw_notifier_call_chain+0x1e/0x30 | cpu_notify+0x28/0x50 | take_cpu_down+0x22/0x40 | multi_cpu_stop+0xd5/0x140 | cpu_stopper_thread+0xbc/0x170 | smpboot_thread_fn+0x174/0x2f0 | kthread+0xc4/0xe0 | ret_from_kernel_thread+0x21/0x30 As Peterz pointed out: | So the normal rules for changing task_struct::cpus_allowed are holding | both pi_lock and rq->lock, such that holding either stabilizes the mask. | | This is so that wakeup can happen without rq->lock and load-balance | without pi_lock. | | From this we already get the relaxation that we can omit acquiring | rq->lock if the task is not on the rq, because in that case | load-balancing will not apply to it. | | ** these are the rules currently tested in do_set_cpus_allowed() ** | | Now, since __set_cpus_allowed_ptr() uses task_rq_lock() which | unconditionally acquires both locks, we could get away with holding just | rq->lock when on_rq for modification because that'd still exclude | __set_cpus_allowed_ptr(), it would also work against | __kthread_bind_mask() because that assumes !on_rq. | | That said, this is all somewhat fragile. | | Now, I don't think dropping rq->lock is quite as disastrous as it | usually is because !cpu_active at this point, which means load-balance | will not interfere, but that too is somewhat fragile. | | So we end up with a choice of two fragile.. This patch fixes it by following the rules for changing task_struct::cpus_allowed with both pi_lock and rq->lock held. Reported-by: kernel test robot Reported-by: Sasha Levin Signed-off-by: Wanpeng Li [ Modified changelog and patch. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/BLU436-SMTP1660820490DE202E3934ED3806E0@phx.gbl Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0902e4d72671..9b786704d34b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5183,24 +5183,47 @@ static void migrate_tasks(struct rq *dead_rq) break; /* - * Ensure rq->lock covers the entire task selection - * until the migration. + * pick_next_task assumes pinned rq->lock. */ lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + /* + * Rules for changing task_struct::cpus_allowed are holding + * both pi_lock and rq->lock, such that holding either + * stabilizes the mask. + * + * Drop rq->lock is not quite as disastrous as it usually is + * because !cpu_active at this point, which means load-balance + * will not interfere. Also, stop-machine. + */ + lockdep_unpin_lock(&rq->lock); + raw_spin_unlock(&rq->lock); + raw_spin_lock(&next->pi_lock); + raw_spin_lock(&rq->lock); + + /* + * Since we're inside stop-machine, _nothing_ should have + * changed the task, WARN if weird stuff happened, because in + * that case the above rq->lock drop is a fail too. + */ + if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + raw_spin_unlock(&next->pi_lock); + continue; + } + /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); - lockdep_unpin_lock(&rq->lock); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); } + raw_spin_unlock(&next->pi_lock); } rq->stop = stop; -- cgit From 5b25b13ab08f616efd566347d809b4ece54570d1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 11 Sep 2015 13:07:39 -0700 Subject: sys_membarrier(): system-wide memory barrier (generic, x86) Here is an implementation of a new system call, sys_membarrier(), which executes a memory barrier on all threads running on the system. It is implemented by calling synchronize_sched(). It can be used to distribute the cost of user-space memory barriers asymmetrically by transforming pairs of memory barriers into pairs consisting of sys_membarrier() and a compiler barrier. For synchronization primitives that distinguish between read-side and write-side (e.g. userspace RCU [1], rwlocks), the read-side can be accelerated significantly by moving the bulk of the memory barrier overhead to the write-side. The existing applications of which I am aware that would be improved by this system call are as follows: * Through Userspace RCU library (http://urcu.so) - DNS server (Knot DNS) https://www.knot-dns.cz/ - Network sniffer (http://netsniff-ng.org/) - Distributed object storage (https://sheepdog.github.io/sheepdog/) - User-space tracing (http://lttng.org) - Network storage system (https://www.gluster.org/) - Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf) - Financial software (https://lkml.org/lkml/2015/3/23/189) Those projects use RCU in userspace to increase read-side speed and scalability compared to locking. Especially in the case of RCU used by libraries, sys_membarrier can speed up the read-side by moving the bulk of the memory barrier cost to synchronize_rcu(). * Direct users of sys_membarrier - core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198) Microsoft core dotnet GC developers are planning to use the mprotect() side-effect of issuing memory barriers through IPIs as a way to implement Windows FlushProcessWriteBuffers() on Linux. They are referring to sys_membarrier in their github thread, specifically stating that sys_membarrier() is what they are looking for. To explain the benefit of this scheme, let's introduce two example threads: Thread A (non-frequent, e.g. executing liburcu synchronize_rcu()) Thread B (frequent, e.g. executing liburcu rcu_read_lock()/rcu_read_unlock()) In a scheme where all smp_mb() in thread A are ordering memory accesses with respect to smp_mb() present in Thread B, we can change each smp_mb() within Thread A into calls to sys_membarrier() and each smp_mb() within Thread B into compiler barriers "barrier()". Before the change, we had, for each smp_mb() pairs: Thread A Thread B previous mem accesses previous mem accesses smp_mb() smp_mb() following mem accesses following mem accesses After the change, these pairs become: Thread A Thread B prev mem accesses prev mem accesses sys_membarrier() barrier() follow mem accesses follow mem accesses As we can see, there are two possible scenarios: either Thread B memory accesses do not happen concurrently with Thread A accesses (1), or they do (2). 1) Non-concurrent Thread A vs Thread B accesses: Thread A Thread B prev mem accesses sys_membarrier() follow mem accesses prev mem accesses barrier() follow mem accesses In this case, thread B accesses will be weakly ordered. This is OK, because at that point, thread A is not particularly interested in ordering them with respect to its own accesses. 2) Concurrent Thread A vs Thread B accesses Thread A Thread B prev mem accesses prev mem accesses sys_membarrier() barrier() follow mem accesses follow mem accesses In this case, thread B accesses, which are ensured to be in program order thanks to the compiler barrier, will be "upgraded" to full smp_mb() by synchronize_sched(). * Benchmarks On Intel Xeon E5405 (8 cores) (one thread is calling sys_membarrier, the other 7 threads are busy looping) 1000 non-expedited sys_membarrier calls in 33s =3D 33 milliseconds/call. * User-space user of this system call: Userspace RCU library Both the signal-based and the sys_membarrier userspace RCU schemes permit us to remove the memory barrier from the userspace RCU rcu_read_lock() and rcu_read_unlock() primitives, thus significantly accelerating them. These memory barriers are replaced by compiler barriers on the read-side, and all matching memory barriers on the write-side are turned into an invocation of a memory barrier on all active threads in the process. By letting the kernel perform this synchronization rather than dumbly sending a signal to every process threads (as we currently do), we diminish the number of unnecessary wake ups and only issue the memory barriers on active threads. Non-running threads do not need to execute such barrier anyway, because these are implied by the scheduler context switches. Results in liburcu: Operations in 10s, 6 readers, 2 writers: memory barriers in reader: 1701557485 reads, 2202847 writes signal-based scheme: 9830061167 reads, 6700 writes sys_membarrier: 9952759104 reads, 425 writes sys_membarrier (dyn. check): 7970328887 reads, 425 writes The dynamic sys_membarrier availability check adds some overhead to the read-side compared to the signal-based scheme, but besides that, sys_membarrier slightly outperforms the signal-based scheme. However, this non-expedited sys_membarrier implementation has a much slower grace period than signal and memory barrier schemes. Besides diminishing the number of wake-ups, one major advantage of the membarrier system call over the signal-based scheme is that it does not need to reserve a signal. This plays much more nicely with libraries, and with processes injected into for tracing purposes, for which we cannot expect that signals will be unused by the application. An expedited version of this system call can be added later on to speed up the grace period. Its implementation will likely depend on reading the cpu_curr()->mm without holding each CPU's rq lock. This patch adds the system call to x86 and to asm-generic. [1] http://urcu.so membarrier(2) man page: MEMBARRIER(2) Linux Programmer's Manual MEMBARRIER(2) NAME membarrier - issue memory barriers on a set of threads SYNOPSIS #include int membarrier(int cmd, int flags); DESCRIPTION The cmd argument is one of the following: MEMBARRIER_CMD_QUERY Query the set of supported commands. It returns a bitmask of supported commands. MEMBARRIER_CMD_SHARED Execute a memory barrier on all threads running on the system. Upon return from system call, the caller thread is ensured that all running threads have passed through a state where all memory accesses to user-space addresses match program order between entry to and return from the system call (non-running threads are de facto in such a state). This covers threads from all pro=E2=80=90 cesses running on the system. This command returns 0. The flags argument needs to be 0. For future extensions. All memory accesses performed in program order from each targeted thread is guaranteed to be ordered with respect to sys_membarrier(). If we use the semantic "barrier()" to represent a compiler barrier forcing memory accesses to be performed in program order across the barrier, and smp_mb() to represent explicit memory barriers forcing full memory ordering across the barrier, we have the following ordering table for each pair of barrier(), sys_membarrier() and smp_mb(): The pair ordering is detailed as (O: ordered, X: not ordered): barrier() smp_mb() sys_membarrier() barrier() X X O smp_mb() X O O sys_membarrier() O O O RETURN VALUE On success, these system calls return zero. On error, -1 is returned, and errno is set appropriately. For a given command, with flags argument set to 0, this system call is guaranteed to always return the same value until reboot. ERRORS ENOSYS System call is not implemented. EINVAL Invalid arguments. Linux 2015-04-15 MEMBARRIER(2) Signed-off-by: Mathieu Desnoyers Reviewed-by: Paul E. McKenney Reviewed-by: Josh Triplett Cc: KOSAKI Motohiro Cc: Steven Rostedt Cc: Nicholas Miell Cc: Ingo Molnar Cc: Alan Cox Cc: Lai Jiangshan Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Pranith Kumar Cc: Michael Kerrisk Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/membarrier.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sys_ni.c | 3 +++ 3 files changed, 70 insertions(+) create mode 100644 kernel/membarrier.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index d4988410b410..53abf008ecb3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/membarrier.c b/kernel/membarrier.c new file mode 100644 index 000000000000..536c727a56e9 --- /dev/null +++ b/kernel/membarrier.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2010, 2015 Mathieu Desnoyers + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, or if the command argument is invalid, + * this system call returns -EINVAL. For a given command, with flags argument + * set to 0, this system call is guaranteed to always return the same value + * until reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + return MEMBARRIER_CMD_BITMASK; + case MEMBARRIER_CMD_SHARED: + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 03c3875d9958..a02decf15583 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -245,3 +245,6 @@ cond_syscall(sys_bpf); /* execveat */ cond_syscall(sys_execveat); + +/* membarrier */ +cond_syscall(sys_membarrier); -- cgit From 2619d7e9c92d524cb155ec89fd72875321512e5b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 9 Sep 2015 16:07:30 -0700 Subject: time: Fix timekeeping_freqadjust()'s incorrect use of abs() instead of abs64() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The internal clocksteering done for fine-grained error correction uses a logarithmic approximation, so any time adjtimex() adjusts the clock steering, timekeeping_freqadjust() quickly approximates the correct clock frequency over a series of ticks. Unfortunately, the logic in timekeeping_freqadjust(), introduced in commit: dc491596f639 ("timekeeping: Rework frequency adjustments to work better w/ nohz") used the abs() function with a s64 error value to calculate the size of the approximated adjustment to be made. Per include/linux/kernel.h: "abs() should not be used for 64-bit types (s64, u64, long long) - use abs64()". Thus on 32-bit platforms, this resulted in the clocksteering to take a quite dampended random walk trying to converge on the proper frequency, which caused the adjustments to be made much slower then intended (most easily observed when large adjustments are made). This patch fixes the issue by using abs64() instead. Reported-by: Nuno Gonçalves Tested-by: Nuno Goncalves Signed-off-by: John Stultz Cc: # v3.17+ Cc: Linus Torvalds Cc: Miroslav Lichvar Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441840051-20244-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f6ee2e6b6f5d..3739ac6aa473 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, negative = (tick_error < 0); /* Sort out the magnitude of the correction */ - tick_error = abs(tick_error); + tick_error = abs64(tick_error); for (adj = 0; tick_error > interval; adj++) tick_error >>= 1; -- cgit From eef7635a22f6b144206b5ca2f1398f637acffc4d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 11 Sep 2015 09:34:26 +0530 Subject: clockevents: Remove unused set_mode() callback All users are migrated to the per-state callbacks, get rid of the unused interface and the core support code. Signed-off-by: Viresh Kumar Signed-off-by: Thomas Gleixner Cc: linaro-kernel@lists.linaro.org Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/fd60de14cf6d125489c031207567bb255ad946f6.1441943991.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/clockevents.c | 42 +----------------------------------- kernel/time/tick-common.c | 1 - kernel/time/timer_list.c | 54 +++++++++++++++++++++-------------------------- 3 files changed, 25 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 50eb107f1198..a9b76a40319e 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { - /* Transition with legacy set_mode() callback */ - if (dev->set_mode) { - /* Legacy callback doesn't support new modes */ - if (state > CLOCK_EVT_STATE_ONESHOT) - return -ENOSYS; - /* - * 'clock_event_state' and 'clock_event_mode' have 1-to-1 - * mapping until *_ONESHOT, and so a simple cast will work. - */ - dev->set_mode((enum clock_event_mode)state, dev); - dev->mode = (enum clock_event_mode)state; - return 0; - } - if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; @@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; - if (dev->set_mode) { - dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); - dev->mode = CLOCK_EVT_MODE_RESUME; - } else if (dev->tick_resume) { + if (dev->tick_resume) ret = dev->tick_resume(dev); - } return ret; } @@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) } EXPORT_SYMBOL_GPL(clockevents_unbind_device); -/* Sanity check of state transition callbacks */ -static int clockevents_sanity_check(struct clock_event_device *dev) -{ - /* Legacy set_mode() callback */ - if (dev->set_mode) { - /* We shouldn't be supporting new modes now */ - WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || - dev->set_state_shutdown || dev->tick_resume || - dev->set_state_oneshot_stopped); - - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - return 0; - } - - if (dev->features & CLOCK_EVT_FEAT_DUMMY) - return 0; - - return 0; -} - /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; - BUG_ON(clockevents_sanity_check(dev)); - /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d11c55b6ab7d..4fcd99e12aa0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -398,7 +398,6 @@ void tick_shutdown(unsigned int cpu) * the set mode function! */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); - dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 129c96033e46..f75e35b60149 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) (unsigned long long) dev->min_delta_ns); SEQ_printf(m, " mult: %u\n", dev->mult); SEQ_printf(m, " shift: %u\n", dev->shift); - SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev)); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); @@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_next_event); SEQ_printf(m, "\n"); - if (dev->set_mode) { - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); + if (dev->set_state_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_state_shutdown); SEQ_printf(m, "\n"); - } else { - if (dev->set_state_shutdown) { - SEQ_printf(m, " shutdown: "); - print_name_offset(m, dev->set_state_shutdown); - SEQ_printf(m, "\n"); - } + } - if (dev->set_state_periodic) { - SEQ_printf(m, " periodic: "); - print_name_offset(m, dev->set_state_periodic); - SEQ_printf(m, "\n"); - } + if (dev->set_state_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_state_periodic); + SEQ_printf(m, "\n"); + } - if (dev->set_state_oneshot) { - SEQ_printf(m, " oneshot: "); - print_name_offset(m, dev->set_state_oneshot); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_state_oneshot); + SEQ_printf(m, "\n"); + } - if (dev->set_state_oneshot_stopped) { - SEQ_printf(m, " oneshot stopped: "); - print_name_offset(m, dev->set_state_oneshot_stopped); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot_stopped) { + SEQ_printf(m, " oneshot stopped: "); + print_name_offset(m, dev->set_state_oneshot_stopped); + SEQ_printf(m, "\n"); + } - if (dev->tick_resume) { - SEQ_printf(m, " resume: "); - print_name_offset(m, dev->tick_resume); - SEQ_printf(m, "\n"); - } + if (dev->tick_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, dev->tick_resume); + SEQ_printf(m, "\n"); } SEQ_printf(m, " event_handler: "); -- cgit From 449e9cae58b06be1293858ec8e5d8cb728238baa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:16 +0800 Subject: genirq: Move field 'node' from irq_data into irq_common_data NUMA node information is per-irq instead of per-irqchip, so move it into struct irq_common_data. Also use CONFIG_NUMA to guard irq_common_data.node. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1433145945-789-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 +- kernel/irq/irqdesc.c | 4 +++- kernel/irq/irqdomain.c | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index eee4b385cffb..5ef0c2dbe930 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -194,7 +194,7 @@ static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) static inline int irq_desc_get_node(struct irq_desc *desc) { - return irq_data_get_node(&desc->irq_data); + return irq_common_data_get_node(&desc->irq_common_data); } #ifdef CONFIG_PM_SLEEP diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0a2a4b697bcb..7f3e9faa6e4d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -52,11 +52,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) static void desc_smp_init(struct irq_desc *desc, int node) { - desc->irq_data.node = node; cpumask_copy(desc->irq_data.affinity, irq_default_affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif +#ifdef CONFIG_NUMA + desc->irq_common_data.node = node; +#endif } #else diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 79baaf8a7813..dc9d27c0c158 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -844,7 +844,6 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, child->parent_data = irq_data; irq_data->irq = child->irq; irq_data->common = child->common; - irq_data->node = child->node; irq_data->domain = domain; } -- cgit From af7080e040d223b5e7d0a8de28f7cea24ef017c4 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:21 +0800 Subject: genirq: Move field 'handler_data' from irq_data into irq_common_data Handler data (handler_data) is per-irq instead of per irqchip, so move it into struct irq_common_data. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1433145945-789-13-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 ++-- kernel/irq/irqdesc.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6e40a9539763..a48e00e345d7 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -83,7 +83,7 @@ int irq_set_handler_data(unsigned int irq, void *data) if (!desc) return -EINVAL; - desc->irq_data.handler_data = data; + desc->irq_common_data.handler_data = data; irq_put_desc_unlock(desc, flags); return 0; } @@ -796,7 +796,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, return; __irq_do_set_handler(desc, handle, 1, NULL); - desc->irq_data.handler_data = data; + desc->irq_common_data.handler_data = data; irq_put_desc_busunlock(desc, flags); } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7f3e9faa6e4d..594b3e349aac 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,11 +72,12 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, { int cpu; + desc->irq_common_data.handler_data = NULL; + desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; - desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); -- cgit From 9df872faa7e1619e9278bec00ceaed2236533530 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jun 2015 11:47:50 +0800 Subject: genirq: Move field 'affinity' from irq_data into irq_common_data Irq affinity mask is per-irq instead of per irqchip, so move it into struct irq_common_data. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1433303281-27688-1-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 9 +++++---- kernel/irq/manage.c | 12 ++++++------ kernel/irq/proc.c | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 594b3e349aac..bb48a5c1964e 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -38,12 +38,13 @@ static void __init init_irq_default_affinity(void) #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { - if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, + gfp, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif @@ -52,7 +53,7 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) static void desc_smp_init(struct irq_desc *desc, int node) { - cpumask_copy(desc->irq_data.affinity, irq_default_affinity); + cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif @@ -124,7 +125,7 @@ static void free_masks(struct irq_desc *desc) #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); } #else static inline void free_masks(struct irq_desc *desc) { } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ad1b064f94fe..f9a59f6cabd2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -192,7 +192,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: - cpumask_copy(data->affinity, mask); + cpumask_copy(desc->irq_common_data.affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; @@ -304,7 +304,7 @@ static void irq_affinity_notify(struct work_struct *work) if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else - cpumask_copy(cpumask, desc->irq_data.affinity); + cpumask_copy(cpumask, desc->irq_common_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); @@ -375,9 +375,9 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) * one of the targets is online. */ if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { - if (cpumask_intersects(desc->irq_data.affinity, + if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) - set = desc->irq_data.affinity; + set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } @@ -829,8 +829,8 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ - if (desc->irq_data.affinity) - cpumask_copy(mask, desc->irq_data.affinity); + if (desc->irq_common_data.affinity) + cpumask_copy(mask, desc->irq_common_data.affinity); else valid = false; raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 0e97c142ce40..e3a8c9577ba6 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -39,7 +39,7 @@ static struct proc_dir_entry *root_irq_dir; static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_data.affinity; + const struct cpumask *mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (irqd_is_setaffinity_pending(&desc->irq_data)) -- cgit From b237721c5d95082a803c0be686f56d2dd1de995b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 1 Jun 2015 16:05:43 +0800 Subject: genirq: Move field 'msi_desc' from irq_data into irq_common_data MSI descriptors are per-irq instead of per irqchip, so move it into struct irq_common_data. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Jason Cooper Cc: Kevin Cernekee Cc: Arnd Bergmann Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1433145945-789-35-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/irqdesc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a48e00e345d7..8c55d545558f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -105,7 +105,7 @@ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, if (!desc) return -EINVAL; - desc->irq_data.msi_desc = entry; + desc->irq_common_data.msi_desc = entry; if (entry && !irq_offset) entry->irq = irq_base; irq_put_desc_unlock(desc, flags); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index bb48a5c1964e..596669436f7a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -74,12 +74,12 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, int cpu; desc->irq_common_data.handler_data = NULL; + desc->irq_common_data.msi_desc = NULL; desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; - desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->handle_irq = handle_bad_irq; -- cgit From bd0b9ac405e1794d72533c3d487aa65b6b955a0c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Sep 2015 10:42:37 +0200 Subject: genirq: Remove irq argument from irq flow handlers Most interrupt flow handlers do not use the irq argument. Those few which use it can retrieve the irq number from the irq descriptor. Remove the argument. Search and replace was done with coccinelle and some extra helper scripts around it. Thanks to Julia for her help! Signed-off-by: Thomas Gleixner Cc: Julia Lawall Cc: Jiang Liu --- kernel/irq/chip.c | 27 ++++++++------------------- kernel/irq/handle.c | 4 +++- kernel/irq/irqdesc.c | 2 +- kernel/irq/resend.c | 2 +- 4 files changed, 13 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 8c55d545558f..e28169dd1c36 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -372,7 +372,6 @@ static bool irq_may_run(struct irq_desc *desc) /** * handle_simple_irq - Simple and software-decoded IRQs. - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Simple interrupts are either sent from a demultiplexing interrupt @@ -382,8 +381,7 @@ static bool irq_may_run(struct irq_desc *desc) * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. */ -void -handle_simple_irq(unsigned int irq, struct irq_desc *desc) +void handle_simple_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -425,7 +423,6 @@ static void cond_unmask_irq(struct irq_desc *desc) /** * handle_level_irq - Level type irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Level type interrupts are active as long as the hardware line has @@ -433,8 +430,7 @@ static void cond_unmask_irq(struct irq_desc *desc) * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ -void -handle_level_irq(unsigned int irq, struct irq_desc *desc) +void handle_level_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -496,7 +492,6 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) /** * handle_fasteoi_irq - irq handler for transparent controllers - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Only a single callback will be issued to the chip: an ->eoi() @@ -504,8 +499,7 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ -void -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; @@ -546,7 +540,6 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware @@ -560,8 +553,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); * the handler was running. If all pending interrupts are handled, the * loop is left. */ -void -handle_edge_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -618,13 +610,12 @@ EXPORT_SYMBOL(handle_edge_irq); #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER /** * handle_edge_eoi_irq - edge eoi type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Similar as the above handle_edge_irq, but using eoi and w/o the * mask/unmask logic. */ -void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_eoi_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -665,13 +656,11 @@ out_eoi: /** * handle_percpu_irq - Per CPU local irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements */ -void -handle_percpu_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -688,7 +677,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) /** * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements. Same as @@ -698,11 +686,12 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) * contain the real device id for the cpu on which this handler is * called */ -void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; void *dev_id = raw_cpu_ptr(action->percpu_dev_id); + unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; kstat_incr_irqs_this_cpu(desc); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b6eeea8a80c5..de41a68fc038 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -27,8 +27,10 @@ * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(struct irq_desc *desc) { + unsigned int irq = irq_desc_get_irq(desc); + print_irq_desc(irq, desc); kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 596669436f7a..239e2ae2c947 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -347,7 +347,7 @@ int generic_handle_irq(unsigned int irq) if (!desc) return -EINVAL; - generic_handle_irq_desc(irq, desc); + generic_handle_irq_desc(desc); return 0; } EXPORT_SYMBOL_GPL(generic_handle_irq); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index dd95f44f99b2..b86886beee4f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg) clear_bit(irq, irqs_resend); desc = irq_to_desc(irq); local_irq_disable(); - desc->handle_irq(irq, desc); + desc->handle_irq(desc); local_irq_enable(); } } -- cgit From f9f9e7b776142fb1c0782cade004cc8e0147a199 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Sep 2015 11:51:12 -0400 Subject: Revert "cgroup: simplify threadgroup locking" This reverts commit b5ba75b5fc0e8404e2c50cb68f39bb6a53fc916f. d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify threadgroup locking") changed how cgroup synchronizes against task fork and exits so that it uses global percpu_rwsem instead of per-process rwsem; unfortunately, the write [un]lock paths of percpu_rwsem always involve synchronize_rcu_expedited() which turned out to be too expensive. Improvements for percpu_rwsem are scheduled to be merged in the coming v4.4-rc1 merge window which alleviates this issue. For now, revert the two commits to restore per-process rwsem. They will be re-applied for the v4.4-rc1 merge window. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com Reported-by: Christian Borntraeger Cc: Oleg Nesterov Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Paolo Bonzini Cc: stable@vger.kernel.org # v4.2+ --- kernel/cgroup.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cf0f79f1fc9..115091efa889 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2460,13 +2460,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (!cgrp) return -ENODEV; - percpu_down_write(&cgroup_threadgroup_rwsem); +retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { + rcu_read_unlock(); ret = -ESRCH; - goto out_unlock_rcu; + goto out_unlock_cgroup; } } else { tsk = current; @@ -2482,23 +2483,37 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, */ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; - goto out_unlock_rcu; + rcu_read_unlock(); + goto out_unlock_cgroup; } get_task_struct(tsk); rcu_read_unlock(); + percpu_down_write(&cgroup_threadgroup_rwsem); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + percpu_up_write(&cgroup_threadgroup_rwsem); + put_task_struct(tsk); + goto retry_find_task; + } + } + ret = cgroup_procs_write_permission(tsk, cgrp, of); if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - put_task_struct(tsk); - goto out_unlock_threadgroup; - -out_unlock_rcu: - rcu_read_unlock(); -out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + + put_task_struct(tsk); +out_unlock_cgroup: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2643,8 +2658,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - /* look up all csses currently attached to @cgrp's subtree */ down_read(&css_set_rwsem); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { @@ -2700,8 +2713,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; + percpu_down_write(&cgroup_threadgroup_rwsem); + /* raced against de_thread() from another thread? */ + if (!thread_group_leader(task)) { + percpu_up_write(&cgroup_threadgroup_rwsem); + put_task_struct(task); + continue; + } + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + percpu_up_write(&cgroup_threadgroup_rwsem); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -2711,7 +2733,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) out_finish: cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } -- cgit From 0c986253b939cc14c69d4adbe2b4121bdf4aa220 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Sep 2015 11:51:12 -0400 Subject: Revert "sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem" This reverts commit d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14. d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify threadgroup locking") changed how cgroup synchronizes against task fork and exits so that it uses global percpu_rwsem instead of per-process rwsem; unfortunately, the write [un]lock paths of percpu_rwsem always involve synchronize_rcu_expedited() which turned out to be too expensive. Improvements for percpu_rwsem are scheduled to be merged in the coming v4.4-rc1 merge window which alleviates this issue. For now, revert the two commits to restore per-process rwsem. They will be re-applied for the v4.4-rc1 merge window. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com Reported-by: Christian Borntraeger Cc: Oleg Nesterov Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Paolo Bonzini Cc: stable@vger.kernel.org # v4.2+ --- kernel/cgroup.c | 77 ++++++++++++++++++++++++++++++++++++++++++--------------- kernel/fork.c | 4 +++ 2 files changed, 61 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 115091efa889..2c9eae6ad970 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -104,8 +103,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; - #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ @@ -874,6 +871,48 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + down_read(&tsk->signal->group_rwsem); +} + +void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ + up_read(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_lock - lock threadgroup + * @tsk: member task of the threadgroup to lock + * + * Lock the threadgroup @tsk belongs to. No new task is allowed to enter + * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or + * change ->group_leader/pid. This is useful for cases where the threadgroup + * needs to stay stable across blockable operations. + * + * fork and exit explicitly call threadgroup_change_{begin|end}() for + * synchronization. While held, no new task will be added to threadgroup + * and no existing live task will have its PF_EXITING set. + * + * de_thread() does threadgroup_change_{begin|end}() when a non-leader + * sub-thread becomes a new leader. + */ +static void threadgroup_lock(struct task_struct *tsk) +{ + down_write(&tsk->signal->group_rwsem); +} + +/** + * threadgroup_unlock - unlock threadgroup + * @tsk: member task of the threadgroup to unlock + * + * Reverse threadgroup_lock(). + */ +static inline void threadgroup_unlock(struct task_struct *tsk) +{ + up_write(&tsk->signal->group_rwsem); +} + static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -2074,9 +2113,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, lockdep_assert_held(&css_set_rwsem); /* - * We are synchronized through cgroup_threadgroup_rwsem against - * PF_EXITING setting such that we can't race against cgroup_exit() - * changing the css_set to init_css_set and dropping the old one. + * We are synchronized through threadgroup_lock() against PF_EXITING + * setting such that we can't race against cgroup_exit() changing the + * css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); @@ -2133,11 +2172,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * - * This function may be called without holding cgroup_threadgroup_rwsem - * even if the target is a process. Threads may be created and destroyed - * but as long as cgroup_mutex is not dropped, no new css_set can be put - * into play and the preloaded css_sets are guaranteed to cover all - * migrations. + * This function may be called without holding threadgroup_lock even if the + * target is a process. Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations. */ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, @@ -2240,7 +2278,7 @@ err: * @threadgroup: whether @leader points to the whole process or a single task * * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The + * process, the caller must be holding threadgroup_lock of @leader. The * caller is also responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). @@ -2368,7 +2406,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. + * Call holding cgroup_mutex and threadgroup_lock of @leader. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2490,7 +2528,7 @@ retry_find_task: get_task_struct(tsk); rcu_read_unlock(); - percpu_down_write(&cgroup_threadgroup_rwsem); + threadgroup_lock(tsk); if (threadgroup) { if (!thread_group_leader(tsk)) { /* @@ -2500,7 +2538,7 @@ retry_find_task: * try again; this is * "double-double-toil-and-trouble-check locking". */ - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(tsk); put_task_struct(tsk); goto retry_find_task; } @@ -2510,7 +2548,7 @@ retry_find_task: if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(tsk); put_task_struct(tsk); out_unlock_cgroup: @@ -2713,17 +2751,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; last_task = task; - percpu_down_write(&cgroup_threadgroup_rwsem); + threadgroup_lock(task); /* raced against de_thread() from another thread? */ if (!thread_group_leader(task)) { - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(task); put_task_struct(task); continue; } ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - percpu_up_write(&cgroup_threadgroup_rwsem); + threadgroup_unlock(task); put_task_struct(task); if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) @@ -5045,7 +5083,6 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); diff --git a/kernel/fork.c b/kernel/fork.c index 7d5f0f118a63..2845623fb582 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1149,6 +1149,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->group_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; -- cgit From 00cc1633816de8c95f337608a1ea64e228faf771 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 18 Sep 2015 11:27:45 +0200 Subject: sched: access local runqueue directly in single_task_running Commit 2ee507c47293 ("sched: Add function single_task_running to let a task check if it is the only task running on a cpu") referenced the current runqueue with the smp_processor_id. When CONFIG_DEBUG_PREEMPT is enabled, that is only allowed if preemption is disabled or the currrent task is bound to the local cpu (e.g. kernel worker). With commit f78195129963 ("kvm: add halt_poll_ns module parameter") KVM calls single_task_running. If CONFIG_DEBUG_PREEMPT is enabled that generates a lot of kernel messages. To avoid adding preemption in that cases, as it would limit the usefulness, we change single_task_running to access directly the cpu local runqueue. Cc: Tim Chen Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Cc: Fixes: 2ee507c472939db4b146d545352b8a7c79ef47f8 Signed-off-by: Dominik Dingel Signed-off-by: Paolo Bonzini --- kernel/sched/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3595403921bd..4064f794ab8c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2666,13 +2666,20 @@ unsigned long nr_running(void) /* * Check if only the current task is running on the cpu. + * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->nr_running == 1) - return true; - else - return false; + return raw_rq()->nr_running == 1; } EXPORT_SYMBOL(single_task_running); -- cgit From ac5be6b47e8bd25b62bed2c82cda7398999f59e9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:58:49 -0700 Subject: userfaultfd: revert "userfaultfd: waitqueue: add nr wake parameter to __wake_up_locked_key" This reverts commit 51360155eccb907ff8635bd10fc7de876408c2e0 and adapts fs/userfaultfd.c to use the old version of that function. It didn't look robust to call __wake_up_common with "nr == 1" when we absolutely require wakeall semantics, but we've full control of what we insert in the two waitqueue heads of the blocked userfaults. No exclusive waitqueue risks to be inserted into those two waitqueue heads so we can as well stick to "nr == 1" of the old code and we can rely purely on the fact no waitqueue inserted in one of the two waitqueue heads we must enforce as wakeall, has wait->flags WQ_FLAG_EXCLUSIVE set. Signed-off-by: Andrea Arcangeli Cc: Dr. David Alan Gilbert Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 272d9322bc5d..052e02672d12 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { - __wake_up_common(q, mode, nr, 0, key); + __wake_up_common(q, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, 1, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); -- cgit