Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                |   2
-rw-r--r--  kernel/acct.c                  |   7
-rw-r--r--  kernel/events/core.c           |  17
-rw-r--r--  kernel/events/ring_buffer.c    |  14
-rw-r--r--  kernel/fork.c                  |   5
-rw-r--r--  kernel/hrtimer.c               |   4
-rw-r--r--  kernel/irq/irqdomain.c         |  20
-rw-r--r--  kernel/kallsyms.c              |  26
-rw-r--r--  kernel/lockdep.c               |   1
-rw-r--r--  kernel/modsign_certificate.S   |  13
-rw-r--r--  kernel/module.c                |  18
-rw-r--r--  kernel/params.c                |   5
-rw-r--r--  kernel/posix-cpu-timers.c      |  76
-rw-r--r--  kernel/power/console.c         | 116
-rw-r--r--  kernel/printk.c                |   1
-rw-r--r--  kernel/ptrace.c                |   1
-rw-r--r--  kernel/rcutree.c               |  16
-rw-r--r--  kernel/rcutree.h               |   2
-rw-r--r--  kernel/rcutree_plugin.h        |  33
-rw-r--r--  kernel/rcutree_trace.c         |   8
-rw-r--r--  kernel/relay.c                 |   2
-rw-r--r--  kernel/rwsem.c                 |  16
-rw-r--r--  kernel/sched/core.c            |  92
-rw-r--r--  kernel/sched/cputime.c         |  80
-rw-r--r--  kernel/sched/fair.c            |  10
-rw-r--r--  kernel/sched/idle_task.c       |   1
-rw-r--r--  kernel/sched/sched.h           |  25
-rw-r--r--  kernel/softirq.c               |  19
-rw-r--r--  kernel/time/Kconfig            |  80
-rw-r--r--  kernel/time/tick-broadcast.c   |   3
-rw-r--r--  kernel/time/tick-common.c      |   5
-rw-r--r--  kernel/time/tick-sched.c       | 296
-rw-r--r--  kernel/timer.c                 |  16
-rw-r--r--  kernel/trace/blktrace.c        |   1
34 files changed, 875 insertions, 156 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d1574d47cf27..271fd3119af9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -176,7 +176,7 @@ signing_key.priv signing_key.x509: x509.genkey
openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
-batch -x509 -config x509.genkey \
-outform DER -out signing_key.x509 \
- -keyout signing_key.priv
+ -keyout signing_key.priv 2>&1
@echo "###"
@echo "### Key pair generated."
@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index 85389fe2abd0..8d6e145138bb 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -540,10 +540,15 @@ static void do_acct_process(struct bsd_acct_struct *acct,
ac.ac_swaps = encode_comp_t(0);
/*
+ * Get freeze protection. If the fs is frozen, just skip the write
+ * as we could deadlock the system otherwise.
+ */
+ if (!file_start_write_trylock(file))
+ goto out;
+ /*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
- file_start_write(file);
fs = get_fs();
set_fs(KERNEL_DS);
/*
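The acct.c hunk above replaces an unconditional file_start_write() with a trylock so that, while the filesystem is frozen, the accounting record is simply skipped instead of blocking and risking a deadlock with the freezer. A minimal userspace sketch of the same trylock-or-skip idea, using a pthread mutex as a stand-in for the freeze protection (write_barrier and write_record() are illustrative names, not kernel API):

#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-in for the "filesystem frozen" write barrier. */
static pthread_mutex_t write_barrier = PTHREAD_MUTEX_INITIALIZER;

/* Skip the write instead of sleeping when the barrier is held. */
static void write_record(const char *rec)
{
	if (pthread_mutex_trylock(&write_barrier) != 0) {
		/* Frozen: dropping one record beats deadlocking the freezer. */
		return;
	}
	printf("wrote: %s\n", rec);
	pthread_mutex_unlock(&write_barrier);
}

int main(void)
{
	write_record("exit record 1");		/* barrier free: written */
	pthread_mutex_lock(&write_barrier);	/* simulate a frozen fs  */
	write_record("exit record 2");		/* barrier held: skipped */
	pthread_mutex_unlock(&write_barrier);
	return 0;
}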
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3820e3cefbae..6b41c1899a8b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
+#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
@@ -685,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
WARN_ON(!irqs_disabled());
- if (list_empty(&cpuctx->rotation_list))
+ if (list_empty(&cpuctx->rotation_list)) {
+ int was_empty = list_empty(head);
list_add(&cpuctx->rotation_list, head);
+ if (was_empty)
+ tick_nohz_full_kick();
+ }
}
static void get_ctx(struct perf_event_context *ctx)
@@ -2591,6 +2596,16 @@ done:
list_del_init(&cpuctx->rotation_list);
}
+#ifdef CONFIG_NO_HZ_FULL
+bool perf_event_can_stop_tick(void)
+{
+ if (list_empty(&__get_cpu_var(rotation_list)))
+ return true;
+ else
+ return false;
+}
+#endif
+
void perf_event_task_tick(void)
{
struct list_head *head = &__get_cpu_var(rotation_list);
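The perf_pmu_rotate_start() hunk kicks the nohz machinery only when the per-CPU rotation list goes from empty to non-empty, i.e. on the first event that creates a need for the tick. A toy C sketch of that notify-once-on-first-insertion pattern (the singly linked list, add_event() and kick_cpu() are illustrative stand-ins; the real code uses list_head and tick_nohz_full_kick()):

#include <stdbool.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static struct node *rotation_list;	/* stand-in for the per-CPU list */
static int kicks;			/* counts tick re-evaluation kicks */

static void kick_cpu(void) { kicks++; }

/* Kick exactly once, on the empty -> non-empty transition. */
static void add_event(struct node *n)
{
	bool was_empty = (rotation_list == NULL);

	n->next = rotation_list;
	rotation_list = n;
	if (was_empty)
		kick_cpu();
}

int main(void)
{
	struct node a = { .id = 1 }, b = { .id = 2 };

	add_event(&a);			/* list was empty: kick */
	add_event(&b);			/* already non-empty: no extra kick */
	printf("kicks = %d\n", kicks);	/* prints 1 */
	return 0;
}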
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 97fddb09762b..cd55144270b5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -326,11 +326,16 @@ void rb_free(struct ring_buffer *rb)
}
#else
+static int data_page_nr(struct ring_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
- if (pgoff > (1UL << page_order(rb)))
+ /* The '>' counts in the user page. */
+ if (pgoff > data_page_nr(rb))
return NULL;
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
@@ -350,10 +355,11 @@ static void rb_free_work(struct work_struct *work)
int i, nr;
rb = container_of(work, struct ring_buffer, work);
- nr = 1 << page_order(rb);
+ nr = data_page_nr(rb);
base = rb->user_page;
- for (i = 0; i < nr + 1; i++)
+ /* The '<=' counts in the user page. */
+ for (i = 0; i <= nr; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
@@ -387,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
rb->page_order = ilog2(nr_pages);
- rb->nr_pages = 1;
+ rb->nr_pages = !!nr_pages;
ring_buffer_init(rb, watermark, flags);
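The ring_buffer.c hunks are all about the vmalloc'ed buffer layout: one user (control) page at page offset 0 followed by nr data pages, so the valid offsets run from 0 to nr inclusive, which is why the '>' and '<=' comparisons have to "count in the user page". A toy model of that indexing (nr_data_pages and page_at() are illustrative, not the perf code):

#include <stdio.h>

static int nr_data_pages = 4;		/* hypothetical buffer size */

/* Page offset 0 is the user page, 1..nr are data pages, anything
 * beyond nr is out of range: hence '>' rather than '>='. */
static const char *page_at(int pgoff)
{
	if (pgoff > nr_data_pages)
		return "out of range";
	return pgoff == 0 ? "user page" : "data page";
}

int main(void)
{
	for (int pgoff = 0; pgoff <= nr_data_pages + 1; pgoff++)
		printf("pgoff %d -> %s\n", pgoff, page_at(pgoff));
	return 0;
}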
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d40687b1434..987b28a1f01b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -70,6 +70,7 @@
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1303,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->memcg_batch.do_batch = 0;
p->memcg_batch.memcg = NULL;
#endif
+#ifdef CONFIG_BCACHE
+ p->sequential_io = 0;
+ p->sequential_io_avg = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 609d8ff38b74..fd4b13b131f8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -172,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
*/
static int hrtimer_get_target(int this_cpu, int pinned)
{
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
return get_nohz_timer_target();
#endif
@@ -1125,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
}
EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/**
* hrtimer_get_next_event - get the time until next expiry event
*
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 96f3a1d9c379..5a83dde8ca0c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -462,9 +462,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
if (domain->ops->map) {
ret = domain->ops->map(domain, virq, hwirq);
if (ret != 0) {
- pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
- virq, hwirq, ret);
- WARN_ON(1);
+ /*
+ * If map() returns -EPERM, this interrupt is protected
+ * by the firmware or some other service and shall not
+ * be mapped.
+ *
+ * Since on some platforms we blindly try to map everything
+ * we end up with a log full of backtraces.
+ *
+ * So instead, we silently fail on -EPERM, it is the
+ * responsibility of the PIC driver to display a relevant
+ * message if needed.
+ */
+ if (ret != -EPERM) {
+ pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
+ virq, hwirq, ret);
+ WARN_ON(1);
+ }
irq_data->domain = NULL;
irq_data->hwirq = 0;
goto err_unmap;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2169feeba529..3127ad52cdb2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr)
/*
* Expand a compressed symbol data into the resulting uncompressed string,
+ * if uncompressed string is too long (>= maxlen), it will be truncated,
* given the offset to where the symbol is in the compressed stream.
*/
-static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
+static unsigned int kallsyms_expand_symbol(unsigned int off,
+ char *result, size_t maxlen)
{
int len, skipped_first = 0;
const u8 *tptr, *data;
@@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
while (*tptr) {
if (skipped_first) {
+ if (maxlen <= 1)
+ goto tail;
*result = *tptr;
result++;
+ maxlen--;
} else
skipped_first = 1;
tptr++;
}
}
- *result = '\0';
+tail:
+ if (maxlen)
+ *result = '\0';
/* Return to offset to the next symbol. */
return off;
@@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name)
unsigned int off;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf);
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
if (strcmp(namebuf, name) == 0)
return kallsyms_addresses[i];
@@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
int ret;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf);
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
if (ret != 0)
return ret;
@@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr,
pos = get_symbol_pos(addr, symbolsize, offset);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
return namebuf;
@@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname)
pos = get_symbol_pos(addr, NULL, NULL);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), symname);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ symname, KSYM_NAME_LEN);
return 0;
}
/* See if it's in a module. */
@@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
pos = get_symbol_pos(addr, size, offset);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), name);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ name, KSYM_NAME_LEN);
modname[0] = '\0';
return 0;
}
@@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
iter->type = kallsyms_get_symbol_type(off);
- off = kallsyms_expand_symbol(off, iter->name);
+ off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name));
return off - iter->nameoff;
}
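The kallsyms changes thread a maxlen argument through kallsyms_expand_symbol() so that an oversized symbol is truncated and still NUL-terminated rather than overrunning the caller's buffer. A self-contained sketch of that bounded-copy rule (expand_name() is an illustrative helper, not the real decompressor, which walks a compressed token table):

#include <stdio.h>

/* Copy at most maxlen-1 characters and NUL-terminate whenever maxlen
 * is non-zero, mirroring the truncation rule added to kallsyms. */
static size_t expand_name(const char *src, char *result, size_t maxlen)
{
	size_t copied = 0;

	while (*src) {
		if (maxlen <= 1)
			break;			/* keep one byte for the NUL */
		*result++ = *src++;
		maxlen--;
		copied++;
	}
	if (maxlen)
		*result = '\0';
	return copied;
}

int main(void)
{
	char small[8], big[64];

	expand_name("kallsyms_expand_symbol", small, sizeof(small));
	expand_name("kallsyms_expand_symbol", big, sizeof(big));
	printf("small: \"%s\"\nbig:   \"%s\"\n", small, big);
	return 0;
}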
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 6a3bccba7e7d..1f3186b37fd5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2998,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
EXPORT_SYMBOL_GPL(lockdep_init_map);
struct lock_class_key __lockdep_no_validate__;
+EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
static int
print_lock_nested_lock_not_held(struct task_struct *curr,
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
index 246b4c6e6135..4a9a86d12c8b 100644
--- a/kernel/modsign_certificate.S
+++ b/kernel/modsign_certificate.S
@@ -1,15 +1,8 @@
-/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
-#ifndef SYMBOL_PREFIX
-#define ASM_SYMBOL(sym) sym
-#else
-#define PASTE2(x,y) x##y
-#define PASTE(x,y) PASTE2(x,y)
-#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
-#endif
+#include <linux/export.h>
#define GLOBAL(name) \
- .globl ASM_SYMBOL(name); \
- ASM_SYMBOL(name):
+ .globl VMLINUX_SYMBOL(name); \
+ VMLINUX_SYMBOL(name):
.section ".init.data","aw"
diff --git a/kernel/module.c b/kernel/module.c
index 0925c9a71975..b049939177f6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1209,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
/* Since this should be found in kernel (which can't be removed),
* no locking is necessary. */
- if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+ if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
&crc, true, false))
BUG();
- return check_version(sechdrs, versindex, "module_layout", mod, crc,
+ return check_version(sechdrs, versindex,
+ VMLINUX_SYMBOL_STR(module_layout), mod, crc,
NULL);
}
@@ -1861,12 +1862,12 @@ static void free_module(struct module *mod)
{
trace_module_free(mod);
- /* Delete from various lists */
- mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
- mutex_unlock(&module_mutex);
mod_sysfs_teardown(mod);
+ /* We leave it in list to prevent duplicate loads, but make sure
+	 * that no one uses it while it's being deconstructed. */
+ mod->state = MODULE_STATE_UNFORMED;
+
/* Remove dynamic debug info */
ddebug_remove_module(mod->name);
@@ -1879,6 +1880,11 @@ static void free_module(struct module *mod)
/* Free any allocated parameters. */
destroy_params(mod->kp, mod->num_kp);
+ /* Now we can delete it from the lists */
+ mutex_lock(&module_mutex);
+ stop_machine(__unlink_module, mod, NULL);
+ mutex_unlock(&module_mutex);
+
/* This may be NULL, but that's OK */
unset_module_init_ro_nx(mod);
module_free(mod, mod->module_init);
diff --git a/kernel/params.c b/kernel/params.c
index ed35345be536..53b958fcd639 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -613,10 +613,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
GFP_KERNEL);
if (!new) {
- kfree(mk->mp);
+ kfree(attrs);
err = -ENOMEM;
goto fail;
}
+ /* Despite looking like the typical realloc() bug, this is safe.
+ * We *want* the old 'attrs' to be freed either way, and we'll store
+ * the new one in the success case. */
attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
if (!attrs) {
err = -ENOMEM;
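The new params.c comment points at the classic realloc() anti-pattern, where overwriting the only pointer with the return value leaks the old block on failure; here the overwrite is deliberate because the old 'attrs' array is meant to be freed on the error path anyway. For contrast, a small userspace reminder of the leaky form versus the usual safe form (grow_leaky() and grow_safe() are illustrative, not the module code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* The classic bug: "p = realloc(p, n)" loses the only pointer to the
 * old block when realloc() fails, so the old allocation leaks. */
static char *grow_leaky(char *buf, size_t n)
{
	buf = realloc(buf, n);		/* old buf leaks if this is NULL */
	return buf;
}

/* The usual safe form: keep the old pointer until realloc() succeeded. */
static int grow_safe(char **buf, size_t n)
{
	char *tmp = realloc(*buf, n);

	if (!tmp)
		return -1;		/* *buf is still valid, caller decides */
	*buf = tmp;
	return 0;
}

int main(void)
{
	char *buf = malloc(8);

	if (!buf)
		return 1;
	strcpy(buf, "param");
	if (grow_safe(&buf, 1024) == 0)
		printf("still holds \"%s\" after growing\n", buf);
	free(buf);
	(void)grow_leaky;		/* shown for contrast only */
	return 0;
}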
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8fd709c9bb58..42670e9b44e0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,6 +10,8 @@
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
#include <linux/random.h>
+#include <linux/tick.h>
+#include <linux/workqueue.h>
/*
* Called after updating RLIMIT_CPU to run cpu timer and update
@@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer,
}
}
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime: The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero. Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
+{
+ if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
+ return 1;
+ return 0;
+}
+
static inline cputime_t prof_ticks(struct task_struct *p)
{
cputime_t utime, stime;
@@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
return 0;
}
+#ifdef CONFIG_NO_HZ_FULL
+static void nohz_kick_work_fn(struct work_struct *work)
+{
+ tick_nohz_full_kick_all();
+}
+
+static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
+
+/*
+ * We need the IPIs to be sent from sane process context.
+ * The posix cpu timers are always set with irqs disabled.
+ */
+static void posix_cpu_timer_kick_nohz(void)
+{
+ schedule_work(&nohz_kick_work);
+}
+
+bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
+{
+ if (!task_cputime_zero(&tsk->cputime_expires))
+ return false;
+
+ if (tsk->signal->cputimer.running)
+ return false;
+
+ return true;
+}
+#else
+static inline void posix_cpu_timer_kick_nohz(void) { }
+#endif
+
/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
@@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
sample_to_timespec(timer->it_clock,
old_incr, &old->it_interval);
}
+ if (!ret)
+ posix_cpu_timer_kick_nohz();
return ret;
}
@@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
}
}
-/**
- * task_cputime_zero - Check a task_cputime struct for all zero fields.
- *
- * @cputime: The struct to compare.
- *
- * Checks @cputime to see if all fields are zero. Returns true if all fields
- * are zero, false if any field is nonzero.
- */
-static inline int task_cputime_zero(const struct task_cputime *cputime)
-{
- if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
- return 1;
- return 0;
-}
-
/*
* Check for any per-thread CPU timers that have fired and move them
* off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk)
cpu_timer_fire(timer);
spin_unlock(&timer->it_lock);
}
+
+ /*
+ * In case some timers were rescheduled after the queue got emptied,
+ * wake up full dynticks CPUs.
+ */
+ if (tsk->signal->cputimer.running)
+ posix_cpu_timer_kick_nohz();
}
/*
@@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
if (!*newval)
- return;
+ goto out;
*newval += now.cpu;
}
@@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
tsk->signal->cputime_expires.virt_exp = *newval;
break;
}
+out:
+ posix_cpu_timer_kick_nohz();
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
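The posix-cpu-timers hunks add the predicate that lets a full-dynticks CPU keep its tick stopped: only when the task has no armed per-task expirations and the process-wide cputimer is not running, with kicks deferred to a workqueue because the timers are armed with interrupts disabled. A toy mirror of that predicate (the structs below are illustrative stand-ins, not the kernel's task_struct or signal_struct):

#include <stdbool.h>
#include <stdio.h>

struct task_cputime { unsigned long long utime, stime, sum_exec_runtime; };

struct toy_task {
	struct task_cputime cputime_expires;	/* per-task expirations */
	bool group_cputimer_running;		/* process-wide cputimer */
};

static bool cputime_zero(const struct task_cputime *c)
{
	return !c->utime && !c->stime && !c->sum_exec_runtime;
}

static bool can_stop_tick(const struct toy_task *t)
{
	if (!cputime_zero(&t->cputime_expires))
		return false;			/* per-task timer armed */
	if (t->group_cputimer_running)
		return false;			/* group timer running */
	return true;
}

int main(void)
{
	struct toy_task idle  = { { 0, 0, 0 }, false };
	struct toy_task armed = { { 0, 0, 5000000 }, false };

	printf("no timers:   %s\n", can_stop_tick(&idle)  ? "stop tick" : "keep tick");
	printf("timer armed: %s\n", can_stop_tick(&armed) ? "stop tick" : "keep tick");
	return 0;
}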
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b1dc456474b5..463aa6736751 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -4,6 +4,7 @@
* Originally from swsusp.
*/
+#include <linux/console.h>
#include <linux/vt_kern.h>
#include <linux/kbd_kern.h>
#include <linux/vt.h>
@@ -14,8 +15,120 @@
static int orig_fgconsole, orig_kmsg;
+static DEFINE_MUTEX(vt_switch_mutex);
+
+struct pm_vt_switch {
+ struct list_head head;
+ struct device *dev;
+ bool required;
+};
+
+static LIST_HEAD(pm_vt_switch_list);
+
+
+/**
+ * pm_vt_switch_required - indicate VT switch at suspend requirements
+ * @dev: device
+ * @required: if true, caller needs VT switch at suspend/resume time
+ *
+ * The different console drivers may or may not require VT switches across
+ * suspend/resume, depending on how they handle restoring video state and
+ * what may be running.
+ *
+ * Drivers can indicate support for switchless suspend/resume, which can
+ * save time and flicker, by using this routine and passing 'false' as
+ * the argument. If any loaded driver needs VT switching, or the
+ * no_console_suspend argument has been passed on the command line, VT
+ * switches will occur.
+ */
+void pm_vt_switch_required(struct device *dev, bool required)
+{
+ struct pm_vt_switch *entry, *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ /* already registered, update requirement */
+ tmp->required = required;
+ goto out;
+ }
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto out;
+
+ entry->required = required;
+ entry->dev = dev;
+
+ list_add(&entry->head, &pm_vt_switch_list);
+out:
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_required);
+
+/**
+ * pm_vt_switch_unregister - stop tracking a device's VT switching needs
+ * @dev: device
+ *
+ * Remove @dev from the vt switch list.
+ */
+void pm_vt_switch_unregister(struct device *dev)
+{
+ struct pm_vt_switch *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ list_del(&tmp->head);
+ break;
+ }
+ }
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_unregister);
+
+/*
+ * There are three cases when a VT switch on suspend/resume are required:
+ * 1) no driver has indicated a requirement one way or another, so preserve
+ * the old behavior
+ * 2) console suspend is disabled, we want to see debug messages across
+ * suspend/resume
+ * 3) any registered driver indicates it needs a VT switch
+ *
+ * If none of these conditions is present, meaning we have at least one driver
+ * that doesn't need the switch, and none that do, we can avoid it to make
+ * resume look a little prettier (and suspend too, but that's usually hidden,
+ * e.g. when closing the lid on a laptop).
+ */
+static bool pm_vt_switch(void)
+{
+ struct pm_vt_switch *entry;
+ bool ret = true;
+
+ mutex_lock(&vt_switch_mutex);
+ if (list_empty(&pm_vt_switch_list))
+ goto out;
+
+ if (!console_suspend_enabled)
+ goto out;
+
+ list_for_each_entry(entry, &pm_vt_switch_list, head) {
+ if (entry->required)
+ goto out;
+ }
+
+ ret = false;
+out:
+ mutex_unlock(&vt_switch_mutex);
+ return ret;
+}
+
int pm_prepare_console(void)
{
+ if (!pm_vt_switch())
+ return 0;
+
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
return 1;
@@ -26,6 +139,9 @@ int pm_prepare_console(void)
void pm_restore_console(void)
{
+ if (!pm_vt_switch())
+ return;
+
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
vt_kmsg_redirect(orig_kmsg);
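The pm_vt_switch() helper added above performs the VT switch unless every registered driver has declared it unnecessary; an empty registration list or a disabled console suspend forces the old switching behavior. A userspace sketch of that decision (the array stands in for pm_vt_switch_list; nothing here is kernel API):

#include <stdbool.h>
#include <stdio.h>

struct vt_switch_entry { const char *driver; bool required; };

static bool console_suspend_enabled = true;

static bool vt_switch_needed(const struct vt_switch_entry *e, int n)
{
	if (n == 0)
		return true;		/* nobody registered: keep old behavior */
	if (!console_suspend_enabled)
		return true;		/* debugging: want messages on the VT */
	for (int i = 0; i < n; i++)
		if (e[i].required)
			return true;	/* at least one driver needs the switch */
	return false;			/* everyone opted out: skip the switch */
}

int main(void)
{
	struct vt_switch_entry drivers[] = {
		{ "drm-driver",  false },
		{ "vga-console", false },
	};

	printf("switch needed: %s\n", vt_switch_needed(drivers, 2) ? "yes" : "no");
	drivers[1].required = true;
	printf("switch needed: %s\n", vt_switch_needed(drivers, 2) ? "yes" : "no");
	return 0;
}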
diff --git a/kernel/printk.c b/kernel/printk.c
index 96dcfcd9a2d4..fa36e1494420 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
+#include <linux/aio.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 17ae54da0ec2..aed981a3f69c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -17,6 +17,7 @@
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
+#include <linux/uio.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d8534308fd05..16ea67925015 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -799,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rdp->offline_fqs++;
return 1;
}
+
+ /*
+ * There is a possibility that a CPU in adaptive-ticks state
+ * might run in the kernel with the scheduling-clock tick disabled
+ * for an extended time period. Invoke rcu_kick_nohz_cpu() to
+	 * force the CPU to restart the scheduling-clock tick if this
+ * CPU is in this state.
+ */
+ rcu_kick_nohz_cpu(rdp->cpu);
+
return 0;
}
@@ -1820,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
/* No-CBs CPUs do not have orphanable callbacks. */
- if (is_nocb_cpu(rdp->cpu))
+ if (rcu_is_nocb_cpu(rdp->cpu))
return;
/*
@@ -2892,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
* corresponding CPU's preceding callbacks have been invoked.
*/
for_each_possible_cpu(cpu) {
- if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
+ if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
continue;
rdp = per_cpu_ptr(rsp->rda, cpu);
- if (is_nocb_cpu(cpu)) {
+ if (rcu_is_nocb_cpu(cpu)) {
_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
rsp->n_barrier_done);
atomic_inc(&rsp->barrier_cpu_count);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14ee40795d6f..da77a8f57ff9 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -530,13 +530,13 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static void rcu_init_one_nocb(struct rcu_node *rnp);
-static bool is_nocb_cpu(int cpu);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy);
static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
+static void rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index d084ae3f281c..170814dc418f 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,6 +28,7 @@
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/smpboot.h>
+#include <linux/tick.h>
#define RCU_KTHREAD_PRIO 1
@@ -1705,7 +1706,7 @@ static void rcu_prepare_for_idle(int cpu)
return;
/* If this is a no-CBs CPU, no callbacks, just return. */
- if (is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(cpu))
return;
/*
@@ -1747,7 +1748,7 @@ static void rcu_cleanup_after_idle(int cpu)
struct rcu_data *rdp;
struct rcu_state *rsp;
- if (is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(cpu))
return;
rcu_try_advance_all_cbs();
for_each_rcu_flavor(rsp) {
@@ -2052,7 +2053,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
}
/* Is the specified CPU a no-CPUs CPU? */
-static bool is_nocb_cpu(int cpu)
+bool rcu_is_nocb_cpu(int cpu)
{
if (have_rcu_nocb_mask)
return cpumask_test_cpu(cpu, rcu_nocb_mask);
@@ -2110,7 +2111,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy)
{
- if (!is_nocb_cpu(rdp->cpu))
+ if (!rcu_is_nocb_cpu(rdp->cpu))
return 0;
__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
if (__is_kfree_rcu_offset((unsigned long)rhp->func))
@@ -2134,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
long qll = rsp->qlen_lazy;
/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
- if (!is_nocb_cpu(smp_processor_id()))
+ if (!rcu_is_nocb_cpu(smp_processor_id()))
return 0;
rsp->qlen = 0;
rsp->qlen_lazy = 0;
@@ -2306,11 +2307,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
-static bool is_nocb_cpu(int cpu)
-{
- return false;
-}
-
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy)
{
@@ -2337,3 +2333,20 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
}
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+
+/*
+ * An adaptive-ticks CPU can potentially execute in kernel mode for an
+ * arbitrarily long period of time with the scheduling-clock tick turned
+ * off. RCU will be paying attention to this CPU because it is in the
+ * kernel, but the CPU cannot be guaranteed to be executing the RCU state
+ * machine because the scheduling-clock tick has been disabled. Therefore,
+ * if an adaptive-ticks CPU is failing to respond to the current grace
+ * period and has not been idle from an RCU perspective, kick it.
+ */
+static void rcu_kick_nohz_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_cpu(cpu))
+ smp_send_reschedule(cpu);
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+}
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 49099e81c87b..cf6c17412932 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -95,7 +95,7 @@ static const struct file_operations rcubarrier_fops = {
.open = rcubarrier_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
#ifdef CONFIG_RCU_BOOST
@@ -206,7 +206,7 @@ static const struct file_operations rcuexp_fops = {
.open = rcuexp_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
#ifdef CONFIG_RCU_BOOST
@@ -306,7 +306,7 @@ static const struct file_operations rcuhier_fops = {
.open = rcuhier_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -348,7 +348,7 @@ static const struct file_operations rcugp_fops = {
.open = rcugp_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
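The rcutree_trace.c hunks repair the open/release pairing: a file opened with single_open() allocates per-open seq_file bookkeeping that only single_release() frees, so using plain seq_release() leaks it. A minimal out-of-tree debugfs module sketch of the correct pairing, independent of the rcutree code (the "single_open_demo" names are made up):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *demo_dir;

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "hello from single_open()\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
	.owner	 = THIS_MODULE,
	.open	 = demo_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,	/* must match single_open() */
};

static int __init demo_init(void)
{
	demo_dir = debugfs_create_dir("single_open_demo", NULL);
	debugfs_create_file("demo", 0444, demo_dir, NULL, &demo_fops);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");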
diff --git a/kernel/relay.c b/kernel/relay.c
index eef0d113b79e..b91488ba2e5a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf)
static void relay_remove_buf(struct kref *kref)
{
struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
- buf->chan->cb->remove_buf_file(buf->dentry);
relay_destroy_buf(buf);
}
@@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
del_timer_sync(&buf->timer);
+ buf->chan->cb->remove_buf_file(buf->dentry);
kref_put(&buf->kref, relay_remove_buf);
}
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b3c6c3fcd847..cfff1435bdfb 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
EXPORT_SYMBOL(_down_write_nest_lock);
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
@@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_write_nested);
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5662f58f0b69..58453b8272fd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -544,7 +544,7 @@ void resched_cpu(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
* from an idle cpu. This is good for power-savings.
@@ -582,7 +582,7 @@ unlock:
* account when the CPU goes back to idle and evaluates the timer
* wheel for the next timer event.
*/
-void wake_up_idle_cpu(int cpu)
+static void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -612,20 +612,56 @@ void wake_up_idle_cpu(int cpu)
smp_send_reschedule(cpu);
}
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+ if (tick_nohz_full_cpu(cpu)) {
+ if (cpu != smp_processor_id() ||
+ tick_nohz_tick_stopped())
+ smp_send_reschedule(cpu);
+ return true;
+ }
+
+ return false;
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+ if (!wake_up_full_nohz_cpu(cpu))
+ wake_up_idle_cpu(cpu);
+}
+
static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
}
-#else /* CONFIG_NO_HZ */
+#else /* CONFIG_NO_HZ_COMMON */
static inline bool got_nohz_idle_kick(void)
{
return false;
}
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(void)
+{
+ struct rq *rq;
+
+ rq = this_rq();
+
+ /* Make sure rq->nr_running update is visible after the IPI */
+ smp_rmb();
+
+ /* More than one running task need preemption */
+ if (rq->nr_running > 1)
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
void sched_avg_update(struct rq *rq)
{
@@ -1357,7 +1393,8 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
+ && !tick_nohz_full_cpu(smp_processor_id()))
return;
/*
@@ -1374,6 +1411,7 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
+ tick_nohz_full_check();
sched_ttwu_pending();
/*
@@ -1855,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
kprobe_flush_task(prev);
put_task_struct(prev);
}
+
+ tick_nohz_task_switch(current);
}
#ifdef CONFIG_SMP
@@ -2118,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
return load >> FSHIFT;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Handle NO_HZ for the global load-average.
*
@@ -2344,12 +2384,12 @@ static void calc_global_nohz(void)
smp_wmb();
calc_load_idx++;
}
-#else /* !CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
static inline long calc_load_fold_idle(void) { return 0; }
static inline void calc_global_nohz(void) { }
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -2509,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
sched_avg_update(this_rq);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -2569,7 +2609,7 @@ void update_cpu_load_nohz(void)
}
raw_spin_unlock(&this_rq->lock);
}
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
/*
* Called from scheduler_tick()
@@ -2696,7 +2736,34 @@ void scheduler_tick(void)
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
#endif
+ rq_last_tick_reset(rq);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support full dynticks environment.
+ *
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+ struct rq *rq = this_rq();
+ unsigned long next, now = ACCESS_ONCE(jiffies);
+
+ next = rq->last_sched_tick + HZ;
+
+ if (time_before_eq(next, now))
+ return 0;
+
+ return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
}
+#endif
notrace unsigned long get_parent_ip(unsigned long addr)
{
@@ -6951,9 +7018,12 @@ void __init sched_init(void)
INIT_LIST_HEAD(&rq->cfs_tasks);
rq_attach_root(rq, &def_root_domain);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
rq->nohz_flags = 0;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = 0;
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
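scheduler_tick_max_deferment() above caps how long a full-dynticks CPU may defer its tick: at most one second past the last scheduler tick, so that vruntime, load accounting and friends keep moving. A userspace rehearsal of that arithmetic, ignoring jiffies wraparound for brevity (HZ and the sample values are illustrative):

#include <stdio.h>

#define HZ		1000ULL		/* illustrative tick rate */
#define NSEC_PER_USEC	1000ULL

static unsigned long long max_deferment_ns(unsigned long long last_sched_tick,
					   unsigned long long now_jiffies)
{
	unsigned long long next = last_sched_tick + HZ;	/* one second later */

	if (next <= now_jiffies)	/* the kernel uses time_before_eq() */
		return 0;		/* a tick is already overdue */

	/* jiffies -> usecs -> nsecs, as jiffies_to_usecs() would do */
	return (next - now_jiffies) * (1000000ULL / HZ) * NSEC_PER_USEC;
}

int main(void)
{
	printf("fresh tick:   %llu ns\n", max_deferment_ns(5000, 5000));
	printf("half elapsed: %llu ns\n", max_deferment_ns(5000, 5500));
	printf("overdue:      %llu ns\n", max_deferment_ns(5000, 6200));
	return 0;
}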
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ea32f02bf2c3..cc2dc3eea8a3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -506,34 +506,47 @@ void account_idle_ticks(unsigned long ticks)
}
/*
- * Perform (stime * rtime) / total with reduced chances
- * of multiplication overflows by using smaller factors
- * like quotient and remainders of divisions between
- * rtime and total.
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * losing precision when the numbers are big.
*/
static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
{
- u64 rem, res, scaled;
+ u64 scaled;
- if (rtime >= total) {
- /*
- * Scale up to rtime / total then add
- * the remainder scaled to stime / total.
- */
- res = div64_u64_rem(rtime, total, &rem);
- scaled = stime * res;
- scaled += div64_u64(stime * rem, total);
- } else {
- /*
- * Same in reverse: scale down to total / rtime
- * then substract that result scaled to
- * to the remaining part.
- */
- res = div64_u64_rem(total, rtime, &rem);
- scaled = div64_u64(stime, res);
- scaled -= div64_u64(scaled * rem, total);
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime) {
+ u64 tmp = rtime; rtime = stime; stime = tmp;
+ }
+
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
+
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
+
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
+
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
+
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
}
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
return (__force cputime_t) scaled;
}
@@ -545,7 +558,7 @@ static void cputime_adjust(struct task_cputime *curr,
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, stime, total;
+ cputime_t rtime, stime, utime, total;
if (vtime_accounting_enabled()) {
*ut = curr->utime;
@@ -568,13 +581,21 @@ static void cputime_adjust(struct task_cputime *curr,
*/
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
- if (!rtime) {
- stime = 0;
- } else if (!total) {
- stime = rtime;
- } else {
+ /*
+ * Update userspace visible utime/stime values only if actual execution
+ * time is bigger than already exported. Note that can happen, that we
+ * provided bigger values due to scaling inaccuracy on big numbers.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ if (total) {
stime = scale_stime((__force u64)stime,
(__force u64)rtime, (__force u64)total);
+ utime = rtime - stime;
+ } else {
+ stime = rtime;
+ utime = 0;
}
/*
@@ -583,8 +604,9 @@ static void cputime_adjust(struct task_cputime *curr,
* Let's enforce monotonicity.
*/
prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, rtime - prev->stime);
+ prev->utime = max(prev->utime, utime);
+out:
*ut = prev->utime;
*st = prev->stime;
}
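The rewritten scale_stime() computes stime * rtime / total entirely in 64-bit arithmetic, shedding low-order bits only when the final 32x32->64 multiply would otherwise overflow. Below is a standalone transcription of that loop plus a 128-bit reference value to show how close the approximation stays; it is a userspace rehearsal (using plain division instead of div_u64(), and assuming a non-zero total as the kernel callers do), not the kernel function itself.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime) {
			uint64_t tmp = rtime; rtime = stime; stime = tmp;
		}

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* Grow stime and shrink rtime to try to make both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* Drop from rtime and total; rtime has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/* 32x32->64 multiply followed by a 64/32 divide, as in the kernel */
	return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
}

int main(void)
{
	/* Large enough that a naive stime * rtime would overflow 64 bits */
	uint64_t stime = 3ULL << 40, rtime = 5ULL << 40, total = 7ULL << 40;
	unsigned __int128 exact = (unsigned __int128)stime * rtime / total;

	printf("scaled: %" PRIu64 "\n", scale_stime(stime, rtime, total));
	printf("exact:  %" PRIu64 " (128-bit reference)\n", (uint64_t)exact);
	return 0;
}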
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8bf7081b1ec5..c61a614465c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5355,7 +5355,7 @@ out_unlock:
return 0;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
* - When one of the busy CPUs notice that there may be an idle rebalancing
@@ -5572,9 +5572,9 @@ out:
rq->next_balance = next_balance;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ case, the idle balance kickee will do the
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
@@ -5717,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu)
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
nohz_balancer_kick(cpu);
#endif
@@ -6187,7 +6187,7 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b8ce77328341..d8da01008d39 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -17,6 +17,7 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
{
idle_exit_fair(rq);
+ rq_last_tick_reset(rq);
}
static void post_schedule_idle(struct rq *rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4c225c4c7111..ce39224d6155 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/tick.h>
#include "cpupri.h"
#include "cpuacct.h"
@@ -405,10 +406,13 @@ struct rq {
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
u64 nohz_stamp;
unsigned long nohz_flags;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ unsigned long last_sched_tick;
+#endif
int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
@@ -1072,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal)
static inline void inc_nr_running(struct rq *rq)
{
rq->nr_running++;
+
+#ifdef CONFIG_NO_HZ_FULL
+ if (rq->nr_running == 2) {
+ if (tick_nohz_full_cpu(rq->cpu)) {
+ /* Order rq->nr_running write against the IPI */
+ smp_wmb();
+ smp_send_reschedule(rq->cpu);
+ }
+ }
+#endif
}
static inline void dec_nr_running(struct rq *rq)
@@ -1079,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq)
rq->nr_running--;
}
+static inline void rq_last_tick_reset(struct rq *rq)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = jiffies;
+#endif
+}
+
extern void update_rq_clock(struct rq *rq);
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1299,7 +1320,7 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {
NOHZ_TICK_STOPPED,
NOHZ_BALANCE_KICK,
diff --git a/kernel/softirq.c b/kernel/softirq.c
index aa82723c7202..b5197dcb0dad 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -329,6 +329,19 @@ static inline void invoke_softirq(void)
wakeup_softirqd();
}
+static inline void tick_irq_exit(void)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ int cpu = smp_processor_id();
+
+ /* Make sure that timer wheel updates are propagated */
+ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
+ if (!in_interrupt())
+ tick_nohz_irq_exit();
+ }
+#endif
+}
+
/*
* Exit an interrupt context. Process softirqs if needed and possible:
*/
@@ -346,11 +359,7 @@ void irq_exit(void)
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
-#ifdef CONFIG_NO_HZ
- /* Make sure that timer wheel updates are propagated */
- if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
- tick_nohz_irq_exit();
-#endif
+ tick_irq_exit();
rcu_irq_exit();
}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 24510d84efd7..e4c07b0692bb 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -64,20 +64,88 @@ config GENERIC_CMOS_UPDATE
if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
-# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
+# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
# only related to the tick functionality. Oneshot clockevent devices
# are supported independ of this.
config TICK_ONESHOT
bool
-config NO_HZ
- bool "Tickless System (Dynamic Ticks)"
+config NO_HZ_COMMON
+ bool
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
+
+choice
+ prompt "Timer tick handling"
+ default NO_HZ_IDLE if NO_HZ
+
+config HZ_PERIODIC
+ bool "Periodic timer ticks (constant rate, no dynticks)"
+ help
+ This option keeps the tick running periodically at a constant
+ rate, even when the CPU doesn't need it.
+
+config NO_HZ_IDLE
+ bool "Idle dynticks system (tickless idle)"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ select NO_HZ_COMMON
+ help
+ This option enables a tickless idle system: timer interrupts
+ will only trigger on an as-needed basis when the system is idle.
+ This is usually interesting for energy saving.
+
+ Most of the time you want to say Y here.
+
+config NO_HZ_FULL
+ bool "Full dynticks system (tickless)"
+ # NO_HZ_COMMON dependency
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ # We need at least one periodic CPU for timekeeping
+ depends on SMP
+ # RCU_USER_QS dependency
+ depends on HAVE_CONTEXT_TRACKING
+ # VIRT_CPU_ACCOUNTING_GEN dependency
+ depends on 64BIT
+ select NO_HZ_COMMON
+ select RCU_USER_QS
+ select RCU_NOCB_CPU
+ select VIRT_CPU_ACCOUNTING_GEN
+ select CONTEXT_TRACKING_FORCE
+ select IRQ_WORK
+ help
+ Adaptively try to shutdown the tick whenever possible, even when
+ the CPU is running tasks. Typically this requires running a single
+ task on the CPU. Chances for running tickless are maximized when
+ the task mostly runs in userspace and has few kernel activity.
+
+ You need to fill up the nohz_full boot parameter with the
+ desired range of dynticks CPUs.
+
+ This is implemented at the expense of some overhead in user <-> kernel
+ transitions: syscalls, exceptions and interrupts. Even when it's
+ dynamically off.
+
+ Say N.
+
+endchoice
+
+config NO_HZ_FULL_ALL
+ bool "Full dynticks system on all CPUs by default"
+ depends on NO_HZ_FULL
+ help
+ If the user doesn't pass the nohz_full boot option to
+ define the range of full dynticks CPUs, consider that all
+ CPUs in the system are full dynticks by default.
+ Note the boot CPU will still be kept outside the range to
+ handle the timekeeping duty.
+
+config NO_HZ
+ bool "Old Idle dynticks config"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
help
- This option enables a tickless system: timer interrupts will
- only trigger on an as-needed basis both when the system is
- busy and when the system is idle.
+ This is the old config entry that enables dynticks idle.
+ We keep it around for a little while to enforce backward
+ compatibility with older config files.
config HIGH_RES_TIMERS
bool "High Resolution Timer Support"
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 61d00a8cdf2f..206bbfb34e09 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -693,7 +693,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
bc->event_handler = tick_handle_oneshot_broadcast;
/* Take the do_timer update */
- tick_do_timer_cpu = cpu;
+ if (!tick_nohz_full_cpu(cpu))
+ tick_do_timer_cpu = cpu;
/*
* We must be careful here. There might be other CPUs
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 6176a3e45709..5d3fb100bc06 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
- tick_do_timer_cpu = cpu;
+ if (!tick_nohz_full_cpu(cpu))
+ tick_do_timer_cpu = cpu;
+ else
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
tick_next_period = ktime_get();
tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 225f8bf19095..bc67d4245e1d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -21,11 +21,15 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/irq_work.h>
+#include <linux/posix-timers.h>
+#include <linux/perf_event.h>
#include <asm/irq_regs.h>
#include "tick-internal.h"
+#include <trace/events/timer.h>
+
/*
* Per cpu nohz control structure
*/
@@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now)
{
int cpu = smp_processor_id();
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Check if the do_timer duty was dropped. We don't care about
* concurrency: This happens only when the cpu in charge went
@@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
* this duty, then the jiffies update is still serialized by
* jiffies_lock.
*/
- if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ && !tick_nohz_full_cpu(cpu))
tick_do_timer_cpu = cpu;
#endif
@@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now)
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* When we are idle and the tick is stopped, we have to touch
* the watchdog as we might not schedule for a really long
@@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
profile_tick(CPU_PROFILING);
}
+#ifdef CONFIG_NO_HZ_FULL
+static cpumask_var_t nohz_full_mask;
+bool have_nohz_full_mask;
+
+static bool can_stop_full_tick(void)
+{
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (!sched_can_stop_tick()) {
+ trace_tick_stop(0, "more than 1 task in runqueue\n");
+ return false;
+ }
+
+ if (!posix_cpu_timers_can_stop_tick(current)) {
+ trace_tick_stop(0, "posix timers running\n");
+ return false;
+ }
+
+ if (!perf_event_can_stop_tick()) {
+ trace_tick_stop(0, "perf events running\n");
+ return false;
+ }
+
+ /* sched_clock_tick() needs us? */
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ /*
+ * TODO: kick full dynticks CPUs when
+ * sched_clock_stable is set.
+ */
+ if (!sched_clock_stable) {
+ trace_tick_stop(0, "unstable sched clock\n");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
+
+/*
+ * Re-evaluate the need for the tick on the current CPU
+ * and restart it if necessary.
+ */
+void tick_nohz_full_check(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ if (tick_nohz_full_cpu(smp_processor_id())) {
+ if (ts->tick_stopped && !is_idle_task(current)) {
+ if (!can_stop_full_tick())
+ tick_nohz_restart_sched_tick(ts, ktime_get());
+ }
+ }
+}
+
+static void nohz_full_kick_work_func(struct irq_work *work)
+{
+ tick_nohz_full_check();
+}
+
+static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
+ .func = nohz_full_kick_work_func,
+};
+
+/*
+ * Kick the current CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick(void)
+{
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+}
+
+static void nohz_full_kick_ipi(void *info)
+{
+ tick_nohz_full_check();
+}
+
+/*
+ * Kick all full dynticks CPUs in order to force these to re-evaluate
+ * their dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick_all(void)
+{
+ if (!have_nohz_full_mask)
+ return;
+
+ preempt_disable();
+ smp_call_function_many(nohz_full_mask,
+ nohz_full_kick_ipi, NULL, false);
+ preempt_enable();
+}
+
+/*
+ * Re-evaluate the need for the tick as we switch the current task.
+ * It might need the tick due to per task/process properties:
+ * perf events, posix cpu timers, ...
+ */
+void tick_nohz_task_switch(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (!tick_nohz_full_cpu(smp_processor_id()))
+ goto out;
+
+ if (tick_nohz_tick_stopped() && !can_stop_full_tick())
+ tick_nohz_full_kick();
+
+out:
+ local_irq_restore(flags);
+}
+
+int tick_nohz_full_cpu(int cpu)
+{
+ if (!have_nohz_full_mask)
+ return 0;
+
+ return cpumask_test_cpu(cpu, nohz_full_mask);
+}
+
+/* Parse the boot-time nohz CPU list from the kernel parameters. */
+static int __init tick_nohz_full_setup(char *str)
+{
+ int cpu;
+
+ alloc_bootmem_cpumask_var(&nohz_full_mask);
+ if (cpulist_parse(str, nohz_full_mask) < 0) {
+ pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+ return 1;
+ }
+
+ cpu = smp_processor_id();
+ if (cpumask_test_cpu(cpu, nohz_full_mask)) {
+ pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, nohz_full_mask);
+ }
+ have_nohz_full_mask = true;
+
+ return 1;
+}
+__setup("nohz_full=", tick_nohz_full_setup);
+
+static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ /*
+ * If we handle the timekeeping duty for full dynticks CPUs,
+ * we can't safely shutdown that CPU.
+ */
+ if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
+ return -EINVAL;
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/*
+ * Worst case string length in chunks of CPU range seems 2 steps
+ * separations: 0,2,4,6,...
+ * This is NR_CPUS + sizeof('\0')
+ */
+static char __initdata nohz_full_buf[NR_CPUS + 1];
+
+static int tick_nohz_init_all(void)
+{
+ int err = -1;
+
+#ifdef CONFIG_NO_HZ_FULL_ALL
+ if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
+ pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
+ return err;
+ }
+ err = 0;
+ cpumask_setall(nohz_full_mask);
+ cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
+ have_nohz_full_mask = true;
+#endif
+ return err;
+}
+
+void __init tick_nohz_init(void)
+{
+ int cpu;
+
+ if (!have_nohz_full_mask) {
+ if (tick_nohz_init_all() < 0)
+ return;
+ }
+
+ cpu_notifier(tick_nohz_cpu_down_callback, 0);
+
+ /* Make sure full dynticks CPU are also RCU nocbs */
+ for_each_cpu(cpu, nohz_full_mask) {
+ if (!rcu_is_nocb_cpu(cpu)) {
+ pr_warning("NO_HZ: CPU %d is not RCU nocb: "
+ "cleared from nohz_full range", cpu);
+ cpumask_clear_cpu(cpu, nohz_full_mask);
+ }
+ }
+
+ cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
+ pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
+}
+#else
+#define have_nohz_full_mask (0)
+#endif
+
/*
* NOHZ - aka dynamic tick functionality
*/
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* NO HZ enabled ?
*/
@@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
delta_jiffies = rcu_delta_jiffies;
}
}
+
/*
- * Do not stop the tick, if we are only one off
- * or if the cpu is required for rcu
+ * Do not stop the tick, if we are only one off (or less)
+ * or if the cpu is required for RCU:
*/
- if (!ts->tick_stopped && delta_jiffies == 1)
+ if (!ts->tick_stopped && delta_jiffies <= 1)
goto out;
/* Schedule the tick, if we are at least one jiffie off */
@@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
time_delta = KTIME_MAX;
}
+#ifdef CONFIG_NO_HZ_FULL
+ if (!ts->inidle) {
+ time_delta = min(time_delta,
+ scheduler_tick_max_deferment());
+ }
+#endif
+
/*
* calculate the expiry time for the next timer wheel
* timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
@@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
+ trace_tick_stop(1, " ");
}
/*
@@ -457,6 +687,24 @@ out:
return ret;
}
+static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ int cpu = smp_processor_id();
+
+ if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+ return;
+
+ if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
+ return;
+
+ if (!can_stop_full_tick())
+ return;
+
+ tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+#endif
+}
+
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
/*
@@ -489,6 +737,21 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
}
+ if (have_nohz_full_mask) {
+ /*
+ * Keep the tick alive to guarantee timekeeping progression
+ * if there are full dynticks CPUs around
+ */
+ if (tick_do_timer_cpu == cpu)
+ return false;
+ /*
+ * Boot safety: make sure the timekeeping duty has been
+ * assigned before entering dyntick-idle mode,
+ */
+ if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ return false;
+ }
+
return true;
}
@@ -568,12 +831,13 @@ void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
- if (!ts->inidle)
- return;
-
- /* Cancel the timer because CPU already waken up from the C-states*/
- menu_hrtimer_cancel();
- __tick_nohz_idle_enter(ts);
+ if (ts->inidle) {
+ /* Cancel the timer because CPU already waken up from the C-states*/
+ menu_hrtimer_cancel();
+ __tick_nohz_idle_enter(ts);
+ } else {
+ tick_nohz_full_stop_tick(ts);
+ }
}
/**
@@ -802,7 +1066,7 @@ static inline void tick_check_nohz(int cpu)
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_check_nohz(int cpu) { }
-#endif /* NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
/*
* Called from irq_enter to notify about the possible interruption of idle()
@@ -887,14 +1151,14 @@ void tick_setup_sched_timer(void)
now = ktime_get();
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
if (tick_nohz_enabled)
ts->nohz_mode = NOHZ_MODE_HIGHRES;
#endif
}
#endif /* HIGH_RES_TIMERS */
-#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
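The tick-sched.c additions parse the nohz_full= boot parameter with cpulist_parse() and keep the result in nohz_full_mask, clearing the boot CPU so someone is left to handle timekeeping. A toy userspace parser for the same list syntax, limited to 64 CPUs and doing only minimal validation (parse_cpulist() is illustrative, not the kernel's cpulist_parse()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Turn a list such as "1-3,5" into a 64-bit CPU mask. */
static int parse_cpulist(const char *str, uint64_t *mask)
{
	*mask = 0;
	while (*str) {
		char *end;
		long first = strtol(str, &end, 10);
		long last = first;

		if (end == str || first < 0 || first > 63)
			return -1;
		if (*end == '-') {
			str = end + 1;
			last = strtol(str, &end, 10);
			if (end == str || last < first || last > 63)
				return -1;
		}
		for (long cpu = first; cpu <= last; cpu++)
			*mask |= 1ULL << cpu;
		if (*end == ',')
			end++;
		else if (*end != '\0')
			return -1;
		str = end;
	}
	return 0;
}

int main(void)
{
	uint64_t mask;

	if (parse_cpulist("1-3,5", &mask) == 0)
		printf("nohz_full mask: 0x%llx\n", (unsigned long long)mask);
	return 0;
}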
diff --git a/kernel/timer.c b/kernel/timer.c
index 09bca8ce9771..a860bba34412 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -739,7 +739,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
cpu = smp_processor_id();
-#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
cpu = get_nohz_timer_target();
#endif
@@ -931,14 +931,14 @@ void add_timer_on(struct timer_list *timer, int cpu)
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
/*
- * Check whether the other CPU is idle and needs to be
- * triggered to reevaluate the timer wheel when nohz is
- * active. We are protected against the other CPU fiddling
+ * Check whether the other CPU is in dynticks mode and needs
+ * to be triggered to reevaluate the timer wheel.
+ * We are protected against the other CPU fiddling
* with the timer by holding the timer base lock. This also
- * makes sure that a CPU on the way to idle can not evaluate
- * the timer wheel.
+ * makes sure that a CPU on the way to stop its tick can not
+ * evaluate the timer wheel.
*/
- wake_up_idle_cpu(cpu);
+ wake_up_nohz_cpu(cpu);
spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1189,7 +1189,7 @@ static inline void __run_timers(struct tvec_base *base)
spin_unlock_irq(&base->lock);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Find out when the next timer event is due to happen. This
* is used on S/390 to stop all activity when a CPU is idle.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ed58a3216a6d..b8b8560bfb95 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1808,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i] = '\0';
}
+EXPORT_SYMBOL_GPL(blk_fill_rwbs);
#endif /* CONFIG_EVENT_TRACING */