summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cgroup/cgroup-internal.h 7
-rw-r--r-- kernel/cgroup/cgroup-v1.c 20
-rw-r--r-- kernel/cgroup/cgroup.c 40
-rw-r--r-- kernel/cgroup/cpuset.c 11
-rw-r--r-- kernel/cgroup/namespace.c 2
-rw-r--r-- kernel/compat.c 10
-rw-r--r-- kernel/cpu.c 6
-rw-r--r-- kernel/events/core.c 139
-rw-r--r-- kernel/events/ring_buffer.c 34
-rw-r--r-- kernel/fork.c 3
-rw-r--r-- kernel/futex.c 518
-rw-r--r-- kernel/irq/chip.c 5
-rw-r--r-- kernel/irq/manage.c 6
-rw-r--r-- kernel/kprobes.c 46
-rw-r--r-- kernel/locking/lockdep.c 242
-rw-r--r-- kernel/locking/rtmutex-debug.c 9
-rw-r--r-- kernel/locking/rtmutex-debug.h 3
-rw-r--r-- kernel/locking/rtmutex.c 390
-rw-r--r-- kernel/locking/rtmutex.h 2
-rw-r--r-- kernel/locking/rtmutex_common.h 25
-rw-r--r-- kernel/locking/rwsem.c 6
-rw-r--r-- kernel/locking/test-ww_mutex.c 29
-rw-r--r-- kernel/memremap.c 22
-rw-r--r-- kernel/module.c 42
-rw-r--r-- kernel/nsproxy.c 3
-rw-r--r-- kernel/padata.c 15
-rw-r--r-- kernel/params.c 52
-rw-r--r-- kernel/relay.c 1
-rw-r--r-- kernel/sched/core.c 267
-rw-r--r-- kernel/sched/cpufreq_schedutil.c 82
-rw-r--r-- kernel/sched/cputime.c 27
-rw-r--r-- kernel/sched/fair.c 418
-rw-r--r-- kernel/sched/features.h 7
-rw-r--r-- kernel/sched/rt.c 81
-rw-r--r-- kernel/sched/sched-pelt.h 13
-rw-r--r-- kernel/sched/sched.h 74
-rw-r--r-- kernel/softirq.c 2
-rw-r--r-- kernel/sysctl.c 2
-rw-r--r-- kernel/time/alarmtimer.c 27
-rw-r--r-- kernel/time/clockevents.c 2
-rw-r--r-- kernel/time/hrtimer.c 15
-rw-r--r-- kernel/time/posix-clock.c 10
-rw-r--r-- kernel/time/posix-cpu-timers.c 75
-rw-r--r-- kernel/time/posix-stubs.c 20
-rw-r--r-- kernel/time/posix-timers.c 97
-rw-r--r-- kernel/time/sched_clock.c 5
-rw-r--r-- kernel/time/tick-sched.c 12
-rw-r--r-- kernel/time/time.c 4
-rw-r--r-- kernel/time/timekeeping.c 3
-rw-r--r-- kernel/time/timer.c 2
-rw-r--r-- kernel/time/timer_list.c 6
-rw-r--r-- kernel/trace/Kconfig 2
-rw-r--r-- kernel/trace/blktrace.c 35
-rw-r--r-- kernel/trace/bpf_trace.c 2
-rw-r--r-- kernel/trace/trace.c 3
-rw-r--r-- kernel/trace/trace_kprobe.c 9
-rw-r--r-- kernel/workqueue.c 28
57 files changed, 1917 insertions, 1101 deletions
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 9203bfb05603..00f4d6bf048f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -5,6 +5,7 @@
#include <linux/kernfs.h>
#include <linux/workqueue.h>
#include <linux/list.h>
+#include <linux/refcount.h>
/*
* A cgroup can be associated with multiple css_sets as different tasks may
@@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset)
* can see it. Similar to atomic_dec_and_lock(), but for an
* rwlock
*/
- if (atomic_add_unless(&cset->refcount, -1, 1))
+ if (refcount_dec_not_one(&cset->refcount))
return;
spin_lock_irqsave(&css_set_lock, flags);
@@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset)
*/
static inline void get_css_set(struct css_set *cset)
{
- atomic_inc(&cset->refcount);
+ refcount_inc(&cset->refcount);
}
bool cgroup_ssid_enabled(int ssid);
@@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1dc22f6b49f5..85d75152402d 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += atomic_read(&link->cset->refcount);
+ count += refcount_read(&link->cset->refcount);
spin_unlock_irq(&css_set_lock);
return count;
}
@@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
struct cgroup_subsys *ss;
struct dentry *dentry;
int i, ret;
+ bool new_root = false;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
ret = -ENOMEM;
goto out_unlock;
}
+ new_root = true;
init_cgroup_root(root, &opts);
- ret = cgroup_setup_root(root, opts.subsys_mask);
+ ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
if (ret)
cgroup_free_root(root);
@@ -1201,6 +1203,18 @@ out_free:
CGROUP_SUPER_MAGIC, ns);
/*
+ * There's a race window after we release cgroup_mutex and before
+ * allocating a superblock. Make sure a concurrent process won't
+ * be able to re-use the root during this window by delaying the
+ * initialization of root refcnt.
+ */
+ if (new_root) {
+ mutex_lock(&cgroup_mutex);
+ percpu_ref_reinit(&root->cgrp.self.refcnt);
+ mutex_unlock(&cgroup_mutex);
+ }
+
+ /*
* If @pinned_sb, we're reusing an existing root and holding an
* extra ref on its sb. Mount is complete. Put the extra ref.
*/
@@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
u64 count;
rcu_read_lock();
- count = atomic_read(&task_css_set(current)->refcount);
+ count = refcount_read(&task_css_set(current)->refcount);
rcu_read_unlock();
return count;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 687f5e0194ef..c3c9a0e1b3c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .count = { .counter = 2, },
+ .count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
@@ -436,7 +436,12 @@ out_unlock:
return css;
}
-static void cgroup_get(struct cgroup *cgrp)
+static void __maybe_unused cgroup_get(struct cgroup *cgrp)
+{
+ css_get(&cgrp->self);
+}
+
+static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
@@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css);
* haven't been created.
*/
struct css_set init_css_set = {
- .refcount = ATOMIC_INIT(1),
+ .refcount = REFCOUNT_INIT(1),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
@@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset)
lockdep_assert_held(&css_set_lock);
- if (!atomic_dec_and_test(&cset->refcount))
+ if (!refcount_dec_and_test(&cset->refcount))
return;
/* This css_set is dead. unlink it and release cgroup and css refs */
@@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
if (cgroup_parent(cgrp))
- cgroup_get(cgrp);
+ cgroup_get_live(cgrp);
}
/**
@@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
return NULL;
}
- atomic_set(&cset->refcount, 1);
+ refcount_set(&cset->refcount, 1);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
@@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
root_cgrp->id = ret;
root_cgrp->ancestor_ids[0] = ret;
- ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
- GFP_KERNEL);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
+ ref_flags, GFP_KERNEL);
if (ret)
goto out;
@@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
return ERR_PTR(-EINVAL);
}
cgrp_dfl_visible = true;
- cgroup_get(&cgrp_dfl_root.cgrp);
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
CGROUP2_SUPER_MAGIC, ns);
@@ -2576,7 +2581,7 @@ restart:
if (!css || !percpu_ref_is_dying(&css->refcnt))
continue;
- cgroup_get(dsct);
+ cgroup_get_live(dsct);
prepare_to_wait(&dsct->offline_waitq, &wait,
TASK_UNINTERRUPTIBLE);
@@ -3947,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
{
lockdep_assert_held(&cgroup_mutex);
- cgroup_get(cgrp);
+ cgroup_get_live(cgrp);
memset(css, 0, sizeof(*css));
css->cgroup = cgrp;
@@ -4123,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
atomic_inc(&root->nr_cgrps);
- cgroup_get(parent);
+ cgroup_get_live(parent);
/*
* @cgrp is now fully operational. If something fails after this
@@ -4513,7 +4518,7 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
- BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
mutex_unlock(&cgroup_mutex);
@@ -4947,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path)
if (kn) {
if (kernfs_type(kn) == KERNFS_DIR) {
cgrp = kn->priv;
- cgroup_get(cgrp);
+ cgroup_get_live(cgrp);
} else {
cgrp = ERR_PTR(-ENOTDIR);
}
@@ -5027,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
/* Socket clone path */
if (skcd->val) {
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
cgroup_get(sock_cgroup_ptr(skcd));
return;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f41292be0fb..f6501f4f6040 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2121,10 +2121,8 @@ int __init cpuset_init(void)
{
int err = 0;
- if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
- BUG();
- if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
- BUG();
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -2139,8 +2137,7 @@ int __init cpuset_init(void)
if (err < 0)
return err;
- if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
- BUG();
+ BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
return 0;
}
@@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
rebuild_sched_domains();
}
-void cpuset_update_active_cpus(bool cpu_online)
+void cpuset_update_active_cpus(void)
{
/*
* We're inside cpu hotplug critical region which usually nests
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 96d38dab6fb2..66129eb4371d 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
kfree(new_ns);
return ERR_PTR(ret);
}
- atomic_set(&new_ns->count, 1);
+ refcount_set(&new_ns->count, 1);
new_ns->ns.ops = &cgroupns_operations;
return new_ns;
}
diff --git a/kernel/compat.c b/kernel/compat.c
index 19aec5d98108..933bcb31ae10 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
struct timezone __user *, tz)
{
+ struct timespec64 new_ts;
struct timeval user_tv;
- struct timespec new_ts;
struct timezone new_tz;
if (tv) {
@@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
return -EFAULT;
}
- return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+ return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
@@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
{
struct timespec tu, rmt;
+ struct timespec64 tu64;
mm_segment_t oldfs;
long ret;
if (compat_get_timespec(&tu, rqtp))
return -EFAULT;
- if (!timespec_valid(&tu))
+ tu64 = timespec_to_timespec64(tu);
+ if (!timespec64_valid(&tu64))
return -EINVAL;
oldfs = get_fs();
set_fs(KERNEL_DS);
- ret = hrtimer_nanosleep(&tu,
+ ret = hrtimer_nanosleep(&tu64,
rmtp ? (struct timespec __user *)&rmt : NULL,
HRTIMER_MODE_REL, CLOCK_MONOTONIC);
set_fs(oldfs);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 37b223e4fc05..9ae6fbe5b5cf 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
#endif /* CONFIG_PM_SLEEP_SMP */
+int __boot_cpu_id;
+
#endif /* CONFIG_SMP */
/* Boot processor state steps */
@@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void)
set_cpu_active(cpu, true);
set_cpu_present(cpu, true);
set_cpu_possible(cpu, true);
+
+#ifdef CONFIG_SMP
+ __boot_cpu_id = cpu;
+#endif
}
/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ff01cba86f43..6e75a5c9412d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -48,6 +48,8 @@
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
+#include <linux/proc_ns.h>
+#include <linux/mount.h>
#include "internal.h"
@@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
@@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_dec(&nr_namespaces_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task,
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
+ perf_event_namespaces(task);
}
/*
@@ -6593,6 +6599,132 @@ void perf_event_comm(struct task_struct *task, bool exec)
}
/*
+ * namespaces tracking
+ */
+
+struct perf_namespaces_event {
+ struct task_struct *task;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 nr_namespaces;
+ struct perf_ns_link_info link_info[NR_NAMESPACES];
+ } event_id;
+};
+
+static int perf_event_namespaces_match(struct perf_event *event)
+{
+ return event->attr.namespaces;
+}
+
+static void perf_event_namespaces_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_namespaces_event *namespaces_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_namespaces_match(event))
+ return;
+
+ perf_event_header__init_id(&namespaces_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ namespaces_event->event_id.header.size);
+ if (ret)
+ return;
+
+ namespaces_event->event_id.pid = perf_event_pid(event,
+ namespaces_event->task);
+ namespaces_event->event_id.tid = perf_event_tid(event,
+ namespaces_event->task);
+
+ perf_output_put(&handle, namespaces_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
+ struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct path ns_path;
+ struct inode *ns_inode;
+ void *error;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (!error) {
+ ns_inode = ns_path.dentry->d_inode;
+ ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ ns_link_info->ino = ns_inode->i_ino;
+ }
+}
+
+void perf_event_namespaces(struct task_struct *task)
+{
+ struct perf_namespaces_event namespaces_event;
+ struct perf_ns_link_info *ns_link_info;
+
+ if (!atomic_read(&nr_namespaces_events))
+ return;
+
+ namespaces_event = (struct perf_namespaces_event){
+ .task = task,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_NAMESPACES,
+ .misc = 0,
+ .size = sizeof(namespaces_event.event_id),
+ },
+ /* .pid */
+ /* .tid */
+ .nr_namespaces = NR_NAMESPACES,
+ /* .link_info[NR_NAMESPACES] */
+ },
+ };
+
+ ns_link_info = namespaces_event.event_id.link_info;
+
+ perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
+ task, &mntns_operations);
+
+#ifdef CONFIG_USER_NS
+ perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
+ task, &userns_operations);
+#endif
+#ifdef CONFIG_NET_NS
+ perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
+ task, &netns_operations);
+#endif
+#ifdef CONFIG_UTS_NS
+ perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
+ task, &utsns_operations);
+#endif
+#ifdef CONFIG_IPC_NS
+ perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
+ task, &ipcns_operations);
+#endif
+#ifdef CONFIG_PID_NS
+ perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
+ task, &pidns_operations);
+#endif
+#ifdef CONFIG_CGROUPS
+ perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
+ task, &cgroupns_operations);
+#endif
+
+ perf_iterate_sb(perf_event_namespaces_output,
+ &namespaces_event,
+ NULL);
+}
+
+/*
* mmap tracking
*/
@@ -9146,6 +9278,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_inc(&nr_namespaces_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
@@ -9691,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EACCES;
}
+ if (attr.namespaces) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 257fa460b846..2831480c63a2 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->paused = 1;
}
+void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
+{
+ /*
+ * OVERWRITE is determined by perf_aux_output_end() and can't
+ * be passed in directly.
+ */
+ if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
+ return;
+
+ handle->aux_flags |= flags;
+}
+EXPORT_SYMBOL_GPL(perf_aux_output_flag);
+
/*
* This is called before hardware starts writing to the AUX area to
* obtain an output handle and make sure there's room in the buffer.
@@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
handle->event = event;
handle->head = aux_head;
handle->size = 0;
+ handle->aux_flags = 0;
/*
* In overwrite mode, AUX data stores do not depend on aux_tail,
@@ -408,34 +422,32 @@ err:
* of the AUX buffer management code is that after pmu::stop(), the AUX
* transaction must be stopped and therefore drop the AUX reference count.
*/
-void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
- bool truncated)
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
{
+ bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
struct ring_buffer *rb = handle->rb;
- bool wakeup = truncated;
unsigned long aux_head;
- u64 flags = 0;
-
- if (truncated)
- flags |= PERF_AUX_FLAG_TRUNCATED;
/* in overwrite mode, driver provides aux_head via handle */
if (rb->aux_overwrite) {
- flags |= PERF_AUX_FLAG_OVERWRITE;
+ handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
local_set(&rb->aux_head, aux_head);
} else {
+ handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
+
aux_head = local_read(&rb->aux_head);
local_add(size, &rb->aux_head);
}
- if (size || flags) {
+ if (size || handle->aux_flags) {
/*
* Only send RECORD_AUX if we have something useful to communicate
*/
- perf_event_aux_event(handle->event, aux_head, size, flags);
+ perf_event_aux_event(handle->event, aux_head, size,
+ handle->aux_flags);
}
aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
@@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
}
if (wakeup) {
- if (truncated)
+ if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
handle->event->pending_disable = 1;
perf_output_wakeup(handle);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c463c80e93d..3a4343cdfe90 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,6 +1438,7 @@ static void rt_mutex_init_task(struct task_struct *p)
#ifdef CONFIG_RT_MUTEXES
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
+ p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
}
@@ -2352,6 +2353,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
}
+ perf_event_namespaces(current);
+
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
diff --git a/kernel/futex.c b/kernel/futex.c
index 45858ec73941..357348a6cf6b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void)
return 0;
}
-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;
@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void)
return pi_state;
}
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* Look up the task based on what TID userspace gave us.
* We dont trust it.
*/
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;
@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
-
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
*
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/
/*
@@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr)
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ u32 uval2;
+ int ret;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
if (unlikely(!pi_state))
return -EINVAL;
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));
/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}
/*
@@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}
/*
@@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
- atomic_inc(&pi_state->refcount);
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
/*
@@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
@@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
return 0;
}
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
- struct futex_q *match = futex_top_waiter(hb, key);
+ struct futex_q *top_waiter = futex_top_waiter(hb, key);
/*
* If there is a waiter on that futex, validate it and
* attach to the pi_state when the validation succeeds.
*/
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
@@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}
@@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct task_struct *task, int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
/*
@@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
*/
- match = futex_top_waiter(hb, key);
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
- * The waiting task can free the futex_q as soon as
- * q->lock_ptr = NULL is written, without taking any locks. A
- * memory barrier is required here to prevent the following
- * store to lock_ptr from getting ahead of the plist_del.
+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+ * is written, without taking any locks. This is possible in the event
+ * of a spurious wakeup, for example. A memory barrier is required here
+ * to prevent the following store to lock_ptr from getting ahead of the
+ * plist_del in __unqueue_futex().
*/
- smp_wmb();
- q->lock_ptr = NULL;
+ smp_store_release(&q->lock_ptr, NULL);
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
- struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ struct task_struct *new_owner;
+ bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
- bool deboost;
int ret = 0;
- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
/*
- * It is possible that the next waiter (the one that brought
- * this owner to the kernel) timed out and is no longer
- * waiting on the lock.
- */
- if (!new_owner)
- new_owner = this->task;
-
- /*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
@@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else
ret = -EINVAL;
}
- if (ret) {
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * This is a point of no return; once we modify the uval there is no
+ * going back and subsequent operations must not fail.
+ */
raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
@@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- /*
- * First unlock HB so the waiter does not spin on it once he got woken
- * up. Second wake up the waiter before the priority is adjusted. If we
- * deboost first (and lose our higher priority), then the task might get
- * scheduled away before the wake up can take place.
- */
- spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- if (deboost)
- rt_mutex_adjust_prio(current);
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);
- return 0;
+ return ret;
}
/*
@@ -1826,7 +1913,7 @@ retry_private:
* If that call succeeds then we have pi_state and an
* initial refcount on it.
*/
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -1909,7 +1996,7 @@ retry_private:
* refcount on the pi_state and store the pointer in
* the futex_q object of the waiter.
*/
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
@@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb)
hb_waiters_dec(hb);
}
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
@@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __queue_me(q, hb);
spin_unlock(&hb->lock);
}
@@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
u32 uval, uninitialized_var(curval), newval;
+ struct task_struct *oldowner;
int ret;
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ oldowner = pi_state->owner;
/* Owner died? */
if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED;
@@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
/*
* We are here either because we stole the rtmutex from the
* previous highest priority waiter or we are the highest priority
- * waiter but failed to get the rtmutex the first time.
+ * waiter but have failed to get the rtmutex the first time.
+ *
* We have to replace the newowner TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
@@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
- while (1) {
+ for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2169,47 +2264,60 @@ retry:
* itself.
*/
if (pi_state->owner != NULL) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
}
pi_state->owner = newowner;
- raw_spin_lock_irq(&newowner->pi_lock);
+ raw_spin_lock(&newowner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock_irq(&newowner->pi_lock);
+ raw_spin_unlock(&newowner->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
return 0;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires revalidating the state.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
ret = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Check if someone else fixed it for us:
*/
- if (pi_state->owner != oldowner)
- return 0;
+ if (pi_state->owner != oldowner) {
+ ret = 0;
+ goto out_unlock;
+ }
if (ret)
- return ret;
+ goto out_unlock;
goto retry;
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
static long futex_wait_restart(struct restart_block *restart);
@@ -2231,13 +2339,16 @@ static long futex_wait_restart(struct restart_block *restart);
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
- struct task_struct *owner;
int ret = 0;
if (locked) {
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
+ *
+ * We can safely read pi_state->owner without holding wait_lock
+ * because we now own the rt_mutex, only the owner will attempt
+ * to change it.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2245,43 +2356,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
}
/*
- * Catch the rare case, where the lock was released when we were on the
- * way back before we locked the hash bucket.
- */
- if (q->pi_state->owner == current) {
- /*
- * Try to get the rt_mutex now. This might fail as some other
- * task acquired the rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
- locked = 1;
- goto out;
- }
-
- /*
- * pi_state is incorrect, some other task did a lock steal and
- * we returned due to timeout or signal without taking the
- * rt_mutex. Too late.
- */
- raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- if (!owner)
- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
- ret = fixup_pi_state_owner(uaddr, q, owner);
- goto out;
- }
-
- /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
"pi-state %p\n", ret,
q->pi_state->pi_mutex.owner,
q->pi_state->owner);
+ }
out:
return ret ? ret : locked;
@@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2557,25 +2642,68 @@ retry_private:
}
}
+ WARN_ON(!q.pi_state);
+
/*
* Only actually queue now that the atomic ops are done:
*/
- queue_me(&q, hb);
+ __queue_me(&q, hb);
- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
}
+ rt_mutex_init_waiter(&rt_waiter);
+
+ /*
+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even though we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling rt_mutex_start_proxy_lock(). This still fully
+ * serializes against futex_unlock_pi() as that does the exact same
+ * lock handoff sequence.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
+ spin_lock(q.lock_ptr);
+ goto no_block;
+ }
+
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
spin_lock(q.lock_ptr);
/*
+ * If we failed to acquire the lock (signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
+ *
+ * In particular, it is important that futex_unlock_pi() cannot
+ * observe this inconsistency.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
+ /*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
@@ -2591,12 +2719,19 @@ retry_private:
* If fixup_owner() faulted and was unable to handle the fault, unlock
* it and return the fault to userspace.
*/
- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
goto out_put_key;
out_unlock_put_key:
@@ -2605,8 +2740,10 @@ out_unlock_put_key:
out_put_key:
put_futex_key(&q.key);
out:
- if (to)
+ if (to) {
+ hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
+ }
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
@@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
retry:
@@ -2657,12 +2794,37 @@ retry:
* all and we at least want to know if user space fiddled
* with the futex value instead of blindly unlocking.
*/
- match = futex_top_waiter(hb, &key);
- if (match) {
- ret = wake_futex_pi(uaddr, uval, match, hb);
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
/*
- * In case of success wake_futex_pi dropped the hash
- * bucket lock.
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
+ /*
+ * Success, we're done! No tricky corner cases.
*/
if (!ret)
goto out_putkey;
@@ -2677,7 +2839,6 @@ retry:
* setting the FUTEX_WAITERS bit. Try again.
*/
if (ret == -EAGAIN) {
- spin_unlock(&hb->lock);
put_futex_key(&key);
goto retry;
}
@@ -2685,7 +2846,7 @@ retry:
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_unlock;
+ goto out_putkey;
}
/*
@@ -2695,8 +2856,10 @@ retry:
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ spin_unlock(&hb->lock);
goto pi_faulted;
+ }
/*
* If uval has changed, let user space handle it.
@@ -2710,7 +2873,6 @@ out_putkey:
return ret;
pi_faulted:
- spin_unlock(&hb->lock);
put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
@@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
@@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
+ rt_mutex_init_waiter(&rt_waiter);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
@@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
- debug_rt_mutex_free_waiter(&rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
spin_lock(q.lock_ptr);
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
@@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* the fault, unlock the rt_mutex and return the fault to
* userspace.
*/
- if (ret && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index be3c34e4f2ac..686be4b73018 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
- action_ret = action->thread_fn(action->irq, action->dev_id);
+ action_ret = IRQ_NONE;
+ for_each_action_of_desc(desc, action)
+ action_ret |= action->thread_fn(action->irq, action->dev_id);
+
if (!noirqdebug)
note_interrupt(desc, action_ret);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4afe5cc5af1..ae1c90f20381 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -852,7 +852,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
* This code is triggered unconditionally. Check the affinity
* mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
*/
- if (desc->irq_common_data.affinity)
+ if (cpumask_available(desc->irq_common_data.affinity))
cpumask_copy(mask, desc->irq_common_data.affinity);
else
valid = false;
@@ -1212,8 +1212,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* set the trigger type must match. Also all must
* agree on ONESHOT.
*/
+ unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
+
if (!((old->flags & new->flags) & IRQF_SHARED) ||
- ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
+ (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
((old->flags ^ new->flags) & IRQF_ONESHOT))
goto mismatch;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 699c5bc51a92..d733479a10ee 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1391,21 +1391,19 @@ bool within_kprobe_blacklist(unsigned long addr)
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
+ const char *symbol_name, unsigned int offset)
{
- kprobe_opcode_t *addr = p->addr;
-
- if ((p->symbol_name && p->addr) ||
- (!p->symbol_name && !p->addr))
+ if ((symbol_name && addr) || (!symbol_name && !addr))
goto invalid;
- if (p->symbol_name) {
- kprobe_lookup_name(p->symbol_name, addr);
+ if (symbol_name) {
+ kprobe_lookup_name(symbol_name, addr);
if (!addr)
return ERR_PTR(-ENOENT);
}
- addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
+ addr = (kprobe_opcode_t *)(((char *)addr) + offset);
if (addr)
return addr;
@@ -1413,6 +1411,11 @@ invalid:
return ERR_PTR(-EINVAL);
}
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+{
+ return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+}
+
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
@@ -1740,11 +1743,12 @@ void unregister_kprobes(struct kprobe **kps, int num)
}
EXPORT_SYMBOL_GPL(unregister_kprobes);
-int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data)
+int __weak kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
return NOTIFY_DONE;
}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
@@ -1875,6 +1879,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
+bool __weak arch_function_offset_within_entry(unsigned long offset)
+{
+ return !offset;
+}
+
+bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+{
+ kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+
+ if (IS_ERR(kp_addr))
+ return false;
+
+ if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
+ !arch_function_offset_within_entry(offset))
+ return false;
+
+ return true;
+}
+
int register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
@@ -1882,6 +1905,9 @@ int register_kretprobe(struct kretprobe *rp)
int i;
void *addr;
+ if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
+ return -EINVAL;
+
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
if (IS_ERR(addr))
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a95e5d1f4a9c..98dd6231d43b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -660,6 +660,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ bool is_static = false;
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
debug_locks_off();
@@ -673,10 +674,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
/*
* Static locks do not have their class-keys yet - for them the key
- * is the lock object itself:
+ * is the lock object itself. If the lock is in the per cpu area,
+ * the canonical address of the lock (per cpu offset removed) is
+ * used.
*/
- if (unlikely(!lock->key))
- lock->key = (void *)lock;
+ if (unlikely(!lock->key)) {
+ unsigned long can_addr, addr = (unsigned long)lock;
+
+ if (__is_kernel_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (__is_module_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (static_obj(lock))
+ lock->key = (void *)lock;
+ else
+ return ERR_PTR(-EINVAL);
+ is_static = true;
+ }
/*
* NOTE: the class-key must be unique. For dynamic locks, a static
@@ -708,7 +722,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
}
}
- return NULL;
+ return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
}
/*
@@ -726,19 +740,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
class = look_up_lock_class(lock, subclass);
- if (likely(class))
+ if (likely(!IS_ERR_OR_NULL(class)))
goto out_set_class_cache;
/*
* Debug-check: all keys must be persistent!
- */
- if (!static_obj(lock->key)) {
+ */
+ if (IS_ERR(class)) {
debug_locks_off();
printk("INFO: trying to register non-static key.\n");
printk("the code is fine but needs lockdep annotation.\n");
printk("turning off the locking correctness validator.\n");
dump_stack();
-
return NULL;
}
@@ -3419,7 +3432,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
* Clearly if the lock hasn't been acquired _ever_, we're not
* holding it either, so report failure.
*/
- if (!class)
+ if (IS_ERR_OR_NULL(class))
return 0;
/*
@@ -3437,13 +3450,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
return 0;
}
+/* @depth must not be zero */
+static struct held_lock *find_held_lock(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned int depth, int *idx)
+{
+ struct held_lock *ret, *hlock, *prev_hlock;
+ int i;
+
+ i = depth - 1;
+ hlock = curr->held_locks + i;
+ ret = hlock;
+ if (match_held_lock(hlock, lock))
+ goto out;
+
+ ret = NULL;
+ for (i--, prev_hlock = hlock--;
+ i >= 0;
+ i--, prev_hlock = hlock--) {
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock->irq_context != hlock->irq_context) {
+ ret = NULL;
+ break;
+ }
+ if (match_held_lock(hlock, lock)) {
+ ret = hlock;
+ break;
+ }
+ }
+
+out:
+ *idx = i;
+ return ret;
+}
+
+static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
+ int idx)
+{
+ struct held_lock *hlock;
+
+ for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
+ if (!__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass,
+ hlock->trylock,
+ hlock->read, hlock->check,
+ hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references, hlock->pin_count))
+ return 1;
+ }
+ return 0;
+}
+
static int
__lock_set_class(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, unsigned int subclass,
unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class *class;
unsigned int depth;
int i;
@@ -3456,21 +3523,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
if (DEBUG_LOCKS_WARN_ON(!depth))
return 0;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
- }
- return print_unlock_imbalance_bug(curr, lock, ip);
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
-found_it:
lockdep_init_map(lock, name, key, 0);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes + 1;
@@ -3478,15 +3534,46 @@ found_it:
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- for (; i < depth; i++) {
- hlock = curr->held_locks + i;
- if (!__lock_acquire(hlock->instance,
- hlock_class(hlock)->subclass, hlock->trylock,
- hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
- return 0;
- }
+ if (reacquire_held_locks(curr, depth, i))
+ return 0;
+
+ /*
+ * I took it apart and put it back together again, except now I have
+ * these 'spare' parts.. where shall I put them.
+ */
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ return 0;
+ return 1;
+}
+
+static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned int depth;
+ int i;
+
+ depth = curr->lockdep_depth;
+ /*
+ * This function is about downgrading a held lock to a read lock,
+ * yet we're not actually holding any locks. Naughty user!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ WARN(hlock->read, "downgrading a read lock");
+ hlock->read = 1;
+ hlock->acquire_ip = ip;
+
+ if (reacquire_held_locks(curr, depth, i))
+ return 0;
/*
* I took it apart and put it back together again, except now I have
@@ -3508,7 +3595,7 @@ static int
__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
unsigned int depth;
int i;
@@ -3527,21 +3614,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
* Check whether the lock exists in the current stack
* of held locks:
*/
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
- }
- return print_unlock_imbalance_bug(curr, lock, ip);
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
-found_it:
if (hlock->instance == lock)
lock_release_holdtime(hlock);
@@ -3568,15 +3644,8 @@ found_it:
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- for (i++; i < depth; i++) {
- hlock = curr->held_locks + i;
- if (!__lock_acquire(hlock->instance,
- hlock_class(hlock)->subclass, hlock->trylock,
- hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
- return 0;
- }
+ if (reacquire_held_locks(curr, depth, i + 1))
+ return 0;
/*
* We had N bottles of beer on the wall, we drank one, but now
@@ -3741,6 +3810,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
}
EXPORT_SYMBOL_GPL(lock_set_class);
+void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ check_flags(flags);
+ if (__lock_downgrade(lock, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_downgrade);
+
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
@@ -3903,7 +3989,7 @@ static void
__lock_contended(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class_stats *stats;
unsigned int depth;
int i, contention_point, contending_point;
@@ -3916,22 +4002,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, ip);
+ return;
}
- print_lock_contention_bug(curr, lock, ip);
- return;
-found_it:
if (hlock->instance != lock)
return;
@@ -3955,7 +4031,7 @@ static void
__lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class_stats *stats;
unsigned int depth;
u64 now, waittime = 0;
@@ -3969,22 +4045,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, _RET_IP_);
+ return;
}
- print_lock_contention_bug(curr, lock, _RET_IP_);
- return;
-found_it:
if (hlock->instance != lock)
return;
@@ -4172,7 +4238,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
* If the class exists we look it up and zap it:
*/
class = look_up_lock_class(lock, j);
- if (class)
+ if (!IS_ERR_OR_NULL(class))
zap_class(class);
}
/*
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 97ee9df32e0f..32fe775a2eaf 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -174,12 +174,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
lock->name = name;
}
-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index d0519c3432b6..b585af9a1b50 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
* This file contains macros used solely by rtmutex.c. Debug version.
*/
-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6edc32ecd9c5..b95509416909 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#endif
+/*
+ * Only use with rt_mutex_waiter_{less,equal}()
+ */
+#define task_to_waiter(p) \
+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+
static inline int
rt_mutex_waiter_less(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
@@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
* then right waiter has a dl_prio() too.
*/
if (dl_prio(left->prio))
- return dl_time_before(left->task->dl.deadline,
- right->task->dl.deadline);
+ return dl_time_before(left->deadline, right->deadline);
return 0;
}
+static inline int
+rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
+ struct rt_mutex_waiter *right)
+{
+ if (left->prio != right->prio)
+ return 0;
+
+ /*
+ * If both waiters have dl_prio(), we check the deadlines of the
+ * associated tasks.
+ * If left waiter has a dl_prio(), and we didn't return 0 above,
+ * then right waiter has a dl_prio() too.
+ */
+ if (dl_prio(left->prio))
+ return left->deadline == right->deadline;
+
+ return 1;
+}
+
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
@@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
-
- return min(task_top_pi_waiter(task)->prio,
- task->normal_prio);
-}
-
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
+static void rt_mutex_adjust_prio(struct task_struct *p)
{
- if (likely(!task_has_pi_waiters(task)))
- return NULL;
-
- return task_top_pi_waiter(task)->task;
-}
+ struct task_struct *pi_task = NULL;
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
- if (!task_has_pi_waiters(task))
- return newprio;
+ lockdep_assert_held(&p->pi_lock);
- if (task_top_pi_waiter(task)->task->prio <= newprio)
- return task_top_pi_waiter(task)->task->prio;
- return newprio;
-}
+ if (task_has_pi_waiters(p))
+ pi_task = task_top_pi_waiter(p)->task;
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
-{
- int prio = rt_mutex_getprio(task);
-
- if (task->prio != prio || dl_prio(prio))
- rt_mutex_setprio(task, prio);
-}
-
-/*
- * Adjust task priority (undo boosting). Called from the exit path of
- * rt_mutex_slowunlock() and rt_mutex_slowlock().
- *
- * (Note: We do this outside of the protection of lock->wait_lock to
- * allow the lock to be taken while or before we readjust the priority
- * of task. We do not use the spin_xx_mutex() variants here as we are
- * outside of the debug path.)
- */
-void rt_mutex_adjust_prio(struct task_struct *task)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
- __rt_mutex_adjust_prio(task);
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ rt_mutex_setprio(p, pi_task);
}
/*
@@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (waiter->prio == task->prio) {
+ if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/* [7] Requeue the waiter in the lock waiter tree. */
rt_mutex_dequeue(lock, waiter);
+
+ /*
+ * Update the waiter prio fields now that we're dequeued.
+ *
+ * These values can have changed through either:
+ *
+ * sys_sched_set_scheduler() / sys_sched_setattr()
+ *
+ * or
+ *
+ * DL CBS enforcement advancing the effective deadline.
+ *
+ * Even though pi_waiters also uses these fields, and that tree is only
+ * updated in [11], we can do this here, since we hold [L], which
+ * serializes all pi_waiters access and rb_erase() does not care about
+ * the values of the node being removed.
+ */
waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
+
rt_mutex_enqueue(lock, waiter);
/* [8] Release the task */
@@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else if (prerequeue_top_waiter == waiter) {
/*
@@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
+ lockdep_assert_held(&lock->wait_lock);
+
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* the top waiter priority (kernel view),
* @task lost.
*/
- if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+ if (!rt_mutex_waiter_less(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -938,8 +928,6 @@ takeit:
*/
rt_mutex_set_owner(lock, task);
- rt_mutex_deadlock_account_lock(lock, task);
-
return 1;
}
@@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex *next_lock;
int chain_walk = 0, res;
+ lockdep_assert_held(&lock->wait_lock);
+
/*
* Early deadlock detection. We really don't want the task to
* enqueue on itself just to untangle the mess later. It's not
@@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
waiter = rt_mutex_top_waiter(lock);
/*
- * Remove it from current->pi_waiters. We do not adjust a
- * possible priority boost right now. We execute wakeup in the
- * boosted mode and go back to normal after releasing
- * lock->wait_lock.
+ * Remove it from current->pi_waiters and deboost.
+ *
+ * We must in fact deboost here in order to ensure we call
+ * rt_mutex_setprio() to update p->pi_top_task before the
+ * task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
+ rt_mutex_adjust_prio(current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock(&current->pi_lock);
-
+ /*
+ * We deboosted before waking the top waiter task such that we don't
+ * run two tasks with the 'same' priority (and ensure the
+ * p->pi_top_task pointer points to a blocked task). This however can
+ * lead to priority inversion if we would get preempted after the
+ * deboost but before waking our donor task, hence the preempt_disable()
+ * before unlock.
+ *
+ * Pairs with preempt_enable() in rt_mutex_postunlock();
+ */
+ preempt_disable();
wake_q_add(wake_q, waiter->task);
+ raw_spin_unlock(&current->pi_lock);
}
/*
@@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock,
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
@@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
@@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || (waiter->prio == task->prio &&
- !dl_prio(task->prio))) {
+ if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
@@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task)
next_lock, NULL, task);
}
+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->task = NULL;
+}
+
/**
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
@@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
unsigned long flags;
int ret = 0;
- debug_rt_mutex_init_waiter(&waiter);
- RB_CLEAR_NODE(&waiter.pi_tree_entry);
- RB_CLEAR_NODE(&waiter.tree_entry);
+ rt_mutex_init_waiter(&waiter);
/*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
@@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
/*
* Slow path to release a rt-mutex.
- * Return whether the current task needs to undo a potential priority boosting.
+ *
+ * Return whether the current task needs to call rt_mutex_postunlock().
*/
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
@@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
debug_rt_mutex_unlock(lock);
- rt_mutex_deadlock_account_unlock(current);
-
/*
* We must be careful here if the fast path is enabled. If we
* have no waiters queued we cannot set owner to NULL here
@@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
* Queue the next waiter for wakeup once we release the wait_lock.
*/
mark_wakeup_next_waiter(wake_q, lock);
-
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- /* check PI boosting */
- return true;
+ return true; /* call rt_mutex_postunlock() */
}
/*
@@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
}
static inline int
@@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, timeout, chwalk);
+
+ return slowfn(lock, state, timeout, chwalk);
}
static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 1;
- }
+
return slowfn(lock);
}
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q)
+{
+ wake_up_q(wake_q);
+
+ /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
+ preempt_enable();
+}
+
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
bool (*slowfn)(struct rt_mutex *lock,
@@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
{
DEFINE_WAKE_Q(wake_q);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
-
- } else {
- bool deboost = slowfn(lock, &wake_q);
-
- wake_up_q(&wake_q);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
- }
+ if (slowfn(lock, &wake_q))
+ rt_mutex_postunlock(&wake_q);
}
/**
@@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/*
- * Futex variant with full deadlock detection.
+ * Futex variant, must not use fastpath.
*/
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout)
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
{
- might_sleep();
-
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_FULL_CHAINWALK,
- rt_mutex_slowlock);
+ return rt_mutex_slowtrylock(lock);
}
/**
@@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
*/
-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh)
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q)
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
- return false;
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
}
- return rt_mutex_slowunlock(lock, wqh);
+
+ /*
+ * We've already deboosted, mark_wakeup_next_waiter() will
+ * retain preempt_disabled when we drop the wait_lock, to
+ * avoid inversion prior to the wakeup. preempt_disable()
+ * therein pairs with rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wake_q, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+ DEFINE_WAKE_Q(wake_q);
+ bool postunlock;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);
}
/**
@@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
__rt_mutex_init(lock, NULL);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
- rt_mutex_deadlock_account_lock(lock, proxy_owner);
}
/**
@@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
{
debug_rt_mutex_proxy_unlock(lock);
rt_mutex_set_owner(lock, NULL);
- rt_mutex_deadlock_account_unlock(proxy_owner);
}
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task)
{
int ret;
- raw_spin_lock_irq(&lock->wait_lock);
-
- if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock_irq(&lock->wait_lock);
+ if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
- }
/* We enforce deadlock detection for futexes */
ret = task_blocks_on_rt_mutex(lock, waiter, task,
@@ -1703,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (unlikely(ret))
remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
-
debug_rt_mutex_print_deadlock(waiter);
return ret;
}
/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
* rt_mutex_next_owner - return the next owner of the lock
*
* @lock: the rt lock query
@@ -1731,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
}
/**
- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
* @lock: the rt_mutex we were woken on
* @to: the timeout, null if none. hrtimer should already have
* been started.
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Complete the lock acquisition started our behalf by another thread.
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
*
* Returns:
* 0 - success
* <0 - error, one of -EINTR, -ETIMEDOUT
*
- * Special API call for PI-futex requeue support
+ * Special API call for PI-futex support
*/
-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
struct rt_mutex_waiter *waiter)
{
@@ -1758,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
/* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
- if (unlikely(ret))
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we are done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregard its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
remove_waiter(lock, waiter);
+ fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
@@ -1769,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
raw_spin_unlock_irq(&lock->wait_lock);
- return ret;
+ return cleanup;
}
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index c4060584c407..6607802efa8b 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
*/
#define rt_mutex_deadlock_check(l) (0)
-#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
#define debug_rt_mutex_init_waiter(w) do { } while (0)
#define debug_rt_mutex_free_waiter(w) do { } while (0)
#define debug_rt_mutex_lock(l) do { } while (0)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 856dfff5c33a..72ad45a9a794 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -34,6 +34,7 @@ struct rt_mutex_waiter {
struct rt_mutex *deadlock_lock;
#endif
int prio;
+ u64 deadline;
};
/*
@@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
-extern void rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 90a74ccd85a4..4d48b1c4870d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write);
*/
void downgrade_write(struct rw_semaphore *sem)
{
- /*
- * lockdep: a downgraded write will live on as a write
- * dependency.
- */
+ lock_downgrade(&sem->dep_map, _RET_IP_);
+
rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 6b7abb334ca6..39f56c870051 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus)
struct stress {
struct work_struct work;
struct ww_mutex *locks;
+ unsigned long timeout;
int nlocks;
- int nloops;
};
static int *get_random_order(int count)
@@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work)
if (!order)
return;
- ww_acquire_init(&ctx, &ww_class);
-
do {
int contended = -1;
int n, err;
+ ww_acquire_init(&ctx, &ww_class);
retry:
err = 0;
for (n = 0; n < nlocks; n++) {
@@ -433,9 +432,9 @@ retry:
__func__, err);
break;
}
- } while (--stress->nloops);
- ww_acquire_fini(&ctx);
+ ww_acquire_fini(&ctx);
+ } while (!time_after(jiffies, stress->timeout));
kfree(order);
kfree(stress);
@@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work)
kfree(order);
order = NULL;
- ww_acquire_init(&ctx, &ww_class);
-
do {
+ ww_acquire_init(&ctx, &ww_class);
+
list_for_each_entry(ll, &locks, link) {
err = ww_mutex_lock(ll->lock, &ctx);
if (!err)
@@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work)
dummy_load(stress);
list_for_each_entry(ll, &locks, link)
ww_mutex_unlock(ll->lock);
- } while (--stress->nloops);
- ww_acquire_fini(&ctx);
+ ww_acquire_fini(&ctx);
+ } while (!time_after(jiffies, stress->timeout));
out:
list_for_each_entry_safe(ll, ln, &locks, link)
@@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work)
__func__, err);
break;
}
- } while (--stress->nloops);
+ } while (!time_after(jiffies, stress->timeout));
kfree(stress);
}
@@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work)
#define STRESS_ONE BIT(2)
#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
-static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
+static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
int n;
@@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
INIT_WORK(&stress->work, fn);
stress->locks = locks;
stress->nlocks = nlocks;
- stress->nloops = nloops;
+ stress->timeout = jiffies + 2*HZ;
queue_work(wq, &stress->work);
nthreads--;
@@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER);
+ ret = stress(16, 2*ncpus, STRESS_INORDER);
if (ret)
return ret;
- ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER);
+ ret = stress(16, 2*ncpus, STRESS_REORDER);
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL);
+ ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 07e85e5229da..23a6483c3666 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -182,18 +182,6 @@ struct page_map {
struct vmem_altmap altmap;
};
-void get_zone_device_page(struct page *page)
-{
- percpu_ref_get(page->pgmap->ref);
-}
-EXPORT_SYMBOL(get_zone_device_page);
-
-void put_zone_device_page(struct page *page)
-{
- put_dev_pagemap(page->pgmap);
-}
-EXPORT_SYMBOL(put_zone_device_page);
-
static void pgmap_radix_release(struct resource *res)
{
resource_size_t key, align_start, align_size, align_end;
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
struct resource *res = &page_map->res;
resource_size_t align_start, align_size;
struct dev_pagemap *pgmap = &page_map->pgmap;
+ unsigned long pfn;
+
+ for_each_device_pfn(pfn, page_map)
+ put_page(pfn_to_page(pfn));
if (percpu_ref_tryget_live(pgmap->ref)) {
dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
*
* Notes:
* 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
- * (or devm release event).
+ * (or devm release event). The expected order of events is that @ref has
+ * been through percpu_ref_kill() before devm_memremap_pages_release(). The
+ * wait for the completion of all references being dropped and
+ * percpu_ref_exit() must occur after devm_memremap_pages_release().
*
* 2/ @res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
*/
list_del(&page->lru);
page->pgmap = pgmap;
+ percpu_ref_get(ref);
}
devres_add(dev, page_map);
return __va(res->start);
diff --git a/kernel/module.c b/kernel/module.c
index 7eba6dea4f41..6d9988031c5b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -665,16 +665,7 @@ static void percpu_modcopy(struct module *mod,
memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
}
-/**
- * is_module_percpu_address - test whether address is from module static percpu
- * @addr: address to test
- *
- * Test whether @addr belongs to module static percpu area.
- *
- * RETURNS:
- * %true if @addr is from module static percpu area
- */
-bool is_module_percpu_address(unsigned long addr)
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
struct module *mod;
unsigned int cpu;
@@ -688,9 +679,15 @@ bool is_module_percpu_address(unsigned long addr)
continue;
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(mod->percpu, cpu);
-
- if ((void *)addr >= start &&
- (void *)addr < start + mod->percpu_size) {
+ void *va = (void *)addr;
+
+ if (va >= start && va < start + mod->percpu_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(mod->percpu,
+ get_boot_cpu_id());
+ }
preempt_enable();
return true;
}
@@ -701,6 +698,20 @@ bool is_module_percpu_address(unsigned long addr)
return false;
}
+/**
+ * is_module_percpu_address - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * RETURNS:
+ * %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+ return __is_module_percpu_address(addr, NULL);
+}
+
#else /* ... !CONFIG_SMP */
static inline void __percpu *mod_percpu(struct module *mod)
@@ -732,6 +743,11 @@ bool is_module_percpu_address(unsigned long addr)
return false;
}
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ return false;
+}
+
#endif /* CONFIG_SMP */
#define MODINFO_ATTR(field) \
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e59eed..f6c5d330059a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
+#include <linux/perf_event.h>
static struct kmem_cache *nsproxy_cachep;
@@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
goto out;
}
switch_task_namespaces(tsk, new_nsproxy);
+
+ perf_event_namespaces(tsk);
out:
fput(file);
return err;
diff --git a/kernel/padata.c b/kernel/padata.c
index 3202aa17492c..ac8f1e524836 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -154,8 +154,6 @@ EXPORT_SYMBOL(padata_do_parallel);
* A pointer to the control struct of the next object that needs
* serialization, if present in one of the percpu reorder queues.
*
- * NULL, if all percpu reorder queues are empty.
- *
* -EINPROGRESS, if the next object that needs serialization will
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
@@ -182,8 +180,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
cpu = padata_index_to_cpu(pd, next_index);
next_queue = per_cpu_ptr(pd->pqueue, cpu);
- padata = NULL;
-
reorder = &next_queue->reorder;
spin_lock(&reorder->lock);
@@ -235,12 +231,11 @@ static void padata_reorder(struct parallel_data *pd)
padata = padata_get_next(pd);
/*
- * All reorder queues are empty, or the next object that needs
- * serialization is parallel processed by another cpu and is
- * still on it's way to the cpu's reorder queue, nothing to
- * do for now.
+ * If the next object that needs serialization is parallel
+ * processed by another cpu and is still on its way to the
+ * cpu's reorder queue, nothing to do for now.
*/
- if (!padata || PTR_ERR(padata) == -EINPROGRESS)
+ if (PTR_ERR(padata) == -EINPROGRESS)
break;
/*
@@ -354,7 +349,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
- free_cpumask_var(pd->cpumask.cbcpu);
+ free_cpumask_var(pd->cpumask.pcpu);
return -ENOMEM;
}
diff --git a/kernel/params.c b/kernel/params.c
index a6d6149c0fe6..60b2d8101355 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -160,58 +160,6 @@ static int parse_one(char *param,
return -ENOENT;
}
-/* You can use " around spaces, but can't escape ". */
-/* Hyphens and underscores equivalent in parameter names. */
-static char *next_arg(char *args, char **param, char **val)
-{
- unsigned int i, equals = 0;
- int in_quote = 0, quoted = 0;
- char *next;
-
- if (*args == '"') {
- args++;
- in_quote = 1;
- quoted = 1;
- }
-
- for (i = 0; args[i]; i++) {
- if (isspace(args[i]) && !in_quote)
- break;
- if (equals == 0) {
- if (args[i] == '=')
- equals = i;
- }
- if (args[i] == '"')
- in_quote = !in_quote;
- }
-
- *param = args;
- if (!equals)
- *val = NULL;
- else {
- args[equals] = '\0';
- *val = args + equals + 1;
-
- /* Don't include quotes in value. */
- if (**val == '"') {
- (*val)++;
- if (args[i-1] == '"')
- args[i-1] = '\0';
- }
- }
- if (quoted && args[i-1] == '"')
- args[i-1] = '\0';
-
- if (args[i]) {
- args[i] = '\0';
- next = args + i + 1;
- } else
- next = args + i;
-
- /* Chew up trailing spaces. */
- return skip_spaces(next);
-}
-
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
char *parse_args(const char *doing,
char *args,
diff --git a/kernel/relay.c b/kernel/relay.c
index 0e413d9eec8a..39a9dfc69486 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1212,7 +1212,6 @@ static ssize_t subbuf_splice_actor(struct file *in,
.nr_pages = 0,
.nr_pages_max = PIPE_DEF_BUFFERS,
.partial = partial,
- .flags = flags,
.ops = &relay_pipe_buf_ops,
.spd_release = relay_page_release,
};
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..c51147a1204c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000;
cpumask_var_t cpu_isolated_map;
/*
- * this_rq_lock - lock this runqueue and disable interrupts.
- */
-static struct rq *this_rq_lock(void)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- local_irq_disable();
- rq = this_rq();
- raw_spin_lock(&rq->lock);
-
- return rq;
-}
-
-/*
* __task_rq_lock - lock the rq @p resides on.
*/
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq)
return;
#ifdef CONFIG_SCHED_DEBUG
+ if (sched_feat(WARN_DOUBLE_CLOCK))
+ SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
+
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
if (delta < 0)
return;
@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq)
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+ struct rq_flags rf;
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
}
@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq)
static void __hrtick_start(void *arg)
{
struct rq *rq = arg;
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
__hrtick_restart(rq);
rq->hrtick_csd_pending = 0;
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p)
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
- update_rq_clock(rq);
+ if (!(flags & ENQUEUE_NOCLOCK))
+ update_rq_clock(rq);
+
if (!(flags & ENQUEUE_RESTORE))
sched_info_queued(rq, p);
+
p->sched_class->enqueue_task(rq, p, flags);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
- update_rq_clock(rq);
+ if (!(flags & DEQUEUE_NOCLOCK))
+ update_rq_clock(rq);
+
if (!(flags & DEQUEUE_SAVE))
sched_info_dequeued(rq, p);
+
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
*
* Returns (locked) new rq. Old rq's lock is released.
*/
-static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int new_cpu)
{
lockdep_assert_held(&rq->lock);
p->on_rq = TASK_ON_RQ_MIGRATING;
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
rq = cpu_rq(new_cpu);
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, rf);
BUG_ON(task_cpu(p) != new_cpu);
enqueue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -980,7 +977,8 @@ struct migration_arg {
* So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU.
*/
-static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int dest_cpu)
{
if (unlikely(!cpu_active(dest_cpu)))
return rq;
@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return rq;
- rq = move_queued_task(rq, p, dest_cpu);
+ update_rq_clock(rq);
+ rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
}
@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data)
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
+ struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data)
sched_ttwu_pending();
raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data)
*/
if (task_rq(p) == rq) {
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, p, arg->dest_cpu);
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
else
p->wake_cpu = arg->dest_cpu;
}
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
raw_spin_unlock(&p->pi_lock);
local_irq_enable();
@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* holding rq->lock.
*/
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_curr_task(rq, p);
}
@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* OK, since we're going to drop the lock immediately
* afterwards anyway.
*/
- rq_unpin_lock(rq, &rf);
- rq = move_queued_task(rq, p, dest_cpu);
- rq_repin_lock(rq, &rf);
+ rq = move_queued_task(rq, &rf, p, dest_cpu);
}
out:
task_rq_unlock(rq, p, &rf);
@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
+ struct rq_flags srf, drf;
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ rq_pin_lock(src_rq, &srf);
+ rq_pin_lock(dst_rq, &drf);
+
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
+
+ rq_unpin_lock(dst_rq, &drf);
+ rq_unpin_lock(src_rq, &srf);
+
} else {
/*
* Task isn't running anymore; make it appear like we migrated
@@ -1680,7 +1686,7 @@ static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
- int en_flags = ENQUEUE_WAKEUP;
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
lockdep_assert_held(&rq->lock);
@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void)
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
- unsigned long flags;
struct rq_flags rf;
if (!llist)
return;
- raw_spin_lock_irqsave(&rq->lock, flags);
- rq_pin_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
while (llist) {
int wake_flags = 0;
@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, wake_flags, &rf);
}
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
void scheduler_ipi(void)
@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct rq_flags rf;
rcu_read_lock();
@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu)
if (set_nr_if_polling(rq->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
} else {
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
if (is_idle_task(rq->curr))
smp_send_reschedule(cpu);
/* Else CPU is not idle, do nothing here: */
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
out:
@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
}
#endif
- raw_spin_lock(&rq->lock);
- rq_pin_lock(rq, &rf);
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
ttwu_do_activate(rq, p, wake_flags, &rf);
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
* disabled avoiding further scheduler activity on it and we've
* not yet picked a replacement task.
*/
- rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- rq_repin_lock(rq, rf);
+ rq_relock(rq, rf);
}
if (!(p->state & TASK_NORMAL))
@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
delayacct_blkio_end();
atomic_dec(&rq->nr_iowait);
}
- ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
}
ttwu_do_wakeup(rq, p, 0, rf);
@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p)
update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
- activate_task(rq, p, 0);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
@@ -3093,15 +3094,18 @@ void scheduler_tick(void)
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
+ struct rq_flags rf;
sched_clock_tick();
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
+
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
- raw_spin_unlock(&rq->lock);
+
+ rq_unlock(rq, &rf);
perf_event_task_tick();
@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock(&rq->lock);
- rq_pin_lock(rq, &rf);
+ rq_lock(rq, &rf);
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
+ update_rq_clock(rq);
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
prev->on_rq = 0;
if (prev->in_iowait) {
@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt)
switch_count = &prev->nvcsw;
}
- if (task_on_rq_queued(prev))
- update_rq_clock(rq);
-
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt)
rq = context_switch(rq, prev, next, &rf);
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
balance_callback(rq);
@@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function);
#ifdef CONFIG_RT_MUTEXES
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
/*
* rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
*
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
@@ -3682,17 +3697,42 @@ EXPORT_SYMBOL(default_wake_function);
* Used by the rt_mutex code to implement priority inheritance
* logic. Call site only calls if the priority of the task changed.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ int prio, oldprio, queued, running, queue_flag =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class;
struct rq_flags rf;
struct rq *rq;
- BUG_ON(prio > MAX_PRIO);
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+ /*
+ * If nothing changed; bail early.
+ */
+ if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+ return;
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
+ /*
+ * Set under pi_lock && rq->lock, such that the value can be used under
+ * either lock.
+ *
+ * Note that it takes a lot of trickery to make this pointer cache work
+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+ * ensure a task is de-boosted (pi_task is set to NULL) before the
+ * task is allowed to run again (and can exit). This ensures the pointer
+ * points to a blocked task -- which guarantees the task is present.
+ */
+ p->pi_top_task = pi_task;
+
+ /*
+ * For FIFO/RR we only need to set prio, if that matches we're done.
+ */
+ if (prio == p->prio && !dl_prio(prio))
+ goto out_unlock;
/*
* Idle task boosting is a nono in general. There is one
@@ -3712,7 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
goto out_unlock;
}
- trace_sched_pi_setprio(p, prio);
+ trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
if (oldprio == prio)
@@ -3736,7 +3776,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
@@ -3774,6 +3813,11 @@ out_unlock:
balance_callback(rq);
preempt_enable();
}
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
#endif
void set_user_nice(struct task_struct *p, long nice)
@@ -3805,7 +3849,7 @@ void set_user_nice(struct task_struct *p, long nice)
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running)
put_prev_task(rq, p);
@@ -3816,7 +3860,7 @@ void set_user_nice(struct task_struct *p, long nice)
delta = p->prio - old_prio;
if (queued) {
- enqueue_task(rq, p, ENQUEUE_RESTORE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -4020,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* Keep a potential priority boosting if called from
* sched_setscheduler().
*/
+ p->prio = normal_prio(p);
if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
- else
- p->prio = normal_prio(p);
+ p->prio = rt_effective_prio(p, p->prio);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -4126,7 +4169,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class;
struct rq_flags rf;
int reset_on_fork;
- int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
/* May grab non-irq protected spin_locks: */
@@ -4310,7 +4353,7 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ new_effective_prio = rt_effective_prio(p, newprio);
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -4923,7 +4966,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*/
SYSCALL_DEFINE0(sched_yield)
{
- struct rq *rq = this_rq_lock();
+ struct rq_flags rf;
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ rq_lock(rq, &rf);
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
@@ -4932,9 +4980,8 @@ SYSCALL_DEFINE0(sched_yield)
* Since we are going to call schedule() anyway, there's
* no need to preempt or enable interrupts:
*/
- __release(rq->lock);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
- do_raw_spin_unlock(&rq->lock);
+ preempt_disable();
+ rq_unlock(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
@@ -5514,7 +5561,7 @@ void sched_setnuma(struct task_struct *p, int nid)
p->numa_preferred_nid = nid;
if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_curr_task(rq, p);
task_rq_unlock(rq, p, &rf);
@@ -5579,11 +5626,11 @@ static struct task_struct fake_task = {
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct rq_flags rf;
+ struct rq_flags orf = *rf;
int dest_cpu;
/*
@@ -5602,9 +5649,7 @@ static void migrate_tasks(struct rq *dead_rq)
* class method both need to have an up-to-date
* value of rq->clock[_task]
*/
- rq_pin_lock(rq, &rf);
update_rq_clock(rq);
- rq_unpin_lock(rq, &rf);
for (;;) {
/*
@@ -5617,8 +5662,7 @@ static void migrate_tasks(struct rq *dead_rq)
/*
* pick_next_task() assumes pinned rq->lock:
*/
- rq_repin_lock(rq, &rf);
- next = pick_next_task(rq, &fake_task, &rf);
+ next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5631,10 +5675,9 @@ static void migrate_tasks(struct rq *dead_rq)
* because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine.
*/
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
raw_spin_lock(&next->pi_lock);
- raw_spin_lock(&rq->lock);
+ rq_relock(rq, rf);
/*
* Since we're inside stop-machine, _nothing_ should have
@@ -5648,12 +5691,12 @@ static void migrate_tasks(struct rq *dead_rq)
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-
- rq = __migrate_task(rq, next, dest_cpu);
+ rq = __migrate_task(rq, rf, next, dest_cpu);
if (rq != dead_rq) {
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
rq = dead_rq;
- raw_spin_lock(&rq->lock);
+ *rf = orf;
+ rq_relock(rq, rf);
}
raw_spin_unlock(&next->pi_lock);
}
@@ -5732,7 +5775,7 @@ static void cpuset_cpu_active(void)
* cpuset configurations.
*/
}
- cpuset_update_active_cpus(true);
+ cpuset_update_active_cpus();
}
static int cpuset_cpu_inactive(unsigned int cpu)
@@ -5755,7 +5798,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
if (overflow)
return -EBUSY;
- cpuset_update_active_cpus(false);
+ cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
@@ -5766,7 +5809,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct rq_flags rf;
set_cpu_active(cpu, true);
@@ -5784,12 +5827,12 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
*/
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
update_max_interval();
@@ -5847,18 +5890,20 @@ int sched_cpu_starting(unsigned int cpu)
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending();
- raw_spin_lock_irqsave(&rq->lock, flags);
+
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(rq);
+ migrate_tasks(rq, &rf);
BUG_ON(rq->nr_running != 1);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
+
calc_load_migrate(rq);
update_max_interval();
nohz_balance_exit_idle(cpu);
@@ -6412,7 +6457,8 @@ static void sched_change_group(struct task_struct *tsk, int type)
*/
void sched_move_task(struct task_struct *tsk)
{
- int queued, running;
+ int queued, running, queue_flags =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq_flags rf;
struct rq *rq;
@@ -6423,14 +6469,14 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ dequeue_task(rq, tsk, queue_flags);
if (running)
put_prev_task(rq, tsk);
sched_change_group(tsk, TASK_MOVE_GROUP);
if (queued)
- enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+ enqueue_task(rq, tsk, queue_flags);
if (running)
set_curr_task(rq, tsk);
@@ -7008,14 +7054,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
+ struct rq_flags rf;
- raw_spin_lock_irq(&rq->lock);
+ rq_lock_irq(rq, &rf);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 54c577578da6..76877a62b5fa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -61,6 +61,11 @@ struct sugov_cpu {
unsigned long util;
unsigned long max;
unsigned int flags;
+
+ /* The field below is for single-CPU policies only. */
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long saved_idle_calls;
+#endif
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -93,22 +98,23 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
{
struct cpufreq_policy *policy = sg_policy->policy;
+ if (sg_policy->next_freq == next_freq)
+ return;
+
+ if (sg_policy->next_freq > next_freq)
+ next_freq = (sg_policy->next_freq + next_freq) >> 1;
+
+ sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
if (policy->fast_switch_enabled) {
- if (sg_policy->next_freq == next_freq) {
- trace_cpu_frequency(policy->cur, smp_processor_id());
- return;
- }
- sg_policy->next_freq = next_freq;
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
if (next_freq == CPUFREQ_ENTRY_INVALID)
return;
policy->cur = next_freq;
trace_cpu_frequency(next_freq, smp_processor_id());
- } else if (sg_policy->next_freq != next_freq) {
- sg_policy->next_freq = next_freq;
+ } else {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -192,6 +198,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
sg_cpu->iowait_boost >>= 1;
}
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+ unsigned long idle_calls = tick_nohz_get_idle_calls();
+ bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+ sg_cpu->saved_idle_calls = idle_calls;
+ return ret;
+}
+#else
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
@@ -200,6 +219,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util, max;
unsigned int next_f;
+ bool busy;
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -207,40 +227,37 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
+ busy = sugov_cpu_is_busy(sg_cpu);
+
if (flags & SCHED_CPUFREQ_RT_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
sugov_get_util(&util, &max);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
+ /*
+ * Do not reduce the frequency if the CPU has not been idle
+ * recently, as the reduction is likely to be premature then.
+ */
+ if (busy && next_f < sg_policy->next_freq)
+ next_f = sg_policy->next_freq;
}
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
- unsigned long util, unsigned long max,
- unsigned int flags)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned int max_f = policy->cpuinfo.max_freq;
u64 last_freq_update_time = sg_policy->last_freq_update_time;
+ unsigned long util = 0, max = 1;
unsigned int j;
- if (flags & SCHED_CPUFREQ_RT_DL)
- return max_f;
-
- sugov_iowait_boost(sg_cpu, &util, &max);
-
for_each_cpu(j, policy->cpus) {
- struct sugov_cpu *j_sg_cpu;
+ struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
s64 delta_ns;
- if (j == smp_processor_id())
- continue;
-
- j_sg_cpu = &per_cpu(sugov_cpu, j);
/*
* If the CPU utilization was last updated before the previous
* frequency update and the time elapsed between the last update
@@ -254,7 +271,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
- return max_f;
+ return policy->cpuinfo.max_freq;
j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
@@ -289,7 +306,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
+ if (flags & SCHED_CPUFREQ_RT_DL)
+ next_f = sg_policy->policy->cpuinfo.max_freq;
+ else
+ next_f = sugov_next_freq_shared(sg_cpu);
+
sugov_update_commit(sg_policy, time, next_f);
}
@@ -473,7 +494,6 @@ static int sugov_init(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy;
struct sugov_tunables *tunables;
- unsigned int lat;
int ret = 0;
/* State should be equivalent to EXIT */
@@ -512,10 +532,16 @@ static int sugov_init(struct cpufreq_policy *policy)
goto stop_kthread;
}
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
- lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (lat)
- tunables->rate_limit_us *= lat;
+ if (policy->transition_delay_us) {
+ tunables->rate_limit_us = policy->transition_delay_us;
+ } else {
+ unsigned int lat;
+
+ tunables->rate_limit_us = LATENCY_MULTIPLIER;
+ lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+ if (lat)
+ tunables->rate_limit_us *= lat;
+ }
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b46c8..aea3135c5d90 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ u64_stats_update_begin(&irqtime->sync);
+ cpustat[idx] += delta;
+ irqtime->total += delta;
+ irqtime->tick_delta += delta;
+ u64_stats_update_end(&irqtime->sync);
+}
+
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count()) {
- cpustat[CPUTIME_IRQ] += delta;
- irqtime->tick_delta += delta;
- } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
- cpustat[CPUTIME_SOFTIRQ] += delta;
- irqtime->tick_delta += delta;
- }
-
- u64_stats_update_end(&irqtime->sync);
+ if (hardirq_count())
+ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dea138964b91..a903276fcb62 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
+
+#include "sched-pelt.h"
+
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
- * dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
-
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
{
@@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
-static const u32 runnable_avg_yN_inv[] = {
- 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
- 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
- 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
- 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
- 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
- 0x85aac367, 0x82cd8698,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
- * over-estimates when re-combining.
- */
-static const u32 runnable_avg_yN_sum[] = {
- 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
- 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
- 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers. See Documentation/scheduler/sched-avg.txt how these
- * were generated:
- */
-static const u32 __accumulated_sum_N32[] = {
- 0, 23371, 35056, 40899, 43820, 45281,
- 46011, 46376, 46559, 46650, 46696, 46719,
-};
-
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;
- if (!n)
- return val;
- else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ if (unlikely(n > LOAD_AVG_PERIOD * 63))
return 0;
/* after bounds checking we can collapse to 32-bit */
@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n)
return val;
}
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
+{
+ u32 c1, c2, c3 = d3; /* y^0 == 1 */
+
+ /*
+ * c1 = d1 y^p
+ */
+ c1 = decay_load((u64)d1, periods);
+
+ /*
+ * p-1
+ * c2 = 1024 \Sum y^n
+ * n=1
+ *
+ * inf inf
+ * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+ * n=0 n=p
+ */
+ c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+ return c1 + c2 + c3;
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ * d1 d2 d3
+ * ^ ^ ^
+ * | | |
+ * |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ * p-1
+ * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
+ * n=1
*
- * We can compute this reasonably efficiently by combining:
- * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ * = u y^p + (Step 1)
+ *
+ * p-1
+ * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
+ * n=1
*/
-static u32 __compute_runnable_contrib(u64 n)
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u32 contrib = 0;
+ unsigned long scale_freq, scale_cpu;
+ u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+ u64 periods;
- if (likely(n <= LOAD_AVG_PERIOD))
- return runnable_avg_yN_sum[n];
- else if (unlikely(n >= LOAD_AVG_MAX_N))
- return LOAD_AVG_MAX;
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
- /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
- contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
- n %= LOAD_AVG_PERIOD;
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
-}
+ delta += sa->period_contrib;
+ periods = delta / 1024; /* A period is 1024us (~1ms) */
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+ /*
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
/*
* We can represent the historical contribution to runnable average as the
@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n)
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, scaled_delta, periods;
- u32 contrib;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ u64 delta;
delta = now - sa->last_update_time;
/*
@@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
delta >>= 10;
if (!delta)
return 0;
- sa->last_update_time = now;
-
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
+ sa->last_update_time += delta << 10;
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
- }
- }
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
-
- delta -= delta_w;
-
- /* Figure out how many additional periods this update spans */
- periods = delta / 1024;
- delta %= 1024;
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ return 0;
- sa->load_sum = decay_load(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __compute_runnable_contrib(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
- if (running)
- sa->util_sum += contrib * scale_cpu;
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
- /* Remainder of delta accrued against u_0` */
- scaled_delta = cap_scale(delta, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * scaled_delta;
- }
- if (running)
- sa->util_sum += scaled_delta * scale_cpu;
+ return 1;
+}
- sa->period_contrib += delta;
+static int
+__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+{
+ return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+}
- if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
- }
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
- }
+static int
+__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return ___update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
+}
- return decayed;
+static int
+__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+{
+ return ___update_load_avg(now, cpu, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ cfs_rq->curr != NULL, cfs_rq);
}
/*
@@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next)
{
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
if (!sched_feat(ATTACH_AGE_LOAD))
return;
@@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se,
* time. This will result in the wakee task is less decayed, but giving
* the wakee more load sounds not bad.
*/
- if (se->avg.last_update_time && prev) {
- u64 p_last_update_time;
- u64 n_last_update_time;
+ if (!(se->avg.last_update_time && prev))
+ return;
#ifndef CONFIG_64BIT
+ {
u64 p_last_update_time_copy;
u64 n_last_update_time_copy;
@@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se,
} while (p_last_update_time != p_last_update_time_copy ||
n_last_update_time != n_last_update_time_copy);
+ }
#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
- &se->avg, 0, 0, NULL);
- se->avg.last_update_time = n_last_update_time;
- }
+ __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+ se->avg.last_update_time = n_last_update_time;
}
/* Take into account change of utilization of a child task group */
@@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 1;
}
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+ /*
+ * If the sched_entity still has non-zero load or utilization, we have to
+ * decay it:
+ */
+ if (se->avg.load_avg || se->avg.util_avg)
+ return false;
+
+ /*
+ * If there is a pending propagation, we have to update the load and
+ * the utilization of the sched_entity:
+ */
+ if (gcfs_rq->propagate_avg)
+ return false;
+
+ /*
+ * Otherwise, the load and the utilization of the sched_entity are
+ * already zero and there is no pending propagation, so it would be a
+ * waste of time to try to decay them:
+ */
+ return true;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
set_tg_cfs_propagate(cfs_rq);
}
- decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
+ decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
- __update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ __update_load_avg_se(now, cpu, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
decayed |= propagate_entity_load_avg(se);
@@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
}
/*
@@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
struct rq *rq = rq_of(cfs_rq);
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
unthrottle_cfs_rq(cfs_rq);
next:
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
if (!remaining)
break;
@@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void)
unsigned long curr_jiffies = READ_ONCE(jiffies);
struct rq *this_rq = this_rq();
unsigned long load;
+ struct rq_flags rf;
if (curr_jiffies == this_rq->last_load_update_tick)
return;
load = weighted_cpuload(cpu_of(this_rq));
- raw_spin_lock(&this_rq->lock);
+ rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
- raw_spin_unlock(&this_rq->lock);
+ rq_unlock(this_rq, &rf);
}
#else /* !CONFIG_NO_HZ_COMMON */
static inline void cpu_load_update_nohz(struct rq *this_rq,
@@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
lockdep_assert_held(&env->src_rq->lock);
p->on_rq = TASK_ON_RQ_MIGRATING;
- deactivate_task(env->src_rq, p, 0);
+ deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- activate_task(rq, p, 0);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p)
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
- raw_spin_lock(&rq->lock);
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
attach_task(rq, p);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->tasks;
struct task_struct *p;
+ struct rq_flags rf;
- raw_spin_lock(&env->dst_rq->lock);
+ rq_lock(env->dst_rq, &rf);
+ update_rq_clock(env->dst_rq);
while (!list_empty(tasks)) {
p = list_first_entry(tasks, struct task_struct, se.group_node);
@@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p);
}
- raw_spin_unlock(&env->dst_rq->lock);
+ rq_unlock(env->dst_rq, &rf);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
/*
@@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu)
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct sched_entity *se;
+
/* throttled entities do not contribute to load */
if (throttled_hierarchy(cfs_rq))
continue;
@@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
- /* Propagate pending load changes to the parent */
- if (cfs_rq->tg->se[cpu])
- update_load_avg(cfs_rq->tg->se[cpu], 0);
+ /* Propagate pending load changes to the parent, if any: */
+ se = cfs_rq->tg->se[cpu];
+ if (se && !skip_blocked_update(se))
+ update_load_avg(se, 0);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
/*
@@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
static unsigned long task_h_load(struct task_struct *p)
@@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
+ struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
bool overload = false;
@@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
if (local_group) {
sds->local = sg;
- sgs = &sds->local_stat;
+ sgs = local;
if (env->idle != CPU_NEWLY_IDLE ||
time_after_eq(jiffies, sg->sgc->next_update))
@@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- group_has_capacity(env, &sds->local_stat) &&
- (sgs->sum_nr_running > 1)) {
+ group_has_capacity(env, local) &&
+ (sgs->sum_nr_running > local->sum_nr_running + 1)) {
sgs->group_no_capacity = 1;
sgs->group_type = group_classify(sg, sgs);
}
@@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
- unsigned long flags;
+ struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
@@ -8105,7 +8139,7 @@ redo:
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ rq_lock_irqsave(busiest, &rf);
update_rq_clock(busiest);
/*
@@ -8122,14 +8156,14 @@ more_balance:
* See task_rq_lock() family for the details.
*/
- raw_spin_unlock(&busiest->lock);
+ rq_unlock(busiest, &rf);
if (cur_ld_moved) {
attach_tasks(&env);
ld_moved += cur_ld_moved;
}
- local_irq_restore(flags);
+ local_irq_restore(rf.flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
@@ -8207,6 +8241,8 @@ more_balance:
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
+ unsigned long flags;
+
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data)
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
struct task_struct *p = NULL;
+ struct rq_flags rf;
- raw_spin_lock_irq(&busiest_rq->lock);
+ rq_lock_irq(busiest_rq, &rf);
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data)
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock(&busiest_rq->lock);
+ rq_unlock(busiest_rq, &rf);
if (p)
attach_one_task(target_rq, p);
@@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* do the balance.
*/
if (time_after_eq(jiffies, rq->next_balance)) {
- raw_spin_lock_irq(&rq->lock);
+ struct rq_flags rf;
+
+ rq_lock_irq(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
+
rebalance_domains(rq, CPU_IDLE);
}
@@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
struct rq *rq = this_rq();
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
@@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
- unsigned long flags;
/*
* We can't change the weight of the root cgroup.
@@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
tg->shares = shares;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
- struct sched_entity *se;
+ struct sched_entity *se = tg->se[i];
+ struct rq_flags rf;
- se = tg->se[i];
/* Propagate contribution to hierarchy */
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- /* Possible calls to update_curr() need rq clock */
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
for_each_sched_entity(se) {
update_load_avg(se, UPDATE_TG);
update_cfs_shares(se);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
done:
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1b3c8189b286..11192e0cb122 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/
SCHED_FEAT(SIS_AVG_CPU, false)
+/*
+ * Issue a WARN when we do multiple update_rq_clock() calls
+ * in a single rq->lock section. Default disabled because the
+ * annotations are not complete.
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
+
#ifdef HAVE_RT_PUSH_IPI
/*
* In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9f3e40226dec..979b7341008a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
#define RT_PUSH_IPI_EXECUTING 1
#define RT_PUSH_IPI_RESTART 2
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there are any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * On large CPU boxes, there's the case that several CPUs could schedule
+ * a lower priority task at the same time, in which case it will look for
+ * any overloaded CPUs that it could pull a task from. To do this, the runqueue
+ * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
+ * for a single overloaded CPU's runqueue lock can produce a large latency.
+ * (This has actually been observed on large boxes running cyclictest).
+ * Instead of taking the runqueue lock of the overloaded CPU, each of the
+ * CPUs that scheduled a lower priority task simply sends an IPI to the
+ * overloaded CPU. An IPI is much cheaper than taking a runqueue lock with
+ * lots of contention. The overloaded CPU will look to push its non-running
+ * RT task off, and if it does, it can then ignore the other IPIs coming
+ * in, and just pass those IPIs off to any other overloaded CPU.
+ *
+ * When a CPU schedules a lower priority task, it only sends an IPI to
+ * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
+ * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
+ * RT overloaded tasks, would cause 100 IPIs to go out at once.
+ *
+ * The overloaded RT CPU, when receiving an IPI, will try to push off its
+ * overloaded RT tasks and then send an IPI to the next CPU that has
+ * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
+ * have completed. Just because a CPU may have pushed off its own overloaded
+ * RT task does not mean it should stop sending the IPI around to other
+ * overloaded CPUs. There may be another RT task waiting to run on one of
+ * those CPUs that are of higher priority than the one that was just
+ * pushed.
+ *
+ * An optimization that could possibly be made is to make a CPU array similar
+ * to the cpupri array mask of all running RT tasks, but for the overloaded
+ * case, then the IPI could be sent to only the CPU with the highest priority
+ * RT task waiting, and that CPU could send off further IPIs to the CPU with
+ * the next highest waiting task. Since the overloaded case is much less likely
+ * to happen, the complexity of this implementation may not be worth it.
+ * Instead, just send an IPI around to all overloaded CPUs.
+ *
+ * The rq->rt.push_flags holds the status of the IPI that is going around.
+ * A run queue can only send out a single IPI at a time. The possible flags
+ * for rq->rt.push_flags are:
+ *
+ * (None or zero): No IPI is going around for the current rq
+ * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
+ * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
+ * has changed, and the IPI should restart
+ * circulating the overloaded CPUs again.
+ *
+ * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
+ * before sending to the next CPU.
+ *
+ * Instead of having all CPUs that schedule a lower priority task send
+ * an IPI to the same "first" CPU in the RT overload mask, they send it
+ * to the next overloaded CPU after their own CPU. This helps distribute
+ * the work when there's more than one overloaded CPU and multiple CPUs
+ * scheduling in lower priority tasks.
+ *
+ * When a rq schedules a lower priority task than what was currently
+ * running, the next CPU with overloaded RT tasks is examined first.
+ * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
+ * priority task, it will send an IPI first to CPU 5, then CPU 5 will
+ * send to CPU 1 if it is still overloaded. CPU 1 will clear the
+ * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
+ *
+ * The first CPU to notice IPI_RESTART is set, will clear that flag and then
+ * send an IPI to the next overloaded CPU after the rq->cpu and not the next
+ * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
+ * schedules a lower priority task, and the IPI_RESTART gets set while the
+ * handling is being done on CPU 5, it will clear the flag and send it back to
+ * CPU 4 instead of CPU 1.
+ *
+ * Note, the above logic can be disabled by turning off the sched_feature
+ * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
+ * taken by the CPU requesting a pull and the waiting RT task will be pulled
+ * by that CPU. This may be fine for machines with few CPUs.
+ */
static void tell_cpu_to_push(struct rq *rq)
{
int cpu;
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
new file mode 100644
index 000000000000..cd200d16529e
--- /dev/null
+++ b/kernel/sched/sched-pelt.h
@@ -0,0 +1,13 @@
+/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad8..7808ab050599 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
#define ENQUEUE_MOVE 0x04
+#define ENQUEUE_NOCLOCK 0x08
-#define ENQUEUE_HEAD 0x08
-#define ENQUEUE_REPLENISH 0x10
+#define ENQUEUE_HEAD 0x10
+#define ENQUEUE_REPLENISH 0x20
#ifdef CONFIG_SMP
-#define ENQUEUE_MIGRATED 0x20
+#define ENQUEUE_MIGRATED 0x40
#else
#define ENQUEUE_MIGRATED 0x00
#endif
@@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { }
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
+
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);
@@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irq(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+}
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
@@ -1869,6 +1928,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
+ u64 total;
u64 tick_delta;
u64 irq_start_time;
struct u64_stats_sync sync;
@@ -1876,16 +1936,20 @@ struct irqtime {
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime
+ * subtracted from it and would never move forward.
+ */
static inline u64 irq_time_read(int cpu)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
- u64 *cpustat = kcpustat_cpu(cpu).cpustat;
unsigned int seq;
u64 total;
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
- total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+ total = irqtime->total;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 744fa611cae0..4e09821f9d9e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -309,7 +309,7 @@ restart:
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
WARN_ON_ONCE(in_interrupt());
- tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+ current_restore_flags(old_flags, PF_MEMALLOC);
}
asmlinkage __visible void do_softirq(void)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8c8714fcb53c..21343d110296 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1176,6 +1176,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = timer_migration_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
},
#endif
#ifdef CONFIG_BPF_SYSCALL
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ce3a31e8eb36..5cb5b0008d97 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
*
* Returns the granularity of underlying alarm base clock
*/
-static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
if (!alarmtimer_get_rtcdev())
return -EINVAL;
@@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
*
* Provides the underlying alarm base time.
*/
-static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
+static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- *tp = ktime_to_timespec(base->gettime());
+ *tp = ktime_to_timespec64(base->gettime());
return 0;
}
@@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer)
* Copies out the current itimerspec data
*/
static void alarm_timer_get(struct k_itimer *timr,
- struct itimerspec *cur_setting)
+ struct itimerspec64 *cur_setting)
{
ktime_t relative_expiry_time =
alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
if (ktime_to_ns(relative_expiry_time) > 0) {
- cur_setting->it_value = ktime_to_timespec(relative_expiry_time);
+ cur_setting->it_value = ktime_to_timespec64(relative_expiry_time);
} else {
cur_setting->it_value.tv_sec = 0;
cur_setting->it_value.tv_nsec = 0;
}
- cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
+ cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval);
}
/**
@@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr)
* Sets the timer to new_setting, and starts the timer.
*/
static int alarm_timer_set(struct k_itimer *timr, int flags,
- struct itimerspec *new_setting,
- struct itimerspec *old_setting)
+ struct itimerspec64 *new_setting,
+ struct itimerspec64 *old_setting)
{
ktime_t exp;
@@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
return TIMER_RETRY;
/* start the timer */
- timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
- exp = timespec_to_ktime(new_setting->it_value);
+ timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval);
+ exp = timespec64_to_ktime(new_setting->it_value);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now;
@@ -790,13 +790,14 @@ out:
* Handles clock_nanosleep calls against _ALARM clockids
*/
static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
- struct timespec *tsreq, struct timespec __user *rmtp)
+ struct timespec64 *tsreq,
+ struct timespec __user *rmtp)
{
enum alarmtimer_type type = clock2alarm(which_clock);
+ struct restart_block *restart;
struct alarm alarm;
ktime_t exp;
int ret = 0;
- struct restart_block *restart;
if (!alarmtimer_get_rtcdev())
return -ENOTSUPP;
@@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
- exp = timespec_to_ktime(*tsreq);
+ exp = timespec64_to_ktime(*tsreq);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now = alarm_bases[type].gettime();
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 97ac0951f164..4237e0744e26 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev)
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
-void clockevents_config(struct clock_event_device *dev, u32 freq)
+static void clockevents_config(struct clock_event_device *dev, u32 freq)
{
u64 sec;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ec08f527d7ee..a7560123617c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1368,10 +1368,7 @@ retry:
ktime_to_ns(delta));
}
-/*
- * local version of hrtimer_peek_ahead_timers() called with interrupts
- * disabled.
- */
+/* called with interrupts disabled */
static inline void __hrtimer_peek_ahead_timers(void)
{
struct tick_device *td;
@@ -1506,7 +1503,7 @@ out:
return ret;
}
-long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
const enum hrtimer_mode mode, const clockid_t clockid)
{
struct restart_block *restart;
@@ -1519,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
slack = 0;
hrtimer_init_on_stack(&t.timer, clockid, mode);
- hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
+ hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
if (do_nanosleep(&t, mode))
goto out;
@@ -1550,15 +1547,17 @@ out:
SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
+ struct timespec64 tu64;
struct timespec tu;
if (copy_from_user(&tu, rqtp, sizeof(tu)))
return -EFAULT;
- if (!timespec_valid(&tu))
+ tu64 = timespec_to_timespec64(tu);
+ if (!timespec64_valid(&tu64))
return -EINVAL;
- return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
/*
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 9cff0ab82b63..31d588d37a17 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -297,7 +297,7 @@ out:
return err;
}
-static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+static int pc_clock_gettime(clockid_t id, struct timespec64 *ts)
{
struct posix_clock_desc cd;
int err;
@@ -316,7 +316,7 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts)
return err;
}
-static int pc_clock_getres(clockid_t id, struct timespec *ts)
+static int pc_clock_getres(clockid_t id, struct timespec64 *ts)
{
struct posix_clock_desc cd;
int err;
@@ -335,7 +335,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts)
return err;
}
-static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
{
struct posix_clock_desc cd;
int err;
@@ -399,7 +399,7 @@ static int pc_timer_delete(struct k_itimer *kit)
return err;
}
-static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts)
{
clockid_t id = kit->it_clock;
struct posix_clock_desc cd;
@@ -414,7 +414,7 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
}
static int pc_timer_settime(struct k_itimer *kit, int flags,
- struct itimerspec *ts, struct itimerspec *old)
+ struct itimerspec64 *ts, struct itimerspec64 *old)
{
clockid_t id = kit->it_clock;
struct posix_clock_desc cd;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 4513ad16a253..949e434d3536 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p)
}
static int
-posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
int error = check_clock(which_clock);
if (!error) {
@@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
}
static int
-posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp)
{
/*
* You can never reset a CPU clock, but we check for other errors
@@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
static int posix_cpu_clock_get_task(struct task_struct *tsk,
const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
int err = -EINVAL;
u64 rtn;
@@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
}
if (!err)
- *tp = ns_to_timespec(rtn);
+ *tp = ns_to_timespec64(rtn);
return err;
}
-static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp)
{
const pid_t pid = CPUCLOCK_PID(which_clock);
int err = -EINVAL;
@@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
* and try again. (This happens when the timer is in the middle of firing.)
*/
static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
- struct itimerspec *new, struct itimerspec *old)
+ struct itimerspec64 *new, struct itimerspec64 *old)
{
unsigned long flags;
struct sighand_struct *sighand;
@@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
WARN_ON_ONCE(p == NULL);
- new_expires = timespec_to_ns(&new->it_value);
+ new_expires = timespec64_to_ns(&new->it_value);
/*
* Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
bump_cpu_timer(timer, val);
if (val < timer->it.cpu.expires) {
old_expires = timer->it.cpu.expires - val;
- old->it_value = ns_to_timespec(old_expires);
+ old->it_value = ns_to_timespec64(old_expires);
} else {
old->it_value.tv_nsec = 1;
old->it_value.tv_sec = 0;
@@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* Install the new reload setting, and
* set up the signal and overrun bookkeeping.
*/
- timer->it.cpu.incr = timespec_to_ns(&new->it_interval);
+ timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
/*
* This acts as a modification timestamp for the timer,
@@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
ret = 0;
out:
if (old)
- old->it_interval = ns_to_timespec(old_incr);
+ old->it_interval = ns_to_timespec64(old_incr);
return ret;
}
-static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
{
u64 now;
struct task_struct *p = timer->it.cpu.task;
@@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
/*
* Easy part: convert the reload time.
*/
- itp->it_interval = ns_to_timespec(timer->it.cpu.incr);
+ itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
@@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
* Call the timer disarmed, nothing else to do.
*/
timer->it.cpu.expires = 0;
- itp->it_value = ns_to_timespec(timer->it.cpu.expires);
+ itp->it_value = ns_to_timespec64(timer->it.cpu.expires);
return;
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
}
if (now < timer->it.cpu.expires) {
- itp->it_value = ns_to_timespec(timer->it.cpu.expires - now);
+ itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now);
} else {
/*
* The timer should have expired already, but the firing
@@ -825,6 +825,8 @@ static void check_thread_timers(struct task_struct *tsk,
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
+ pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
@@ -836,8 +838,7 @@ static void check_thread_timers(struct task_struct *tsk,
soft += USEC_PER_SEC;
sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
}
- printk(KERN_INFO
- "RT Watchdog Timeout: %s[%d]\n",
+ pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
tsk->comm, task_pid_nr(tsk));
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
@@ -935,6 +936,8 @@ static void check_process_timers(struct task_struct *tsk,
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
+ pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
@@ -942,6 +945,8 @@ static void check_process_timers(struct task_struct *tsk,
/*
* At the soft limit, send a SIGXCPU every second.
*/
+ pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
if (soft < hard) {
soft++;
@@ -1214,7 +1219,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct itimerspec *it)
+ struct timespec64 *rqtp, struct itimerspec64 *it)
{
struct k_itimer timer;
int error;
@@ -1229,7 +1234,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
error = posix_cpu_timer_create(&timer);
timer.it_process = current;
if (!error) {
- static struct itimerspec zero_it;
+ static struct itimerspec64 zero_it;
memset(it, 0, sizeof *it);
it->it_value = *rqtp;
@@ -1264,7 +1269,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
/*
* We were interrupted by a signal.
*/
- *rqtp = ns_to_timespec(timer.it.cpu.expires);
+ *rqtp = ns_to_timespec64(timer.it.cpu.expires);
error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
if (!error) {
/*
@@ -1301,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct timespec __user *rmtp)
+ struct timespec64 *rqtp, struct timespec __user *rmtp)
{
struct restart_block *restart_block = &current->restart_block;
- struct itimerspec it;
+ struct itimerspec64 it;
+ struct timespec ts;
int error;
/*
@@ -1324,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
/*
* Report back to the user the time still remaining.
*/
- if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ ts = timespec64_to_timespec(it.it_value);
+ if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp)))
return -EFAULT;
restart_block->fn = posix_cpu_nsleep_restart;
restart_block->nanosleep.clockid = which_clock;
restart_block->nanosleep.rmtp = rmtp;
- restart_block->nanosleep.expires = timespec_to_ns(rqtp);
+ restart_block->nanosleep.expires = timespec64_to_ns(rqtp);
}
return error;
}
@@ -1338,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
clockid_t which_clock = restart_block->nanosleep.clockid;
- struct timespec t;
- struct itimerspec it;
+ struct itimerspec64 it;
+ struct timespec64 t;
+ struct timespec tmp;
int error;
- t = ns_to_timespec(restart_block->nanosleep.expires);
+ t = ns_to_timespec64(restart_block->nanosleep.expires);
error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
@@ -1351,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
/*
* Report back to the user the time still remaining.
*/
- if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ tmp = timespec64_to_timespec(it.it_value);
+ if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp)))
return -EFAULT;
- restart_block->nanosleep.expires = timespec_to_ns(&t);
+ restart_block->nanosleep.expires = timespec64_to_ns(&t);
}
return error;
@@ -1364,12 +1373,12 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
static int process_cpu_clock_getres(const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
}
static int process_cpu_clock_get(const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
return posix_cpu_clock_get(PROCESS_CLOCK, tp);
}
@@ -1379,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer)
return posix_cpu_timer_create(timer);
}
static int process_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp,
+ struct timespec64 *rqtp,
struct timespec __user *rmtp)
{
return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
@@ -1389,12 +1398,12 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block)
return -EINVAL;
}
static int thread_cpu_clock_getres(const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
return posix_cpu_clock_getres(THREAD_CLOCK, tp);
}
static int thread_cpu_clock_get(const clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
return posix_cpu_clock_get(THREAD_CLOCK, tp);
}
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index cd6716e115e8..c0cd53eb018a 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -49,26 +49,32 @@ SYS_NI(alarm);
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
+ struct timespec64 new_tp64;
struct timespec new_tp;
if (which_clock != CLOCK_REALTIME)
return -EINVAL;
if (copy_from_user(&new_tp, tp, sizeof (*tp)))
return -EFAULT;
- return do_sys_settimeofday(&new_tp, NULL);
+
+ new_tp64 = timespec_to_timespec64(new_tp);
+ return do_sys_settimeofday64(&new_tp64, NULL);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
+ struct timespec64 kernel_tp64;
struct timespec kernel_tp;
switch (which_clock) {
- case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break;
- case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break;
- case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break;
+ case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
+ case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
+ case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
default: return -EINVAL;
}
+
+ kernel_tp = timespec64_to_timespec(kernel_tp64);
if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
return -EFAULT;
return 0;
@@ -97,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
const struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
+ struct timespec64 t64;
struct timespec t;
switch (which_clock) {
@@ -105,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
case CLOCK_BOOTTIME:
if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
return -EFAULT;
- if (!timespec_valid(&t))
+ t64 = timespec_to_timespec64(t);
+ if (!timespec64_valid(&t64))
return -EINVAL;
- return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ?
+ return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
default:
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 50a6a47020de..4d7b2ce09c27 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -130,12 +130,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
/*
* These ones are defined below.
*/
-static int common_nsleep(const clockid_t, int flags, struct timespec *t,
+static int common_nsleep(const clockid_t, int flags, struct timespec64 *t,
struct timespec __user *rmtp);
static int common_timer_create(struct k_itimer *new_timer);
-static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static void common_timer_get(struct k_itimer *, struct itimerspec64 *);
static int common_timer_set(struct k_itimer *, int,
- struct itimerspec *, struct itimerspec *);
+ struct itimerspec64 *, struct itimerspec64 *);
static int common_timer_del(struct k_itimer *timer);
static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
@@ -204,17 +204,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
}
/* Get clock_realtime */
-static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
+static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
{
- ktime_get_real_ts(tp);
+ ktime_get_real_ts64(tp);
return 0;
}
/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
- const struct timespec *tp)
+ const struct timespec64 *tp)
{
- return do_sys_settimeofday(tp, NULL);
+ return do_sys_settimeofday64(tp, NULL);
}
static int posix_clock_realtime_adj(const clockid_t which_clock,
@@ -226,54 +226,54 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
/*
* Get monotonic time for posix timers
*/
-static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
+static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
{
- ktime_get_ts(tp);
+ ktime_get_ts64(tp);
return 0;
}
/*
* Get monotonic-raw time for posix timers
*/
-static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
- getrawmonotonic(tp);
+ getrawmonotonic64(tp);
return 0;
}
-static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
{
- *tp = current_kernel_time();
+ *tp = current_kernel_time64();
return 0;
}
static int posix_get_monotonic_coarse(clockid_t which_clock,
- struct timespec *tp)
+ struct timespec64 *tp)
{
- *tp = get_monotonic_coarse();
+ *tp = get_monotonic_coarse64();
return 0;
}
-static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp)
{
- *tp = ktime_to_timespec(KTIME_LOW_RES);
+ *tp = ktime_to_timespec64(KTIME_LOW_RES);
return 0;
}
-static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
{
- get_monotonic_boottime(tp);
+ get_monotonic_boottime64(tp);
return 0;
}
-static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
+static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
{
- timekeeping_clocktai(tp);
+ timekeeping_clocktai64(tp);
return 0;
}
-static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
{
tp->tv_sec = 0;
tp->tv_nsec = hrtimer_resolution;
@@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
* report.
*/
static void
-common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
ktime_t now, remaining, iv;
struct hrtimer *timer = &timr->it.real.timer;
- memset(cur_setting, 0, sizeof(struct itimerspec));
+ memset(cur_setting, 0, sizeof(*cur_setting));
iv = timr->it.real.interval;
/* interval timer ? */
if (iv)
- cur_setting->it_interval = ktime_to_timespec(iv);
+ cur_setting->it_interval = ktime_to_timespec64(iv);
else if (!hrtimer_active(timer) &&
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
return;
@@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
cur_setting->it_value.tv_nsec = 1;
} else
- cur_setting->it_value = ktime_to_timespec(remaining);
+ cur_setting->it_value = ktime_to_timespec64(remaining);
}
/* Get the time remaining on a POSIX.1b interval timer. */
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct itimerspec __user *, setting)
{
+ struct itimerspec64 cur_setting64;
struct itimerspec cur_setting;
struct k_itimer *timr;
struct k_clock *kc;
@@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
if (WARN_ON_ONCE(!kc || !kc->timer_get))
ret = -EINVAL;
else
- kc->timer_get(timr, &cur_setting);
+ kc->timer_get(timr, &cur_setting64);
unlock_timer(timr, flags);
+ cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
return -EFAULT;
@@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
/* timr->it_lock is taken. */
static int
common_timer_set(struct k_itimer *timr, int flags,
- struct itimerspec *new_setting, struct itimerspec *old_setting)
+ struct itimerspec64 *new_setting, struct itimerspec64 *old_setting)
{
struct hrtimer *timer = &timr->it.real.timer;
enum hrtimer_mode mode;
@@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags,
hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
timr->it.real.timer.function = posix_timer_fn;
- hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
+ hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value));
/* Convert interval */
- timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
+ timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval);
/* SIGEV_NONE timers are not queued ! See common_timer_get */
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
@@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct itimerspec __user *, new_setting,
struct itimerspec __user *, old_setting)
{
- struct k_itimer *timr;
+ struct itimerspec64 new_spec64, old_spec64;
+ struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
struct itimerspec new_spec, old_spec;
- int error = 0;
+ struct k_itimer *timr;
unsigned long flag;
- struct itimerspec *rtn = old_setting ? &old_spec : NULL;
struct k_clock *kc;
+ int error = 0;
if (!new_setting)
return -EINVAL;
if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
return -EFAULT;
+ new_spec64 = itimerspec_to_itimerspec64(&new_spec);
- if (!timespec_valid(&new_spec.it_interval) ||
- !timespec_valid(&new_spec.it_value))
+ if (!timespec64_valid(&new_spec64.it_interval) ||
+ !timespec64_valid(&new_spec64.it_value))
return -EINVAL;
retry:
timr = lock_timer(timer_id, &flag);
@@ -908,7 +912,7 @@ retry:
if (WARN_ON_ONCE(!kc || !kc->timer_set))
error = -EINVAL;
else
- error = kc->timer_set(timr, flags, &new_spec, rtn);
+ error = kc->timer_set(timr, flags, &new_spec64, rtn);
unlock_timer(timr, flag);
if (error == TIMER_RETRY) {
@@ -916,6 +920,7 @@ retry:
goto retry;
}
+ old_spec = itimerspec64_to_itimerspec(&old_spec64);
if (old_setting && !error &&
copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
error = -EFAULT;
@@ -1014,6 +1019,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec64 new_tp64;
struct timespec new_tp;
if (!kc || !kc->clock_set)
@@ -1021,21 +1027,24 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
if (copy_from_user(&new_tp, tp, sizeof (*tp)))
return -EFAULT;
+ new_tp64 = timespec_to_timespec64(new_tp);
- return kc->clock_set(which_clock, &new_tp);
+ return kc->clock_set(which_clock, &new_tp64);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec64 kernel_tp64;
struct timespec kernel_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp);
+ error = kc->clock_get(which_clock, &kernel_tp64);
+ kernel_tp = timespec64_to_timespec(kernel_tp64);
if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
error = -EFAULT;
@@ -1070,13 +1079,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct timespec __user *, tp)
{
struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec64 rtn_tp64;
struct timespec rtn_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_getres(which_clock, &rtn_tp);
+ error = kc->clock_getres(which_clock, &rtn_tp64);
+ rtn_tp = timespec64_to_timespec(rtn_tp64);
if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
error = -EFAULT;
@@ -1088,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
* nanosleep for monotonic and realtime clocks
*/
static int common_nsleep(const clockid_t which_clock, int flags,
- struct timespec *tsave, struct timespec __user *rmtp)
+ struct timespec64 *tsave, struct timespec __user *rmtp)
{
return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
@@ -1100,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
struct timespec __user *, rmtp)
{
struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec64 t64;
struct timespec t;
if (!kc)
@@ -1110,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
return -EFAULT;
- if (!timespec_valid(&t))
+ t64 = timespec_to_timespec64(t);
+ if (!timespec64_valid(&t64))
return -EINVAL;
- return kc->nsleep(which_clock, flags, &t, rmtp);
+ return kc->nsleep(which_clock, flags, &t64, rmtp);
}
/*
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index ea6b610c4c57..2d8f05aad442 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -206,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
update_clock_read_data(&rd);
+ if (sched_clock_timer.function != NULL) {
+ /* update timeout for clock wrap */
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ }
+
r = rate;
if (r >= 4000000) {
r /= 1000000;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7fe53be86077..64c97fc130c4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
+/**
+ * tick_nohz_get_idle_calls - return the current idle calls counter value
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->idle_calls;
+}
+
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 25bdd2504571..6574bba44b55 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
struct timezone __user *, tz)
{
+ struct timespec64 new_ts;
struct timeval user_tv;
- struct timespec new_ts;
struct timezone new_tz;
if (tv) {
@@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
return -EFAULT;
}
- return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+ return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5b63a2102c29..9652bc57fd09 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -996,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
return 0;
/* Interpolate shortest distance from beginning or end of history */
- interp_forward = partial_history_cycles > total_history_cycles/2 ?
- true : false;
+ interp_forward = partial_history_cycles > total_history_cycles / 2;
partial_history_cycles = interp_forward ?
total_history_cycles - partial_history_cycles :
partial_history_cycles;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1dc0256bfb6e..cc6b6bdd1329 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -241,7 +241,7 @@ int timer_migration_handler(struct ctl_table *table, int write,
int ret;
mutex_lock(&mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
timers_update_migration(false);
mutex_unlock(&mutex);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ff8d5c13d04b..0e7f5428a148 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,6 +16,7 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
+#include <linux/nmi.h>
#include <linux/uaccess.h>
@@ -86,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
next_one:
i = 0;
+
+ touch_nmi_watchdog();
+
raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
curr = timerqueue_getnext(&base->active);
@@ -197,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
struct clock_event_device *dev = td->evtdev;
+ touch_nmi_watchdog();
+
SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
if (cpu < 0)
SEQ_printf(m, "Broadcast device\n");
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4a06e714645..9619b5768e4b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -455,7 +455,7 @@ config UPROBE_EVENTS
select UPROBES
select PROBE_EVENTS
select TRACING
- default n
+ default y
help
This allows the user to add tracing events on top of userspace
dynamic events (similar to tracepoints) on the fly via the trace
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2058a7f94bd..bd8ae8d5ae9c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q)
/**
* blk_add_trace_rq - Add a trace for a request oriented action
- * @q: queue the io is for
* @rq: the source request
+ * @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
*
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q)
* Records an action against a request. Will log the bio offset + size.
*
**/
-static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+static void blk_add_trace_rq(struct request *rq, int error,
unsigned int nr_bytes, u32 what)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt = rq->q->blk_trace;
if (likely(!bt))
return;
@@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, rq->errors, 0, NULL);
-}
-
-static void blk_add_trace_rq_abort(void *ignore,
- struct request_queue *q, struct request *rq)
-{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
+ rq->cmd_flags, what, error, 0, NULL);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
}
-static void blk_add_trace_rq_complete(void *ignore,
- struct request_queue *q,
- struct request *rq,
- unsigned int nr_bytes)
+static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
+ int error, unsigned int nr_bytes)
{
- blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
+ blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
}
/**
@@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
+ rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
sizeof(r), &r);
}
@@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q,
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
+ BLK_TA_DRV_DATA, 0, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -974,8 +966,6 @@ static void blk_register_tracepoints(void)
{
int ret;
- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
- WARN_ON(ret);
ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
- unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
tracepoint_synchronize_unregister();
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8a4efac28710..460a031c77e5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+ if (unlikely(uaccess_kernel()))
return -EPERM;
if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
return -EPERM;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0ad75e9698f6..0029fe62b245 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4355,6 +4355,7 @@ static const char readme_msg[] =
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+ "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
"\t place: <path>:<offset>\n"
@@ -5529,7 +5530,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
.nr_pages_max = PIPE_DEF_BUFFERS,
- .flags = flags,
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
@@ -6427,7 +6427,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.pages = pages_def,
.partial = partial_def,
.nr_pages_max = PIPE_DEF_BUFFERS,
- .flags = flags,
.ops = &buffer_pipe_buf_ops,
.spd_release = buffer_spd_release,
};
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5f688cc724f0..013f4e7146d4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -681,10 +681,6 @@ static int create_trace_kprobe(int argc, char **argv)
return -EINVAL;
}
if (isdigit(argv[1][0])) {
- if (is_return) {
- pr_info("Return probe point must be a symbol.\n");
- return -EINVAL;
- }
/* an address specified */
ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
if (ret) {
@@ -700,8 +696,9 @@ static int create_trace_kprobe(int argc, char **argv)
pr_info("Failed to parse symbol.\n");
return ret;
}
- if (offset && is_return) {
- pr_info("Return probe must be used without offset.\n");
+ if (offset && is_return &&
+ !function_offset_within_entry(NULL, symbol, offset)) {
+ pr_info("Given offset is not valid for return probe.\n");
return -EINVAL;
}
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c0168b7da1ea..c74bf39ef764 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool)
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
- init_timer_deferrable(&pool->idle_timer);
- pool->idle_timer.function = idle_worker_timeout;
- pool->idle_timer.data = (unsigned long)pool;
+ setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
+ (unsigned long)pool);
setup_timer(&pool->mayday_timer, pool_mayday_timeout,
(unsigned long)pool);
@@ -4735,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
+
+/**
+ * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function argument
+ *
+ * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
+ * any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns, or -ENODEV if @cpu is not online.
+ */
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+ long ret = -ENODEV;
+
+ get_online_cpus();
+ if (cpu_online(cpu))
+ ret = work_on_cpu(cpu, fn, arg);
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu_safe);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER