summary refs log tree commit diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/audit.c 2
-rw-r--r-- kernel/cgroup.c 26
-rw-r--r-- kernel/cpu.c 105
-rw-r--r-- kernel/cpuset.c 20
-rw-r--r-- kernel/cred.c 85
-rw-r--r-- kernel/debug/debug_core.c 2
-rw-r--r-- kernel/debug/gdbstub.c 9
-rw-r--r-- kernel/debug/kdb/kdb_main.c 19
-rw-r--r-- kernel/early_res.c 6
-rw-r--r-- kernel/exec_domain.c 18
-rw-r--r-- kernel/exit.c 40
-rw-r--r-- kernel/fork.c 51
-rw-r--r-- kernel/futex.c 17
-rw-r--r-- kernel/hrtimer.c 2
-rw-r--r-- kernel/irq/manage.c 3
-rw-r--r-- kernel/kexec.c 7
-rw-r--r-- kernel/kmod.c 193
-rw-r--r-- kernel/module.c 352
-rw-r--r-- kernel/mutex.c 7
-rw-r--r-- kernel/padata.c 4
-rw-r--r-- kernel/panic.c 1
-rw-r--r-- kernel/perf_event.c 753
-rw-r--r-- kernel/pid.c 7
-rw-r--r-- kernel/pm_qos_params.c 215
-rw-r--r-- kernel/posix-cpu-timers.c 12
-rw-r--r-- kernel/posix-timers.c 11
-rw-r--r-- kernel/power/Kconfig 9
-rw-r--r-- kernel/power/Makefile 2
-rw-r--r-- kernel/power/hibernate.c 24
-rw-r--r-- kernel/power/main.c 55
-rw-r--r-- kernel/power/nvs.c (renamed from kernel/power/hibernate_nvs.c) 24
-rw-r--r-- kernel/power/suspend.c 19
-rw-r--r-- kernel/power/swap.c 4
-rw-r--r-- kernel/profile.c 8
-rw-r--r-- kernel/ptrace.c 26
-rw-r--r-- kernel/relay.c 2
-rw-r--r-- kernel/sched.c 186
-rw-r--r-- kernel/sched_debug.c 10
-rw-r--r-- kernel/sched_fair.c 24
-rw-r--r-- kernel/signal.c 32
-rw-r--r-- kernel/smp.c 2
-rw-r--r-- kernel/softirq.c 4
-rw-r--r-- kernel/stop_machine.c 2
-rw-r--r-- kernel/sys.c 6
-rw-r--r-- kernel/sysctl.c 8
-rw-r--r-- kernel/time/tick-sched.c 21
-rw-r--r-- kernel/timer.c 32
-rw-r--r-- kernel/trace/blktrace.c 140
-rw-r--r-- kernel/trace/ftrace.c 7
-rw-r--r-- kernel/trace/kmemtrace.c 70
-rw-r--r-- kernel/trace/ring_buffer.c 19
-rw-r--r-- kernel/trace/trace.c 15
-rw-r--r-- kernel/trace/trace.h 9
-rw-r--r-- kernel/trace/trace_branch.c 8
-rw-r--r-- kernel/trace/trace_event_perf.c 192
-rw-r--r-- kernel/trace/trace_events.c 139
-rw-r--r-- kernel/trace/trace_events_filter.c 28
-rw-r--r-- kernel/trace/trace_export.c 16
-rw-r--r-- kernel/trace/trace_functions_graph.c 13
-rw-r--r-- kernel/trace/trace_kprobe.c 113
-rw-r--r-- kernel/trace/trace_output.c 137
-rw-r--r-- kernel/trace/trace_output.h 2
-rw-r--r-- kernel/trace/trace_sched_switch.c 20
-rw-r--r-- kernel/trace/trace_sched_wakeup.c 28
-rw-r--r-- kernel/trace/trace_syscalls.c 146
-rw-r--r-- kernel/trace/trace_workqueue.c 26
-rw-r--r-- kernel/tracepoint.c 91
-rw-r--r-- kernel/user_namespace.c 44
-rw-r--r-- kernel/workqueue.c 9
69 files changed, 2191 insertions, 1548 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..8296aa516c5a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -407,7 +407,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
audit_hold_skb(skb);
} else
/* drop the extra reference if sent ok */
- kfree_skb(skb);
+ consume_skb(skb);
}
static int kauditd_thread(void *dummy)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 291775021b2e..a8ce09954404 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1788,6 +1788,29 @@ out:
return retval;
}
+/**
+ * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_current_cg(struct task_struct *tsk)
+{
+ struct cgroupfs_root *root;
+ struct cgroup *cur_cg;
+ int retval = 0;
+
+ cgroup_lock();
+ for_each_active_root(root) {
+ cur_cg = task_cgroup_from_root(current, root);
+ retval = cgroup_attach_task(cur_cg, tsk);
+ if (retval)
+ break;
+ }
+ cgroup_unlock();
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
+
/*
* Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
* held. May take task_lock of task
@@ -2994,7 +3017,6 @@ static void cgroup_event_remove(struct work_struct *work)
remove);
struct cgroup *cgrp = event->cgrp;
- /* TODO: check return code */
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
eventfd_ctx_put(event->eventfd);
@@ -4599,7 +4621,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
parent_css = parent->subsys[subsys_id];
child_css = child->subsys[subsys_id];
parent_id = parent_css->id;
- depth = parent_id->depth;
+ depth = parent_id->depth + 1;
child_id = get_new_cssid(ss, depth);
if (IS_ERR(child_id))
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 124ad9d6be16..97d1b426a4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,13 +20,29 @@
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
-static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
+/*
+ * The following two API's must be used when attempting
+ * to serialize the updates to cpu_online_mask, cpu_present_mask.
+ */
+void cpu_maps_update_begin(void)
+{
+ mutex_lock(&cpu_add_remove_lock);
+}
+
+void cpu_maps_update_done(void)
+{
+ mutex_unlock(&cpu_add_remove_lock);
+}
+
+static RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
+#ifdef CONFIG_HOTPLUG_CPU
+
static struct {
struct task_struct *active_writer;
struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -41,8 +57,6 @@ static struct {
.refcount = 0,
};
-#ifdef CONFIG_HOTPLUG_CPU
-
void get_online_cpus(void)
{
might_sleep();
@@ -67,22 +81,6 @@ void put_online_cpus(void)
}
EXPORT_SYMBOL_GPL(put_online_cpus);
-#endif /* CONFIG_HOTPLUG_CPU */
-
-/*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
- */
-void cpu_maps_update_begin(void)
-{
- mutex_lock(&cpu_add_remove_lock);
-}
-
-void cpu_maps_update_done(void)
-{
- mutex_unlock(&cpu_add_remove_lock);
-}
-
/*
* This ensures that the hotplug operation can begin only when the
* refcount goes to zero.
@@ -124,6 +122,12 @@ static void cpu_hotplug_done(void)
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
}
+
+#else /* #if CONFIG_HOTPLUG_CPU */
+static void cpu_hotplug_begin(void) {}
+static void cpu_hotplug_done(void) {}
+#endif /* #else #if CONFIG_HOTPLUG_CPU */
+
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
@@ -134,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
+static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+ int *nr_calls)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+ nr_calls);
+
+ return notifier_to_errno(ret);
+}
+
+static int cpu_notify(unsigned long val, void *v)
+{
+ return __cpu_notify(val, v, -1, NULL);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
+static void cpu_notify_nofail(unsigned long val, void *v)
+{
+ BUG_ON(cpu_notify(val, v));
+}
+
EXPORT_SYMBOL(register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -181,8 +206,7 @@ static int __ref take_cpu_down(void *_param)
if (err < 0)
return err;
- raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
- param->hcpu);
+ cpu_notify(CPU_DYING | param->mod, param->hcpu);
if (task_cpu(param->caller) == cpu)
move_task_off_dead_cpu(cpu, param->caller);
@@ -212,17 +236,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
cpu_hotplug_begin();
set_cpu_active(cpu, false);
- err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
- hcpu, -1, &nr_calls);
- if (err == NOTIFY_BAD) {
+ err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (err) {
set_cpu_active(cpu, true);
nr_calls--;
- __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
- hcpu, nr_calls, NULL);
+ __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
printk("%s: attempt to take down CPU %u failed\n",
__func__, cpu);
- err = -EINVAL;
goto out_release;
}
@@ -230,9 +251,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
if (err) {
set_cpu_active(cpu, true);
/* CPU didn't die: tell everyone. Can't complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
+ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
goto out_release;
}
@@ -246,19 +265,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
+ cpu_notify_nofail(CPU_DEAD | mod, hcpu);
check_for_tasks(cpu);
out_release:
cpu_hotplug_done();
- if (!err) {
- if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
- hcpu) == NOTIFY_BAD)
- BUG();
- }
+ if (!err)
+ cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
return err;
}
@@ -293,13 +307,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
return -EINVAL;
cpu_hotplug_begin();
- ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
- -1, &nr_calls);
- if (ret == NOTIFY_BAD) {
+ ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (ret) {
nr_calls--;
printk("%s: attempt to bring up CPU %u failed\n",
__func__, cpu);
- ret = -EINVAL;
goto out_notify;
}
@@ -312,12 +324,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
set_cpu_active(cpu, true);
/* Now call notifier in preparation. */
- raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
+ cpu_notify(CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
- __raw_notifier_call_chain(&cpu_chain,
- CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
cpu_hotplug_done();
return ret;
@@ -383,7 +394,7 @@ static cpumask_var_t frozen_cpus;
int disable_nonboot_cpus(void)
{
- int cpu, first_cpu, error;
+ int cpu, first_cpu, error = 0;
cpu_maps_update_begin();
first_cpu = cpumask_first(cpu_online_mask);
@@ -481,7 +492,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
- raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+ cpu_notify(val, (void *)(long)cpu);
}
#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 61d6af7fa676..02b9611eadde 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2469,7 +2469,8 @@ void cpuset_unlock(void)
}
/**
- * cpuset_mem_spread_node() - On which node to begin search for a page
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2494,16 +2495,27 @@ void cpuset_unlock(void)
* See kmem_cache_alloc_node().
*/
-int cpuset_mem_spread_node(void)
+static int cpuset_spread_node(int *rotor)
{
int node;
- node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+ node = next_node(*rotor, current->mems_allowed);
if (node == MAX_NUMNODES)
node = first_node(current->mems_allowed);
- current->cpuset_mem_spread_rotor = node;
+ *rotor = node;
return node;
}
+
+int cpuset_mem_spread_node(void)
+{
+ return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+ return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
/**
diff --git a/kernel/cred.c b/kernel/cred.c
index 2c24870c55d1..60bc8b1e32e6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk)
}
}
+/**
+ * get_task_cred - Get another task's objective credentials
+ * @task: The task to query
+ *
+ * Get the objective credentials of a task, pinning them so that they can't go
+ * away. Accessing a task's credentials directly is not permitted.
+ *
+ * The caller must also make sure task doesn't get deleted, either by holding a
+ * ref on task or by holding tasklist_lock to prevent it from being unlinked.
+ */
+const struct cred *get_task_cred(struct task_struct *task)
+{
+ const struct cred *cred;
+
+ rcu_read_lock();
+
+ do {
+ cred = __task_cred((task));
+ BUG_ON(!cred);
+ } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+
+ rcu_read_unlock();
+ return cred;
+}
+
/*
* Allocate blank credentials, such that the credentials can be filled in at a
* later date without risk of ENOMEM.
@@ -347,66 +372,6 @@ struct cred *prepare_exec_creds(void)
}
/*
- * prepare new credentials for the usermode helper dispatcher
- */
-struct cred *prepare_usermodehelper_creds(void)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = NULL;
-#endif
- struct cred *new;
-
-#ifdef CONFIG_KEYS
- tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
- if (!tgcred)
- return NULL;
-#endif
-
- new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
- if (!new)
- goto free_tgcred;
-
- kdebug("prepare_usermodehelper_creds() alloc %p", new);
-
- memcpy(new, &init_cred, sizeof(struct cred));
-
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
- get_group_info(new->group_info);
- get_uid(new->user);
-
-#ifdef CONFIG_KEYS
- new->thread_keyring = NULL;
- new->request_key_auth = NULL;
- new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- new->tgcred = tgcred;
-#endif
-
-#ifdef CONFIG_SECURITY
- new->security = NULL;
-#endif
- if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
- goto error;
- validate_creds(new);
-
- BUG_ON(atomic_read(&new->usage) != 1);
- return new;
-
-error:
- put_cred(new);
- return NULL;
-
-free_tgcred:
-#ifdef CONFIG_KEYS
- kfree(tgcred);
-#endif
- return NULL;
-}
-
-/*
* Copy credentials for the new process created by fork()
*
* We share if we can, but under some circumstances we have to generate a new
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 5cb7cd1de10c..8bc5eeffec8a 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -605,13 +605,13 @@ cpu_master_loop:
if (dbg_kdb_mode) {
kgdb_connected = 1;
error = kdb_stub(ks);
+ kgdb_connected = 0;
} else {
error = gdb_serial_stub(ks);
}
if (error == DBG_PASS_EVENT) {
dbg_kdb_mode = !dbg_kdb_mode;
- kgdb_connected = 0;
} else if (error == DBG_SWITCH_CPU_EVENT) {
dbg_cpu_switch(cpu, dbg_switch_cpu);
goto cpu_loop;
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 4b17b3269525..e8fd6868682d 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -621,10 +621,8 @@ static void gdb_cmd_query(struct kgdb_state *ks)
switch (remcom_in_buffer[1]) {
case 's':
case 'f':
- if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
- error_packet(remcom_out_buffer, -EINVAL);
+ if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
break;
- }
i = 0;
remcom_out_buffer[0] = 'm';
@@ -665,10 +663,9 @@ static void gdb_cmd_query(struct kgdb_state *ks)
pack_threadid(remcom_out_buffer + 2, thref);
break;
case 'T':
- if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
- error_packet(remcom_out_buffer, -EINVAL);
+ if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
break;
- }
+
ks->threadid = 0;
ptr = remcom_in_buffer + 17;
kgdb_hex2long(&ptr, &ks->threadid);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index b724c791b6d4..ebe4a287419e 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1820,9 +1820,8 @@ static int kdb_sr(int argc, const char **argv)
{
if (argc != 1)
return KDB_ARGCOUNT;
- sysrq_toggle_support(1);
kdb_trap_printk++;
- handle_sysrq(*argv[1], NULL);
+ __handle_sysrq(*argv[1], NULL, 0);
kdb_trap_printk--;
return 0;
@@ -1857,12 +1856,6 @@ static int kdb_ef(int argc, const char **argv)
}
#if defined(CONFIG_MODULES)
-/* modules using other modules */
-struct module_use {
- struct list_head list;
- struct module *module_which_uses;
-};
-
/*
* kdb_lsmod - This function implements the 'lsmod' command. Lists
* currently loaded kernel modules.
@@ -1889,14 +1882,15 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf(" (Loading)");
else
kdb_printf(" (Live)");
+ kdb_printf(" 0x%p", mod->module_core);
#ifdef CONFIG_MODULE_UNLOAD
{
struct module_use *use;
kdb_printf(" [ ");
- list_for_each_entry(use, &mod->modules_which_use_me,
- list)
- kdb_printf("%s ", use->module_which_uses->name);
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
kdb_printf("]\n");
}
#endif
@@ -2297,6 +2291,9 @@ static int kdb_ll(int argc, const char **argv)
while (va) {
char buf[80];
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+
sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
diag = kdb_parse(buf);
if (diag)
diff --git a/kernel/early_res.c b/kernel/early_res.c
index 31aa9332ef3f..7bfae887f211 100644
--- a/kernel/early_res.c
+++ b/kernel/early_res.c
@@ -7,6 +7,8 @@
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <linux/early_res.h>
+#include <linux/slab.h>
+#include <linux/kmemleak.h>
/*
* Early reserved memory areas.
@@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end)
struct early_res *r;
int i;
+ kmemleak_free_part(__va(start), end - start);
+
i = find_overlapped_early(start, end);
r = &early_res[i];
if (i >= max_early_res || r->end != end || r->start != start)
@@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end)
struct early_res *r;
int i;
+ kmemleak_free_part(__va(start), end - start);
+
if (start == end)
return;
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c35452cadded..dd62f8e714ca 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain;
static DEFINE_RWLOCK(exec_domains_lock);
-static u_long ident_map[32] = {
+static unsigned long ident_map[32] = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
@@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp)
}
static struct exec_domain *
-lookup_exec_domain(u_long personality)
+lookup_exec_domain(unsigned int personality)
{
- struct exec_domain * ep;
- u_long pers = personality(personality);
+ unsigned int pers = personality(personality);
+ struct exec_domain *ep;
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
@@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality)
#ifdef CONFIG_MODULES
read_unlock(&exec_domains_lock);
- request_module("personality-%ld", pers);
+ request_module("personality-%d", pers);
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
@@ -135,7 +135,7 @@ unregister:
}
int
-__set_personality(u_long personality)
+__set_personality(unsigned int personality)
{
struct exec_domain *ep, *oep;
@@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void)
module_init(proc_execdomains_init);
#endif
-SYSCALL_DEFINE1(personality, u_long, personality)
+SYSCALL_DEFINE1(personality, unsigned int, personality)
{
- u_long old = current->personality;
+ unsigned int old = current->personality;
if (personality != 0xffffffff) {
set_personality(personality);
@@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality)
return -EINVAL;
}
- return (long)old;
+ return old;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 019a2843bf95..ceffc67b564a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -58,11 +58,11 @@
static void exit_mm(struct task_struct * tsk);
-static void __unhash_process(struct task_struct *p)
+static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
detach_pid(p, PIDTYPE_PID);
- if (thread_group_leader(p)) {
+ if (group_dead) {
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
@@ -79,10 +79,9 @@ static void __unhash_process(struct task_struct *p)
static void __exit_signal(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
+ bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
-
- BUG_ON(!sig);
- BUG_ON(!atomic_read(&sig->count));
+ struct tty_struct *uninitialized_var(tty);
sighand = rcu_dereference_check(tsk->sighand,
rcu_read_lock_held() ||
@@ -90,14 +89,16 @@ static void __exit_signal(struct task_struct *tsk)
spin_lock(&sighand->siglock);
posix_cpu_timers_exit(tsk);
- if (atomic_dec_and_test(&sig->count))
+ if (group_dead) {
posix_cpu_timers_exit_group(tsk);
- else {
+ tty = sig->tty;
+ sig->tty = NULL;
+ } else {
/*
* If there is any task waiting for the group exit
* then notify it:
*/
- if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count)
+ if (sig->notify_count > 0 && !--sig->notify_count)
wake_up_process(sig->group_exit_task);
if (tsk == sig->curr_target)
@@ -123,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk)
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
- sig = NULL; /* Marker for below. */
}
- __unhash_process(tsk);
+ sig->nr_threads--;
+ __unhash_process(tsk, group_dead);
/*
* Do this under ->siglock, we can race with another thread
* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
*/
flush_sigqueue(&tsk->pending);
-
- tsk->signal = NULL;
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
- if (sig) {
+ if (group_dead) {
flush_sigqueue(&sig->shared_pending);
- taskstats_tgid_free(sig);
- /*
- * Make sure ->signal can't go away under rq->lock,
- * see account_group_exec_runtime().
- */
- task_rq_unlock_wait(tsk);
- __cleanup_signal(sig);
+ tty_kref_put(tty);
}
}
@@ -856,12 +849,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
- /* mt-exec, de_thread() is waiting for us */
- if (thread_group_leader(tsk) &&
- tsk->signal->group_exit_task &&
- tsk->signal->notify_count < 0)
+ /* mt-exec, de_thread() is waiting for group leader */
+ if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
-
write_unlock_irq(&tasklist_lock);
tracehook_report_death(tsk, signal, cookie, group_dead);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d57d9e3a6e9..b6cce14ba047 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,6 +165,18 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
+static inline void free_signal_struct(struct signal_struct *sig)
+{
+ taskstats_tgid_free(sig);
+ kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void put_signal_struct(struct signal_struct *sig)
+{
+ if (atomic_dec_and_test(&sig->sigcnt))
+ free_signal_struct(sig);
+}
+
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
@@ -173,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk)
exit_creds(tsk);
delayacct_tsk_free(tsk);
+ put_signal_struct(tsk->signal);
if (!profile_handoff_task(tsk))
free_task(tsk);
@@ -864,8 +877,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
if (!sig)
return -ENOMEM;
- atomic_set(&sig->count, 1);
+ sig->nr_threads = 1;
atomic_set(&sig->live, 1);
+ atomic_set(&sig->sigcnt, 1);
init_waitqueue_head(&sig->wait_chldexit);
if (clone_flags & CLONE_NEWPID)
sig->flags |= SIGNAL_UNKILLABLE;
@@ -889,13 +903,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-void __cleanup_signal(struct signal_struct *sig)
-{
- thread_group_cputime_free(sig);
- tty_kref_put(sig->tty);
- kmem_cache_free(signal_cachep, sig);
-}
-
static void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
unsigned long new_flags = p->flags;
@@ -1245,8 +1252,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
if (clone_flags & CLONE_THREAD) {
- atomic_inc(&current->signal->count);
+ current->signal->nr_threads++;
atomic_inc(&current->signal->live);
+ atomic_inc(&current->signal->sigcnt);
p->group_leader = current->group_leader;
list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
}
@@ -1259,7 +1267,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->nsproxy->pid_ns->child_reaper = p;
p->signal->leader_pid = pid;
- tty_kref_put(p->signal->tty);
p->signal->tty = tty_kref_get(current->signal->tty);
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));
@@ -1292,7 +1299,7 @@ bad_fork_cleanup_mm:
mmput(p->mm);
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
- __cleanup_signal(p->signal);
+ free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
@@ -1327,6 +1334,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re
return regs;
}
+static inline void init_idle_pids(struct pid_link *links)
+{
+ enum pid_type type;
+
+ for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+ INIT_HLIST_NODE(&links[type].node); /* not really needed */
+ links[type].pid = &init_struct_pid;
+ }
+}
+
struct task_struct * __cpuinit fork_idle(int cpu)
{
struct task_struct *task;
@@ -1334,8 +1351,10 @@ struct task_struct * __cpuinit fork_idle(int cpu)
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
&init_struct_pid, 0);
- if (!IS_ERR(task))
+ if (!IS_ERR(task)) {
+ init_idle_pids(task->pids);
init_idle(task, cpu);
+ }
return task;
}
@@ -1507,14 +1526,6 @@ static void check_unshare_flags(unsigned long *flags_ptr)
*flags_ptr |= CLONE_SIGHAND;
/*
- * If unsharing signal handlers and the task was created
- * using CLONE_THREAD, then must unshare the thread
- */
- if ((*flags_ptr & CLONE_SIGHAND) &&
- (atomic_read(&current->signal->count) > 1))
- *flags_ptr |= CLONE_THREAD;
-
- /*
* If unsharing namespace, must also unshare filesystem information.
*/
if (*flags_ptr & CLONE_NEWNS)
diff --git a/kernel/futex.c b/kernel/futex.c
index e7a35f1039e7..6a3a5fa1526d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
static struct task_struct * futex_find_get_task(pid_t pid)
{
struct task_struct *p;
- const struct cred *cred = current_cred(), *pcred;
rcu_read_lock();
p = find_task_by_vpid(pid);
- if (!p) {
- p = ERR_PTR(-ESRCH);
- } else {
- pcred = __task_cred(p);
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid)
- p = ERR_PTR(-ESRCH);
- else
- get_task_struct(p);
- }
+ if (p)
+ get_task_struct(p);
rcu_read_unlock();
@@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
if (!pid)
return -ESRCH;
p = futex_find_get_task(pid);
- if (IS_ERR(p))
- return PTR_ERR(p);
+ if (!p)
+ return -ESRCH;
/*
* We need to look at the task state flags to figure out,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b9b134b35088..5c69e996bd0f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -89,7 +89,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
do {
seq = read_seqbegin(&xtime_lock);
- xts = current_kernel_time();
+ xts = __current_kernel_time();
tom = wall_to_monotonic;
} while (read_seqretry(&xtime_lock, seq));
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3164ba7ce151..e1497481fe8a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -456,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
/* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
desc->status |= flags;
+
+ if (chip != desc->chip)
+ irq_chip_set_defaults(desc->chip);
}
return ret;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 474a84715eac..131b1703936f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs)
size_t crash_get_memory_size(void)
{
- size_t size;
+ size_t size = 0;
mutex_lock(&kexec_mutex);
- size = crashk_res.end - crashk_res.start + 1;
+ if (crashk_res.end != crashk_res.start)
+ size = crashk_res.end - crashk_res.start + 1;
mutex_unlock(&kexec_mutex);
return size;
}
@@ -1134,7 +1135,7 @@ int crash_shrink_memory(unsigned long new_size)
free_reserved_phys_range(end, crashk_res.end);
- if (start == end)
+ if ((start == end) && (crashk_res.parent != NULL))
release_resource(&crashk_res);
crashk_res.end = end - 1;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bf0e231d9702..6e9b19667a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...)
trace_module_request(module_name, wait, _RET_IP_);
- ret = call_usermodehelper(modprobe_path, argv, envp,
- wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
+ ret = call_usermodehelper_fns(modprobe_path, argv, envp,
+ wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
+ NULL, NULL, NULL);
+
atomic_dec(&kmod_concurrent);
return ret;
}
EXPORT_SYMBOL(__request_module);
#endif /* CONFIG_MODULES */
-struct subprocess_info {
- struct work_struct work;
- struct completion *complete;
- struct cred *cred;
- char *path;
- char **argv;
- char **envp;
- enum umh_wait wait;
- int retval;
- struct file *stdin;
- void (*cleanup)(char **argv, char **envp);
-};
-
/*
* This is the task which runs the usermode application
*/
@@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data)
struct subprocess_info *sub_info = data;
int retval;
- BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
-
- /* Unblock all signals */
spin_lock_irq(&current->sighand->siglock);
flush_signal_handlers(current, 1);
- sigemptyset(&current->blocked);
- recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
- /* Install the credentials */
- commit_creds(sub_info->cred);
- sub_info->cred = NULL;
-
- /* Install input pipe when needed */
- if (sub_info->stdin) {
- struct files_struct *f = current->files;
- struct fdtable *fdt;
- /* no races because files should be private here */
- sys_close(0);
- fd_install(0, sub_info->stdin);
- spin_lock(&f->file_lock);
- fdt = files_fdtable(f);
- FD_SET(0, fdt->open_fds);
- FD_CLR(0, fdt->close_on_exec);
- spin_unlock(&f->file_lock);
-
- /* and disallow core files too */
- current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
- }
-
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed_ptr(current, cpu_all_mask);
@@ -184,9 +147,16 @@ static int ____call_usermodehelper(void *data)
*/
set_user_nice(current, 0);
+ if (sub_info->init) {
+ retval = sub_info->init(sub_info);
+ if (retval)
+ goto fail;
+ }
+
retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
/* Exec failed? */
+fail:
sub_info->retval = retval;
do_exit(0);
}
@@ -194,9 +164,7 @@ static int ____call_usermodehelper(void *data)
void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
if (info->cleanup)
- (*info->cleanup)(info->argv, info->envp);
- if (info->cred)
- put_cred(info->cred);
+ (*info->cleanup)(info);
kfree(info);
}
EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -207,16 +175,16 @@ static int wait_for_helper(void *data)
struct subprocess_info *sub_info = data;
pid_t pid;
- /* Install a handler: if SIGCLD isn't handled sys_wait4 won't
- * populate the status, but will return -ECHILD. */
- allow_signal(SIGCHLD);
+ /* If SIGCLD is ignored sys_wait4 won't populate the status. */
+ spin_lock_irq(&current->sighand->siglock);
+ current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
+ spin_unlock_irq(&current->sighand->siglock);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
} else {
- int ret;
-
+ int ret = -ECHILD;
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
@@ -237,10 +205,7 @@ static int wait_for_helper(void *data)
sub_info->retval = ret;
}
- if (sub_info->wait == UMH_NO_WAIT)
- call_usermodehelper_freeinfo(sub_info);
- else
- complete(sub_info->complete);
+ complete(sub_info->complete);
return 0;
}
@@ -249,15 +214,13 @@ static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- pid_t pid;
enum umh_wait wait = sub_info->wait;
-
- BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+ pid_t pid;
/* CLONE_VFORK: wait until the usermode helper has execve'd
* successfully We need the data structures to stay around
* until that is done. */
- if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
+ if (wait == UMH_WAIT_PROC)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
else
@@ -266,15 +229,16 @@ static void __call_usermodehelper(struct work_struct *work)
switch (wait) {
case UMH_NO_WAIT:
+ call_usermodehelper_freeinfo(sub_info);
break;
case UMH_WAIT_PROC:
if (pid > 0)
break;
- sub_info->retval = pid;
/* FALLTHROUGH */
-
case UMH_WAIT_EXEC:
+ if (pid < 0)
+ sub_info->retval = pid;
complete(sub_info->complete);
}
}
@@ -376,80 +340,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
- sub_info->cred = prepare_usermodehelper_creds();
- if (!sub_info->cred) {
- kfree(sub_info);
- return NULL;
- }
-
out:
return sub_info;
}
EXPORT_SYMBOL(call_usermodehelper_setup);
/**
- * call_usermodehelper_setkeys - set the session keys for usermode helper
- * @info: a subprocess_info returned by call_usermodehelper_setup
- * @session_keyring: the session keyring for the process
- */
-void call_usermodehelper_setkeys(struct subprocess_info *info,
- struct key *session_keyring)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = info->cred->tgcred;
- key_put(tgcred->session_keyring);
- tgcred->session_keyring = key_get(session_keyring);
-#else
- BUG();
-#endif
-}
-EXPORT_SYMBOL(call_usermodehelper_setkeys);
-
-/**
- * call_usermodehelper_setcleanup - set a cleanup function
+ * call_usermodehelper_setfns - set a cleanup/init function
* @info: a subprocess_info returned by call_usermodehelper_setup
* @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
*
- * The cleanup function is just befor ethe subprocess_info is about to
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is called just before the subprocess_info is about to
* be freed. This can be used for freeing the argv and envp. The
* Function must be runnable in either a process context or the
* context in which call_usermodehelper_exec is called.
*/
-void call_usermodehelper_setcleanup(struct subprocess_info *info,
- void (*cleanup)(char **argv, char **envp))
+void call_usermodehelper_setfns(struct subprocess_info *info,
+ int (*init)(struct subprocess_info *info),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
{
info->cleanup = cleanup;
+ info->init = init;
+ info->data = data;
}
-EXPORT_SYMBOL(call_usermodehelper_setcleanup);
-
-/**
- * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
- * @sub_info: a subprocess_info returned by call_usermodehelper_setup
- * @filp: set to the write-end of a pipe
- *
- * This constructs a pipe, and sets the read end to be the stdin of the
- * subprocess, and returns the write-end in *@filp.
- */
-int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
- struct file **filp)
-{
- struct file *f;
-
- f = create_write_pipe(0);
- if (IS_ERR(f))
- return PTR_ERR(f);
- *filp = f;
-
- f = create_read_pipe(f, 0);
- if (IS_ERR(f)) {
- free_write_pipe(*filp);
- return PTR_ERR(f);
- }
- sub_info->stdin = f;
-
- return 0;
-}
-EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
+EXPORT_SYMBOL(call_usermodehelper_setfns);
/**
* call_usermodehelper_exec - start a usermode application
@@ -469,9 +390,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
- BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
- validate_creds(sub_info->cred);
-
helper_lock();
if (sub_info->path[0] == '\0')
goto out;
@@ -498,41 +416,6 @@ unlock:
}
EXPORT_SYMBOL(call_usermodehelper_exec);
-/**
- * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @filp: set to the write-end of a pipe
- *
- * This is a simple wrapper which executes a usermode-helper function
- * with a pipe as stdin. It is implemented entirely in terms of
- * lower-level call_usermodehelper_* functions.
- */
-int call_usermodehelper_pipe(char *path, char **argv, char **envp,
- struct file **filp)
-{
- struct subprocess_info *sub_info;
- int ret;
-
- sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
- if (sub_info == NULL)
- return -ENOMEM;
-
- ret = call_usermodehelper_stdinpipe(sub_info, filp);
- if (ret < 0) {
- call_usermodehelper_freeinfo(sub_info);
- return ret;
- }
-
- ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
- if (ret < 0) /* Failed to execute helper, close pipe */
- filp_close(*filp, NULL);
-
- return ret;
-}
-EXPORT_SYMBOL(call_usermodehelper_pipe);
-
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/module.c b/kernel/module.c
index 333fbcc96978..6c562828c85c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -72,7 +72,11 @@
/* If this is set, the section belongs in the init part of the module */
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
-/* List of modules, protected by module_mutex or preempt_disable
+/*
+ * Mutex protects:
+ * 1) List of modules (also safely readable with preempt_disable),
+ * 2) module_use links,
+ * 3) module_addr_min/module_addr_max.
* (delete uses stop_machine/add uses RCU list operations). */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
@@ -90,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
static BLOCKING_NOTIFIER_HEAD(module_notify_list);
-/* Bounds of module allocation, for speeding __module_address */
+/* Bounds of module allocation, for speeding __module_address.
+ * Protected by module_mutex. */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
int register_module_notifier(struct notifier_block * nb)
@@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
}
/* Find a symbol and return it, along with, (optional) crc and
- * (optional) module which owns it */
+ * (optional) module which owns it. Needs preempt disabled or module_mutex. */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const unsigned long **crc,
@@ -403,7 +408,7 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
const char *secstrings)
{
- return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+ return find_sec(hdr, sechdrs, secstrings, ".data..percpu");
}
static void percpu_modcopy(struct module *mod,
@@ -523,7 +528,8 @@ static void module_unload_init(struct module *mod)
{
int cpu;
- INIT_LIST_HEAD(&mod->modules_which_use_me);
+ INIT_LIST_HEAD(&mod->source_list);
+ INIT_LIST_HEAD(&mod->target_list);
for_each_possible_cpu(cpu) {
per_cpu_ptr(mod->refptr, cpu)->incs = 0;
per_cpu_ptr(mod->refptr, cpu)->decs = 0;
@@ -535,20 +541,13 @@ static void module_unload_init(struct module *mod)
mod->waiter = current;
}
-/* modules using other modules */
-struct module_use
-{
- struct list_head list;
- struct module *module_which_uses;
-};
-
/* Does a already use b? */
static int already_uses(struct module *a, struct module *b)
{
struct module_use *use;
- list_for_each_entry(use, &b->modules_which_use_me, list) {
- if (use->module_which_uses == a) {
+ list_for_each_entry(use, &b->source_list, source_list) {
+ if (use->source == a) {
DEBUGP("%s uses %s!\n", a->name, b->name);
return 1;
}
@@ -557,62 +556,68 @@ static int already_uses(struct module *a, struct module *b)
return 0;
}
-/* Module a uses b */
-int use_module(struct module *a, struct module *b)
+/*
+ * Module a uses b
+ * - we add 'a' as a "source", 'b' as a "target" of module use
+ * - the module_use is added to the list of 'b' sources (so
+ * 'b' can walk the list to see who sourced them), and of 'a'
+ * targets (so 'a' can see what modules it targets).
+ */
+static int add_module_usage(struct module *a, struct module *b)
{
struct module_use *use;
- int no_warn, err;
- if (b == NULL || already_uses(a, b)) return 1;
+ DEBUGP("Allocating new usage for %s.\n", a->name);
+ use = kmalloc(sizeof(*use), GFP_ATOMIC);
+ if (!use) {
+ printk(KERN_WARNING "%s: out of memory loading\n", a->name);
+ return -ENOMEM;
+ }
- /* If we're interrupted or time out, we fail. */
- if (wait_event_interruptible_timeout(
- module_wq, (err = strong_try_module_get(b)) != -EBUSY,
- 30 * HZ) <= 0) {
- printk("%s: gave up waiting for init of module %s.\n",
- a->name, b->name);
+ use->source = a;
+ use->target = b;
+ list_add(&use->source_list, &b->source_list);
+ list_add(&use->target_list, &a->target_list);
+ return 0;
+}
+
+/* Module a uses b: caller must hold module_mutex. */
+int ref_module(struct module *a, struct module *b)
+{
+ int err;
+
+ if (b == NULL || already_uses(a, b))
return 0;
- }
- /* If strong_try_module_get() returned a different error, we fail. */
+ /* If module isn't available, we fail. */
+ err = strong_try_module_get(b);
if (err)
- return 0;
+ return err;
- DEBUGP("Allocating new usage for %s.\n", a->name);
- use = kmalloc(sizeof(*use), GFP_ATOMIC);
- if (!use) {
- printk("%s: out of memory loading\n", a->name);
+ err = add_module_usage(a, b);
+ if (err) {
module_put(b);
- return 0;
+ return err;
}
-
- use->module_which_uses = a;
- list_add(&use->list, &b->modules_which_use_me);
- no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
- return 1;
+ return 0;
}
-EXPORT_SYMBOL_GPL(use_module);
+EXPORT_SYMBOL_GPL(ref_module);
/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
{
- struct module *i;
+ struct module_use *use, *tmp;
- list_for_each_entry(i, &modules, list) {
- struct module_use *use;
-
- list_for_each_entry(use, &i->modules_which_use_me, list) {
- if (use->module_which_uses == mod) {
- DEBUGP("%s unusing %s\n", mod->name, i->name);
- module_put(i);
- list_del(&use->list);
- kfree(use);
- sysfs_remove_link(i->holders_dir, mod->name);
- /* There can be at most one match. */
- break;
- }
- }
+ mutex_lock(&module_mutex);
+ list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
+ struct module *i = use->target;
+ DEBUGP("%s unusing %s\n", mod->name, i->name);
+ module_put(i);
+ list_del(&use->source_list);
+ list_del(&use->target_list);
+ kfree(use);
}
+ mutex_unlock(&module_mutex);
}
#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -735,7 +740,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
goto out;
}
- if (!list_empty(&mod->modules_which_use_me)) {
+ if (!list_empty(&mod->source_list)) {
/* Other modules depend on us: get rid of them first. */
ret = -EWOULDBLOCK;
goto out;
@@ -779,13 +784,13 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
async_synchronize_full();
- mutex_lock(&module_mutex);
+
/* Store the name of the last unloaded module for diagnostic purposes */
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
- ddebug_remove_module(mod->name);
- free_module(mod);
- out:
+ free_module(mod);
+ return 0;
+out:
mutex_unlock(&module_mutex);
return ret;
}
@@ -799,9 +804,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
/* Always include a trailing , so userspace can differentiate
between this and the old multi-field proc format. */
- list_for_each_entry(use, &mod->modules_which_use_me, list) {
+ list_for_each_entry(use, &mod->source_list, source_list) {
printed_something = 1;
- seq_printf(m, "%s,", use->module_which_uses->name);
+ seq_printf(m, "%s,", use->source->name);
}
if (mod->init != NULL && mod->exit == NULL) {
@@ -880,11 +885,11 @@ static inline void module_unload_free(struct module *mod)
{
}
-int use_module(struct module *a, struct module *b)
+int ref_module(struct module *a, struct module *b)
{
- return strong_try_module_get(b) == 0;
+ return strong_try_module_get(b);
}
-EXPORT_SYMBOL_GPL(use_module);
+EXPORT_SYMBOL_GPL(ref_module);
static inline void module_unload_init(struct module *mod)
{
@@ -1001,6 +1006,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
{
const unsigned long *crc;
+ /* Since this should be found in kernel (which can't be removed),
+ * no locking is necessary. */
if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
&crc, true, false))
BUG();
@@ -1043,29 +1050,62 @@ static inline int same_magic(const char *amagic, const char *bmagic,
}
#endif /* CONFIG_MODVERSIONS */
-/* Resolve a symbol for this module. I.e. if we find one, record usage.
- Must be holding module_mutex. */
+/* Resolve a symbol for this module. I.e. if we find one, record usage. */
static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *name,
- struct module *mod)
+ struct module *mod,
+ char ownername[])
{
struct module *owner;
const struct kernel_symbol *sym;
const unsigned long *crc;
+ int err;
+ mutex_lock(&module_mutex);
sym = find_symbol(name, &owner, &crc,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
- /* use_module can fail due to OOM,
- or module initialization or unloading */
- if (sym) {
- if (!check_version(sechdrs, versindex, name, mod, crc, owner)
- || !use_module(mod, owner))
- sym = NULL;
+ if (!sym)
+ goto unlock;
+
+ if (!check_version(sechdrs, versindex, name, mod, crc, owner)) {
+ sym = ERR_PTR(-EINVAL);
+ goto getname;
}
+
+ err = ref_module(mod, owner);
+ if (err) {
+ sym = ERR_PTR(err);
+ goto getname;
+ }
+
+getname:
+ /* We must make a copy under the lock if we failed to get a ref. */
+ strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
+unlock:
+ mutex_unlock(&module_mutex);
return sym;
}
+static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *name,
+ struct module *mod)
+{
+ const struct kernel_symbol *ksym;
+ char ownername[MODULE_NAME_LEN];
+
+ if (wait_event_interruptible_timeout(module_wq,
+ !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name,
+ mod, ownername)) ||
+ PTR_ERR(ksym) != -EBUSY,
+ 30 * HZ) <= 0) {
+ printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
+ mod->name, ownername);
+ }
+ return ksym;
+}
+
/*
* /sys/module/foo/sections stuff
* J. Corbet <corbet@lwn.net>
@@ -1295,7 +1335,34 @@ static inline void remove_notes_attrs(struct module *mod)
#endif
#ifdef CONFIG_SYSFS
-int module_add_modinfo_attrs(struct module *mod)
+static void add_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+ int nowarn;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ nowarn = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ }
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static int module_add_modinfo_attrs(struct module *mod)
{
struct module_attribute *attr;
struct module_attribute *temp_attr;
@@ -1321,7 +1388,7 @@ int module_add_modinfo_attrs(struct module *mod)
return error;
}
-void module_remove_modinfo_attrs(struct module *mod)
+static void module_remove_modinfo_attrs(struct module *mod)
{
struct module_attribute *attr;
int i;
@@ -1337,7 +1404,7 @@ void module_remove_modinfo_attrs(struct module *mod)
kfree(mod->modinfo_attrs);
}
-int mod_sysfs_init(struct module *mod)
+static int mod_sysfs_init(struct module *mod)
{
int err;
struct kobject *kobj;
@@ -1371,12 +1438,16 @@ out:
return err;
}
-int mod_sysfs_setup(struct module *mod,
+static int mod_sysfs_setup(struct module *mod,
struct kernel_param *kparam,
unsigned int num_params)
{
int err;
+ err = mod_sysfs_init(mod);
+ if (err)
+ goto out;
+
mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
if (!mod->holders_dir) {
err = -ENOMEM;
@@ -1391,6 +1462,8 @@ int mod_sysfs_setup(struct module *mod,
if (err)
goto out_unreg_param;
+ add_usage_links(mod);
+
kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
return 0;
@@ -1400,6 +1473,7 @@ out_unreg_holders:
kobject_put(mod->holders_dir);
out_unreg:
kobject_put(&mod->mkobj.kobj);
+out:
return err;
}
@@ -1410,14 +1484,40 @@ static void mod_sysfs_fini(struct module *mod)
#else /* CONFIG_SYSFS */
+static inline int mod_sysfs_init(struct module *mod)
+{
+ return 0;
+}
+
+static inline int mod_sysfs_setup(struct module *mod,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ return 0;
+}
+
+static inline int module_add_modinfo_attrs(struct module *mod)
+{
+ return 0;
+}
+
+static inline void module_remove_modinfo_attrs(struct module *mod)
+{
+}
+
static void mod_sysfs_fini(struct module *mod)
{
}
+static void del_usage_links(struct module *mod)
+{
+}
+
#endif /* CONFIG_SYSFS */
static void mod_kobject_remove(struct module *mod)
{
+ del_usage_links(mod);
module_remove_modinfo_attrs(mod);
module_param_sysfs_remove(mod);
kobject_put(mod->mkobj.drivers_dir);
@@ -1436,17 +1536,22 @@ static int __unlink_module(void *_mod)
return 0;
}
-/* Free a module, remove from lists, etc (must hold module_mutex). */
+/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
trace_module_free(mod);
/* Delete from various lists */
+ mutex_lock(&module_mutex);
stop_machine(__unlink_module, mod, NULL);
+ mutex_unlock(&module_mutex);
remove_notes_attrs(mod);
remove_sect_attrs(mod);
mod_kobject_remove(mod);
+ /* Remove dynamic debug info */
+ ddebug_remove_module(mod->name);
+
/* Arch-specific cleanup. */
module_arch_cleanup(mod);
@@ -1493,6 +1598,8 @@ EXPORT_SYMBOL_GPL(__symbol_get);
/*
* Ensure that an exported symbol [global namespace] does not already exist
* in the kernel or in some other module's exported symbol table.
+ *
+ * You must hold the module_mutex.
*/
static int verify_export_symbols(struct module *mod)
{
@@ -1558,21 +1665,23 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
break;
case SHN_UNDEF:
- ksym = resolve_symbol(sechdrs, versindex,
- strtab + sym[i].st_name, mod);
+ ksym = resolve_symbol_wait(sechdrs, versindex,
+ strtab + sym[i].st_name,
+ mod);
/* Ok if resolved. */
- if (ksym) {
+ if (ksym && !IS_ERR(ksym)) {
sym[i].st_value = ksym->value;
break;
}
/* Ok if weak. */
- if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
+ if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
break;
- printk(KERN_WARNING "%s: Unknown symbol %s\n",
- mod->name, strtab + sym[i].st_name);
- ret = -ENOENT;
+ printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
+ mod->name, strtab + sym[i].st_name,
+ PTR_ERR(ksym));
+ ret = PTR_ERR(ksym) ?: -ENOENT;
break;
default:
@@ -1955,16 +2064,24 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
#endif
}
+static void dynamic_debug_remove(struct _ddebug *debug)
+{
+ if (debug)
+ ddebug_remove_module(debug->modname);
+}
+
static void *module_alloc_update_bounds(unsigned long size)
{
void *ret = module_alloc(size);
if (ret) {
+ mutex_lock(&module_mutex);
/* Update module bounds. */
if ((unsigned long)ret < module_addr_min)
module_addr_min = (unsigned long)ret;
if ((unsigned long)ret + size > module_addr_max)
module_addr_max = (unsigned long)ret + size;
+ mutex_unlock(&module_mutex);
}
return ret;
}
@@ -2014,6 +2131,9 @@ static noinline struct module *load_module(void __user *umod,
long err = 0;
void *ptr = NULL; /* Stops spurious gcc warning */
unsigned long symoffs, stroffs, *strmap;
+ void __percpu *percpu;
+ struct _ddebug *debug = NULL;
+ unsigned int num_debug = 0;
mm_segment_t old_fs;
@@ -2138,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod,
goto free_mod;
}
- if (find_module(mod->name)) {
- err = -EEXIST;
- goto free_mod;
- }
-
mod->state = MODULE_STATE_COMING;
/* Allow arches to frob section contents and sizes. */
@@ -2158,6 +2273,8 @@ static noinline struct module *load_module(void __user *umod,
goto free_mod;
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
}
+ /* Keep this around for failure path. */
+ percpu = mod_percpu(mod);
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
@@ -2231,11 +2348,6 @@ static noinline struct module *load_module(void __user *umod,
/* Now we've moved module, initialize linked lists, etc. */
module_unload_init(mod);
- /* add kobject, so we can reference it. */
- err = mod_sysfs_init(mod);
- if (err)
- goto free_unload;
-
/* Set up license info based on the info section */
set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
@@ -2360,11 +2472,6 @@ static noinline struct module *load_module(void __user *umod,
goto cleanup;
}
- /* Find duplicate symbols */
- err = verify_export_symbols(mod);
- if (err < 0)
- goto cleanup;
-
/* Set up and sort exception table */
mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
sizeof(*mod->extable), &mod->num_exentries);
@@ -2379,15 +2486,9 @@ static noinline struct module *load_module(void __user *umod,
kfree(strmap);
strmap = NULL;
- if (!mod->taints) {
- struct _ddebug *debug;
- unsigned int num_debug;
-
+ if (!mod->taints)
debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
sizeof(*debug), &num_debug);
- if (debug)
- dynamic_debug_setup(debug, num_debug);
- }
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
@@ -2423,7 +2524,22 @@ static noinline struct module *load_module(void __user *umod,
* function to insert in a way safe to concurrent readers.
* The mutex protects against concurrent writers.
*/
+ mutex_lock(&module_mutex);
+ if (find_module(mod->name)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ if (debug)
+ dynamic_debug_setup(debug, num_debug);
+
+ /* Find duplicate symbols */
+ err = verify_export_symbols(mod);
+ if (err < 0)
+ goto ddebug;
+
list_add_rcu(&mod->list, &modules);
+ mutex_unlock(&module_mutex);
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
if (err < 0)
@@ -2432,6 +2548,7 @@ static noinline struct module *load_module(void __user *umod,
err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
if (err < 0)
goto unlink;
+
add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2444,15 +2561,17 @@ static noinline struct module *load_module(void __user *umod,
return mod;
unlink:
+ mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
+ ddebug:
+ dynamic_debug_remove(debug);
+ unlock:
+ mutex_unlock(&module_mutex);
synchronize_sched();
module_arch_cleanup(mod);
cleanup:
free_modinfo(mod);
- kobject_del(&mod->mkobj.kobj);
- kobject_put(&mod->mkobj.kobj);
- free_unload:
module_unload_free(mod);
#if defined(CONFIG_MODULE_UNLOAD)
free_percpu(mod->refptr);
@@ -2463,7 +2582,7 @@ static noinline struct module *load_module(void __user *umod,
module_free(mod, mod->module_core);
/* mod will be freed with core. Don't access it beyond this line! */
free_percpu:
- percpu_modfree(mod);
+ free_percpu(percpu);
free_mod:
kfree(args);
kfree(strmap);
@@ -2499,19 +2618,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
if (!capable(CAP_SYS_MODULE) || modules_disabled)
return -EPERM;
- /* Only one module load at a time, please */
- if (mutex_lock_interruptible(&module_mutex) != 0)
- return -EINTR;
-
/* Do all the hard work */
mod = load_module(umod, len, uargs);
- if (IS_ERR(mod)) {
- mutex_unlock(&module_mutex);
+ if (IS_ERR(mod))
return PTR_ERR(mod);
- }
-
- /* Drop lock so they can recurse */
- mutex_unlock(&module_mutex);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
@@ -2528,9 +2638,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
module_put(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
- mutex_lock(&module_mutex);
free_module(mod);
- mutex_unlock(&module_mutex);
wake_up(&module_wq);
return ret;
}
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 632f04c57d82..4c0b7b3e6d2e 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -172,6 +172,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
struct thread_info *owner;
/*
+ * If we own the BKL, then don't spin. The owner of
+ * the mutex might be waiting on us to release the BKL.
+ */
+ if (unlikely(current->lock_depth >= 0))
+ break;
+
+ /*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
diff --git a/kernel/padata.c b/kernel/padata.c
index fd4679266ede..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -864,7 +864,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
err = __padata_add_cpu(pinst, cpu);
mutex_unlock(&pinst->lock);
if (err)
- return NOTIFY_BAD;
+ return notifier_from_errno(err);
break;
case CPU_DOWN_PREPARE:
@@ -875,7 +875,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
err = __padata_remove_cpu(pinst, cpu);
mutex_unlock(&pinst->lock);
if (err)
- return NOTIFY_BAD;
+ return notifier_from_errno(err);
break;
case CPU_UP_CANCELED:
diff --git a/kernel/panic.c b/kernel/panic.c
index dbe13dbb057a..3b16cd93fa7d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -87,6 +87,7 @@ NORET_TYPE void panic(const char * fmt, ...)
*/
preempt_disable();
+ console_verbose();
bust_spinlocks(1);
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a4fa381db3c2..ff86c558af4c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_event *group_leader = event->group_leader;
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+ event->attach_state |= PERF_ATTACH_CONTEXT;
/*
- * Depending on whether it is a standalone or sibling event,
- * add it straight to the context's event list, or to the group
- * leader's sibling list:
+ * If we're a standalone event or group leader, we go to the context
+ * list; group events are kept attached to the group so that
+ * perf_group_detach can, at all times, locate all siblings.
*/
- if (group_leader == event) {
+ if (event->group_leader == event) {
struct list_head *list;
if (is_software_event(event))
@@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list = ctx_group_list(event, ctx);
list_add_tail(&event->group_entry, list);
- } else {
- if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
- !is_software_event(event))
- group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
-
- list_add_tail(&event->group_entry, &group_leader->sibling_list);
- group_leader->nr_siblings++;
}
list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_stat++;
}
+static void perf_group_attach(struct perf_event *event)
+{
+ struct perf_event *group_leader = event->group_leader;
+
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+ event->attach_state |= PERF_ATTACH_GROUP;
+
+ if (group_leader == event)
+ return;
+
+ if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+ !is_software_event(event))
+ group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
+ list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ group_leader->nr_siblings++;
+}
+
/*
* Remove a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
@@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
- if (list_empty(&event->group_entry))
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_CONTEXT))
return;
+
+ event->attach_state &= ~PERF_ATTACH_CONTEXT;
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
- list_del_init(&event->group_entry);
list_del_rcu(&event->event_entry);
- if (event->group_leader != event)
- event->group_leader->nr_siblings--;
+ if (event->group_leader == event)
+ list_del_init(&event->group_entry);
update_group_times(event);
@@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->state = PERF_EVENT_STATE_OFF;
}
-static void
-perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
+static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
+ struct list_head *list = NULL;
+
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_GROUP))
+ return;
+
+ event->attach_state &= ~PERF_ATTACH_GROUP;
+
+ /*
+ * If this is a sibling, remove it from its group.
+ */
+ if (event->group_leader != event) {
+ list_del_init(&event->group_entry);
+ event->group_leader->nr_siblings--;
+ return;
+ }
+
+ if (!list_empty(&event->group_entry))
+ list = &event->group_entry;
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
- * to the context list directly:
+ * to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
- struct list_head *list;
-
- list = ctx_group_list(event, ctx);
- list_move_tail(&sibling->group_entry, list);
+ if (list)
+ list_move_tail(&sibling->group_entry, list);
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
@@ -652,8 +687,11 @@ group_sched_in(struct perf_event *group_event,
if (txn)
pmu->start_txn(pmu);
- if (event_sched_in(group_event, cpuctx, ctx))
+ if (event_sched_in(group_event, cpuctx, ctx)) {
+ if (txn)
+ pmu->cancel_txn(pmu);
return -EAGAIN;
+ }
/*
* Schedule in siblings as one group (if any):
@@ -675,9 +713,6 @@ group_sched_in(struct perf_event *group_event,
}
group_error:
- if (txn)
- pmu->cancel_txn(pmu);
-
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
@@ -689,6 +724,9 @@ group_error:
}
event_sched_out(group_event, cpuctx, ctx);
+ if (txn)
+ pmu->cancel_txn(pmu);
+
return -EAGAIN;
}
@@ -727,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
list_add_event(event, ctx);
+ perf_group_attach(event);
event->tstamp_enabled = ctx->time;
event->tstamp_running = ctx->time;
event->tstamp_stopped = ctx->time;
@@ -1468,6 +1507,9 @@ do { \
divisor = nsec * frequency;
}
+ if (!divisor)
+ return dividend;
+
return div64_u64(dividend, divisor);
}
@@ -1490,7 +1532,7 @@ static int perf_event_start(struct perf_event *event)
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
{
struct hw_perf_event *hwc = &event->hw;
- u64 period, sample_period;
+ s64 period, sample_period;
s64 delta;
period = perf_calculate_period(event, nsec, count);
@@ -1841,6 +1883,7 @@ static void free_event_rcu(struct rcu_head *head)
}
static void perf_pending_sync(struct perf_event *event);
+static void perf_mmap_data_put(struct perf_mmap_data *data);
static void free_event(struct perf_event *event)
{
@@ -1856,9 +1899,9 @@ static void free_event(struct perf_event *event)
atomic_dec(&nr_task_events);
}
- if (event->output) {
- fput(event->output->filp);
- event->output = NULL;
+ if (event->data) {
+ perf_mmap_data_put(event->data);
+ event->data = NULL;
}
if (event->destroy)
@@ -1893,8 +1936,8 @@ int perf_event_release_kernel(struct perf_event *event)
*/
mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
raw_spin_lock_irq(&ctx->lock);
+ perf_group_detach(event);
list_del_event(event, ctx);
- perf_destroy_group(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
mutex_unlock(&ctx->mutex);
@@ -2175,7 +2218,27 @@ unlock:
return ret;
}
-static int perf_event_set_output(struct perf_event *event, int output_fd);
+static const struct file_operations perf_fops;
+
+static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+{
+ struct file *file;
+
+ file = fget_light(fd, fput_needed);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &perf_fops) {
+ fput_light(file, *fput_needed);
+ *fput_needed = 0;
+ return ERR_PTR(-EBADF);
+ }
+
+ return file->private_data;
+}
+
+static int perf_event_set_output(struct perf_event *event,
+ struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2202,7 +2265,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return perf_event_period(event, (u64 __user *)arg);
case PERF_EVENT_IOC_SET_OUTPUT:
- return perf_event_set_output(event, arg);
+ {
+ struct perf_event *output_event = NULL;
+ int fput_needed = 0;
+ int ret;
+
+ if (arg != -1) {
+ output_event = perf_fget_light(arg, &fput_needed);
+ if (IS_ERR(output_event))
+ return PTR_ERR(output_event);
+ }
+
+ ret = perf_event_set_output(event, output_event);
+ if (output_event)
+ fput_light(output_event->filp, fput_needed);
+
+ return ret;
+ }
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);
@@ -2297,11 +2376,6 @@ unlock:
rcu_read_unlock();
}
-static unsigned long perf_data_size(struct perf_mmap_data *data)
-{
- return data->nr_pages << (PAGE_SHIFT + data->data_order);
-}
-
#ifndef CONFIG_PERF_USE_VMALLOC
/*
@@ -2320,6 +2394,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
return virt_to_page(data->data_pages[pgoff - 1]);
}
+static void *perf_mmap_alloc_page(int cpu)
+{
+ struct page *page;
+ int node;
+
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
static struct perf_mmap_data *
perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
{
@@ -2327,8 +2414,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
unsigned long size;
int i;
- WARN_ON(atomic_read(&event->mmap_count));
-
size = sizeof(struct perf_mmap_data);
size += nr_pages * sizeof(void *);
@@ -2336,17 +2421,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
if (!data)
goto fail;
- data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+ data->user_page = perf_mmap_alloc_page(event->cpu);
if (!data->user_page)
goto fail_user_page;
for (i = 0; i < nr_pages; i++) {
- data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
if (!data->data_pages[i])
goto fail_data_pages;
}
- data->data_order = 0;
data->nr_pages = nr_pages;
return data;
@@ -2382,6 +2466,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
kfree(data);
}
+static inline int page_order(struct perf_mmap_data *data)
+{
+ return 0;
+}
+
#else
/*
@@ -2390,10 +2479,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
* Required for architectures that have d-cache aliasing issues.
*/
+static inline int page_order(struct perf_mmap_data *data)
+{
+ return data->page_order;
+}
+
static struct page *
perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
{
- if (pgoff > (1UL << data->data_order))
+ if (pgoff > (1UL << page_order(data)))
return NULL;
return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2413,7 +2507,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
int i, nr;
data = container_of(work, struct perf_mmap_data, work);
- nr = 1 << data->data_order;
+ nr = 1 << page_order(data);
base = data->user_page;
for (i = 0; i < nr + 1; i++)
@@ -2435,8 +2529,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
unsigned long size;
void *all_buf;
- WARN_ON(atomic_read(&event->mmap_count));
-
size = sizeof(struct perf_mmap_data);
size += sizeof(void *);
@@ -2452,7 +2544,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
data->user_page = all_buf;
data->data_pages[0] = all_buf + PAGE_SIZE;
- data->data_order = ilog2(nr_pages);
+ data->page_order = ilog2(nr_pages);
data->nr_pages = 1;
return data;
@@ -2466,6 +2558,11 @@ fail:
#endif
+static unsigned long perf_data_size(struct perf_mmap_data *data)
+{
+ return data->nr_pages << (PAGE_SHIFT + page_order(data));
+}
+
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct perf_event *event = vma->vm_file->private_data;
@@ -2506,8 +2603,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
{
long max_size = perf_data_size(data);
- atomic_set(&data->lock, -1);
-
if (event->attr.watermark) {
data->watermark = min_t(long, max_size,
event->attr.wakeup_watermark);
@@ -2516,7 +2611,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
if (!data->watermark)
data->watermark = max_size / 2;
-
+ atomic_set(&data->refcount, 1);
rcu_assign_pointer(event->data, data);
}
@@ -2528,13 +2623,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
perf_mmap_data_free(data);
}
-static void perf_mmap_data_release(struct perf_event *event)
+static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
{
- struct perf_mmap_data *data = event->data;
+ struct perf_mmap_data *data;
- WARN_ON(atomic_read(&event->mmap_count));
+ rcu_read_lock();
+ data = rcu_dereference(event->data);
+ if (data) {
+ if (!atomic_inc_not_zero(&data->refcount))
+ data = NULL;
+ }
+ rcu_read_unlock();
+
+ return data;
+}
+
+static void perf_mmap_data_put(struct perf_mmap_data *data)
+{
+ if (!atomic_dec_and_test(&data->refcount))
+ return;
- rcu_assign_pointer(event->data, NULL);
call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
}
@@ -2549,15 +2657,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- WARN_ON_ONCE(event->ctx->parent_ctx);
if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
unsigned long size = perf_data_size(event->data);
- struct user_struct *user = current_user();
+ struct user_struct *user = event->mmap_user;
+ struct perf_mmap_data *data = event->data;
atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->locked_vm -= event->data->nr_locked;
- perf_mmap_data_release(event);
+ vma->vm_mm->locked_vm -= event->mmap_locked;
+ rcu_assign_pointer(event->data, NULL);
mutex_unlock(&event->mmap_mutex);
+
+ perf_mmap_data_put(data);
+ free_uid(user);
}
}
@@ -2580,6 +2691,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
long user_extra, extra;
int ret = 0;
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same buffer.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
+
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
@@ -2601,13 +2720,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->mmap_mutex);
- if (event->output) {
- ret = -EINVAL;
- goto unlock;
- }
-
- if (atomic_inc_not_zero(&event->mmap_count)) {
- if (nr_pages != event->data->nr_pages)
+ if (event->data) {
+ if (event->data->nr_pages == nr_pages)
+ atomic_inc(&event->data->refcount);
+ else
ret = -EINVAL;
goto unlock;
}
@@ -2639,21 +2755,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
WARN_ON(event->data);
data = perf_mmap_data_alloc(event, nr_pages);
- ret = -ENOMEM;
- if (!data)
+ if (!data) {
+ ret = -ENOMEM;
goto unlock;
+ }
- ret = 0;
perf_mmap_data_init(event, data);
-
- atomic_set(&event->mmap_count, 1);
- atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->locked_vm += extra;
- event->data->nr_locked = extra;
if (vma->vm_flags & VM_WRITE)
event->data->writable = 1;
+ atomic_long_add(user_extra, &user->locked_vm);
+ event->mmap_locked = extra;
+ event->mmap_user = get_current_user();
+ vma->vm_mm->locked_vm += event->mmap_locked;
+
unlock:
+ if (!ret)
+ atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
vma->vm_flags |= VM_RESERVED;
@@ -2885,127 +3003,87 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
}
/*
- * Curious locking construct.
- *
* We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
* cannot fully serialize things.
*
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
* We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
*/
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
{
struct perf_mmap_data *data = handle->data;
- int cur, cpu = get_cpu();
-
- handle->locked = 0;
- for (;;) {
- cur = atomic_cmpxchg(&data->lock, -1, cpu);
- if (cur == -1) {
- handle->locked = 1;
- break;
- }
- if (cur == cpu)
- break;
-
- cpu_relax();
- }
+ preempt_disable();
+ local_inc(&data->nest);
+ handle->wakeup = local_read(&data->wakeup);
}
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
{
struct perf_mmap_data *data = handle->data;
unsigned long head;
- int cpu;
-
- data->done_head = data->head;
-
- if (!handle->locked)
- goto out;
again:
- /*
- * The xchg implies a full barrier that ensures all writes are done
- * before we publish the new head, matched by a rmb() in userspace when
- * reading this position.
- */
- while ((head = atomic_long_xchg(&data->done_head, 0)))
- data->user_page->data_head = head;
+ head = local_read(&data->head);
/*
- * NMI can happen here, which means we can miss a done_head update.
+ * IRQ/NMI can happen here, which means we can miss a head update.
*/
- cpu = atomic_xchg(&data->lock, -1);
- WARN_ON_ONCE(cpu != smp_processor_id());
+ if (!local_dec_and_test(&data->nest))
+ goto out;
/*
- * Therefore we have to validate we did not indeed do so.
+ * Publish the known good head. Rely on the full barrier implied
+ * by local_dec_and_test() to order the data->head read and this
+ * write.
*/
- if (unlikely(atomic_long_read(&data->done_head))) {
- /*
- * Since we had it locked, we can lock it again.
- */
- while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
- cpu_relax();
+ data->user_page->data_head = head;
+ /*
+ * Now check if we missed an update; rely on the (compiler)
+ * barrier in local_dec_and_test() to re-read data->head.
+ */
+ if (unlikely(head != local_read(&data->head))) {
+ local_inc(&data->nest);
goto again;
}
- if (atomic_xchg(&data->wakeup, 0))
+ if (handle->wakeup != local_read(&data->wakeup))
perf_output_wakeup(handle);
-out:
- put_cpu();
+
+ out:
+ preempt_enable();
}
-void perf_output_copy(struct perf_output_handle *handle,
+__always_inline void perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
- unsigned int pages_mask;
- unsigned long offset;
- unsigned int size;
- void **pages;
-
- offset = handle->offset;
- pages_mask = handle->data->nr_pages - 1;
- pages = handle->data->data_pages;
-
do {
- unsigned long page_offset;
- unsigned long page_size;
- int nr;
+ unsigned long size = min_t(unsigned long, handle->size, len);
- nr = (offset >> PAGE_SHIFT) & pages_mask;
- page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
- page_offset = offset & (page_size - 1);
- size = min_t(unsigned int, page_size - page_offset, len);
+ memcpy(handle->addr, buf, size);
- memcpy(pages[nr] + page_offset, buf, size);
+ len -= size;
+ handle->addr += size;
+ buf += size;
+ handle->size -= size;
+ if (!handle->size) {
+ struct perf_mmap_data *data = handle->data;
- len -= size;
- buf += size;
- offset += size;
+ handle->page++;
+ handle->page &= data->nr_pages - 1;
+ handle->addr = data->data_pages[handle->page];
+ handle->size = PAGE_SIZE << page_order(data);
+ }
} while (len);
-
- handle->offset = offset;
-
- /*
- * Check we didn't copy past our reservation window, taking the
- * possible unsigned int wrap into account.
- */
- WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
}
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size,
int nmi, int sample)
{
- struct perf_event *output_event;
struct perf_mmap_data *data;
unsigned long tail, offset, head;
int have_lost;
@@ -3022,10 +3100,6 @@ int perf_output_begin(struct perf_output_handle *handle,
if (event->parent)
event = event->parent;
- output_event = rcu_dereference(event->output);
- if (output_event)
- event = output_event;
-
data = rcu_dereference(event->data);
if (!data)
goto out;
@@ -3036,13 +3110,13 @@ int perf_output_begin(struct perf_output_handle *handle,
handle->sample = sample;
if (!data->nr_pages)
- goto fail;
+ goto out;
- have_lost = atomic_read(&data->lost);
+ have_lost = local_read(&data->lost);
if (have_lost)
size += sizeof(lost_event);
- perf_output_lock(handle);
+ perf_output_get_handle(handle);
do {
/*
@@ -3052,24 +3126,28 @@ int perf_output_begin(struct perf_output_handle *handle,
*/
tail = ACCESS_ONCE(data->user_page->data_tail);
smp_rmb();
- offset = head = atomic_long_read(&data->head);
+ offset = head = local_read(&data->head);
head += size;
if (unlikely(!perf_output_space(data, tail, offset, head)))
goto fail;
- } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+ } while (local_cmpxchg(&data->head, offset, head) != offset);
- handle->offset = offset;
- handle->head = head;
+ if (head - local_read(&data->wakeup) > data->watermark)
+ local_add(data->watermark, &data->wakeup);
- if (head - tail > data->watermark)
- atomic_set(&data->wakeup, 1);
+ handle->page = offset >> (PAGE_SHIFT + page_order(data));
+ handle->page &= data->nr_pages - 1;
+ handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
+ handle->addr = data->data_pages[handle->page];
+ handle->addr += handle->size;
+ handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
if (have_lost) {
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.header.size = sizeof(lost_event);
lost_event.id = event->id;
- lost_event.lost = atomic_xchg(&data->lost, 0);
+ lost_event.lost = local_xchg(&data->lost, 0);
perf_output_put(handle, lost_event);
}
@@ -3077,8 +3155,8 @@ int perf_output_begin(struct perf_output_handle *handle,
return 0;
fail:
- atomic_inc(&data->lost);
- perf_output_unlock(handle);
+ local_inc(&data->lost);
+ perf_output_put_handle(handle);
out:
rcu_read_unlock();
@@ -3093,14 +3171,14 @@ void perf_output_end(struct perf_output_handle *handle)
int wakeup_events = event->attr.wakeup_events;
if (handle->sample && wakeup_events) {
- int events = atomic_inc_return(&data->events);
+ int events = local_inc_return(&data->events);
if (events >= wakeup_events) {
- atomic_sub(wakeup_events, &data->events);
- atomic_set(&data->wakeup, 1);
+ local_sub(wakeup_events, &data->events);
+ local_inc(&data->wakeup);
}
}
- perf_output_unlock(handle);
+ perf_output_put_handle(handle);
rcu_read_unlock();
}
@@ -3436,22 +3514,13 @@ static void perf_event_task_output(struct perf_event *event,
{
struct perf_output_handle handle;
struct task_struct *task = task_event->task;
- unsigned long flags;
int size, ret;
- /*
- * If this CPU attempts to acquire an rq lock held by a CPU spinning
- * in perf_output_lock() from interrupt context, it's game over.
- */
- local_irq_save(flags);
-
size = task_event->event_id.header.size;
ret = perf_output_begin(&handle, event, size, 0, 0);
- if (ret) {
- local_irq_restore(flags);
+ if (ret)
return;
- }
task_event->event_id.pid = perf_event_pid(event, task);
task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3531,6 @@ static void perf_event_task_output(struct perf_event *event,
perf_output_put(&handle, task_event->event_id);
perf_output_end(&handle);
- local_irq_restore(flags);
}
static int perf_event_task_match(struct perf_event *event)
@@ -3990,13 +4058,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
}
}
-static void perf_swevent_unthrottle(struct perf_event *event)
-{
- /*
- * Nothing to do, we already reset hwc->interrupts.
- */
-}
-
static void perf_swevent_add(struct perf_event *event, u64 nr,
int nmi, struct perf_sample_data *data,
struct pt_regs *regs)
@@ -4020,9 +4081,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
perf_swevent_overflow(event, 0, nmi, data, regs);
}
-static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data);
-
static int perf_exclude_event(struct perf_event *event,
struct pt_regs *regs)
{
@@ -4052,10 +4110,6 @@ static int perf_swevent_match(struct perf_event *event,
if (perf_exclude_event(event, regs))
return 0;
- if (event->attr.type == PERF_TYPE_TRACEPOINT &&
- !perf_tp_event_match(event, data))
- return 0;
-
return 1;
}
@@ -4066,19 +4120,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id)
return hash_64(val, SWEVENT_HLIST_BITS);
}
-static struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
{
- u64 hash;
- struct swevent_hlist *hlist;
+ u64 hash = swevent_hash(type, event_id);
- hash = swevent_hash(type, event_id);
+ return &hlist->heads[hash];
+}
+
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+ struct swevent_hlist *hlist;
hlist = rcu_dereference(ctx->swevent_hlist);
if (!hlist)
return NULL;
- return &hlist->heads[hash];
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+{
+ struct swevent_hlist *hlist;
+ u32 event_id = event->attr.config;
+ u64 type = event->attr.type;
+
+ /*
+ * Event scheduling is always serialized against hlist allocation
+ * and release, which makes the protected version suitable here.
+ * The context lock guarantees that.
+ */
+ hlist = rcu_dereference_protected(ctx->swevent_hlist,
+ lockdep_is_held(&event->ctx->lock));
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
}
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
@@ -4095,7 +4176,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
rcu_read_lock();
- head = find_swevent_head(cpuctx, type, event_id);
+ head = find_swevent_head_rcu(cpuctx, type, event_id);
if (!head)
goto end;
@@ -4110,7 +4191,7 @@ end:
int perf_swevent_get_recursion_context(void)
{
- struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
int rctx;
if (in_nmi())
@@ -4122,10 +4203,8 @@ int perf_swevent_get_recursion_context(void)
else
rctx = 0;
- if (cpuctx->recursion[rctx]) {
- put_cpu_var(perf_cpu_context);
+ if (cpuctx->recursion[rctx])
return -1;
- }
cpuctx->recursion[rctx]++;
barrier();
@@ -4139,7 +4218,6 @@ void perf_swevent_put_recursion_context(int rctx)
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
barrier();
cpuctx->recursion[rctx]--;
- put_cpu_var(perf_cpu_context);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
@@ -4150,6 +4228,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
struct perf_sample_data data;
int rctx;
+ preempt_disable_notrace();
rctx = perf_swevent_get_recursion_context();
if (rctx < 0)
return;
@@ -4159,6 +4238,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
perf_swevent_put_recursion_context(rctx);
+ preempt_enable_notrace();
}
static void perf_swevent_read(struct perf_event *event)
@@ -4178,7 +4258,7 @@ static int perf_swevent_enable(struct perf_event *event)
perf_swevent_set_period(event);
}
- head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+ head = find_swevent_head(cpuctx, event);
if (WARN_ON_ONCE(!head))
return -EINVAL;
@@ -4192,11 +4272,22 @@ static void perf_swevent_disable(struct perf_event *event)
hlist_del_rcu(&event->hlist_entry);
}
+static void perf_swevent_void(struct perf_event *event)
+{
+}
+
+static int perf_swevent_int(struct perf_event *event)
+{
+ return 0;
+}
+
static const struct pmu perf_ops_generic = {
.enable = perf_swevent_enable,
.disable = perf_swevent_disable,
+ .start = perf_swevent_int,
+ .stop = perf_swevent_void,
.read = perf_swevent_read,
- .unthrottle = perf_swevent_unthrottle,
+ .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
};
/*
@@ -4366,6 +4457,14 @@ static const struct pmu perf_ops_task_clock = {
.read = task_clock_perf_event_read,
};
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+{
+ return rcu_dereference_protected(cpuctx->swevent_hlist,
+ lockdep_is_held(&cpuctx->hlist_mutex));
+}
+
static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
{
struct swevent_hlist *hlist;
@@ -4376,12 +4475,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
{
- struct swevent_hlist *hlist;
+ struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
- if (!cpuctx->swevent_hlist)
+ if (!hlist)
return;
- hlist = cpuctx->swevent_hlist;
rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
}
@@ -4418,7 +4516,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
mutex_lock(&cpuctx->hlist_mutex);
- if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+ if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
struct swevent_hlist *hlist;
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4467,10 +4565,48 @@ static int swevent_hlist_get(struct perf_event *event)
#ifdef CONFIG_EVENT_TRACING
-void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
- int entry_size, struct pt_regs *regs)
+static const struct pmu perf_ops_tracepoint = {
+ .enable = perf_trace_enable,
+ .disable = perf_trace_disable,
+ .start = perf_swevent_int,
+ .stop = perf_swevent_void,
+ .read = perf_swevent_read,
+ .unthrottle = perf_swevent_void,
+};
+
+static int perf_tp_filter_match(struct perf_event *event,
+ struct perf_sample_data *data)
+{
+ void *record = data->raw->data;
+
+ if (likely(!event->filter) || filter_match_preds(event->filter, record))
+ return 1;
+ return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ /*
+ * All tracepoints are from kernel-space.
+ */
+ if (event->attr.exclude_kernel)
+ return 0;
+
+ if (!perf_tp_filter_match(event, data))
+ return 0;
+
+ return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+ struct pt_regs *regs, struct hlist_head *head)
{
struct perf_sample_data data;
+ struct perf_event *event;
+ struct hlist_node *node;
+
struct perf_raw_record raw = {
.size = entry_size,
.data = record,
@@ -4479,26 +4615,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
perf_sample_data_init(&data, addr);
data.raw = &raw;
- /* Trace events already protected against recursion */
- do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
- &data, regs);
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_add(event, count, 1, &data, regs);
+ }
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data)
-{
- void *record = data->raw->data;
-
- if (likely(!event->filter) || filter_match_preds(event->filter, record))
- return 1;
- return 0;
-}
-
static void tp_perf_event_destroy(struct perf_event *event)
{
- perf_trace_disable(event->attr.config);
- swevent_hlist_put(event);
+ perf_trace_destroy(event);
}
static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4514,17 +4642,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (perf_trace_enable(event->attr.config))
+ err = perf_trace_init(event);
+ if (err)
return NULL;
event->destroy = tp_perf_event_destroy;
- err = swevent_hlist_get(event);
- if (err) {
- perf_trace_disable(event->attr.config);
- return ERR_PTR(err);
- }
- return &perf_ops_generic;
+ return &perf_ops_tracepoint;
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4552,12 +4676,6 @@ static void perf_event_free_filter(struct perf_event *event)
#else
-static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data)
-{
- return 1;
-}
-
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
return NULL;
@@ -4886,54 +5004,53 @@ err_size:
goto out;
}
-static int perf_event_set_output(struct perf_event *event, int output_fd)
+static int
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct perf_event *output_event = NULL;
- struct file *output_file = NULL;
- struct perf_event *old_output;
- int fput_needed = 0;
+ struct perf_mmap_data *data = NULL, *old_data = NULL;
int ret = -EINVAL;
- if (!output_fd)
+ if (!output_event)
goto set;
- output_file = fget_light(output_fd, &fput_needed);
- if (!output_file)
- return -EBADF;
-
- if (output_file->f_op != &perf_fops)
+ /* don't allow circular references */
+ if (event == output_event)
goto out;
- output_event = output_file->private_data;
-
- /* Don't chain output fds */
- if (output_event->output)
+ /*
+ * Don't allow cross-cpu buffers
+ */
+ if (output_event->cpu != event->cpu)
goto out;
- /* Don't set an output fd when we already have an output channel */
- if (event->data)
+ /*
+ * If it's not a per-cpu buffer, it must be the same task.
+ */
+ if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;
- atomic_long_inc(&output_file->f_count);
-
set:
mutex_lock(&event->mmap_mutex);
- old_output = event->output;
- rcu_assign_pointer(event->output, output_event);
- mutex_unlock(&event->mmap_mutex);
+ /* Can't redirect output if we've got an active mmap() */
+ if (atomic_read(&event->mmap_count))
+ goto unlock;
- if (old_output) {
- /*
- * we need to make sure no existing perf_output_*()
- * is still referencing this event.
- */
- synchronize_rcu();
- fput(old_output->filp);
+ if (output_event) {
+ /* get the buffer we want to redirect to */
+ data = perf_mmap_data_get(output_event);
+ if (!data)
+ goto unlock;
}
+ old_data = event->data;
+ rcu_assign_pointer(event->data, data);
ret = 0;
+unlock:
+ mutex_unlock(&event->mmap_mutex);
+
+ if (old_data)
+ perf_mmap_data_put(old_data);
out:
- fput_light(output_file, fput_needed);
return ret;
}
@@ -4949,13 +5066,13 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
- struct perf_event *event, *group_leader;
+ struct perf_event *event, *group_leader = NULL, *output_event = NULL;
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
struct file *group_file = NULL;
+ int event_fd;
int fput_needed = 0;
- int fput_needed2 = 0;
int err;
/* for future expandability... */
@@ -4976,26 +5093,38 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ event_fd = get_unused_fd_flags(O_RDWR);
+ if (event_fd < 0)
+ return event_fd;
+
/*
* Get the target context (task or percpu):
*/
ctx = find_get_context(pid, cpu);
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto err_fd;
+ }
+
+ if (group_fd != -1) {
+ group_leader = perf_fget_light(group_fd, &fput_needed);
+ if (IS_ERR(group_leader)) {
+ err = PTR_ERR(group_leader);
+ goto err_put_context;
+ }
+ group_file = group_leader->filp;
+ if (flags & PERF_FLAG_FD_OUTPUT)
+ output_event = group_leader;
+ if (flags & PERF_FLAG_FD_NO_GROUP)
+ group_leader = NULL;
+ }
/*
* Look up the group leader (we will attach this event to it):
*/
- group_leader = NULL;
- if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+ if (group_leader) {
err = -EINVAL;
- group_file = fget_light(group_fd, &fput_needed);
- if (!group_file)
- goto err_put_context;
- if (group_file->f_op != &perf_fops)
- goto err_put_context;
- group_leader = group_file->private_data;
/*
* Do not allow a recursive hierarchy (this new sibling
* becoming part of another group-sibling):
@@ -5017,22 +5146,21 @@ SYSCALL_DEFINE5(perf_event_open,
event = perf_event_alloc(&attr, cpu, ctx, group_leader,
NULL, NULL, GFP_KERNEL);
- err = PTR_ERR(event);
- if (IS_ERR(event))
+ if (IS_ERR(event)) {
+ err = PTR_ERR(event);
goto err_put_context;
+ }
- err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
- if (err < 0)
- goto err_free_put_context;
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_free_put_context;
+ }
- event_file = fget_light(err, &fput_needed2);
- if (!event_file)
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
goto err_free_put_context;
-
- if (flags & PERF_FLAG_FD_OUTPUT) {
- err = perf_event_set_output(event, group_fd);
- if (err)
- goto err_fput_free_put_context;
}
event->filp = event_file;
@@ -5048,19 +5176,23 @@ SYSCALL_DEFINE5(perf_event_open,
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
-err_fput_free_put_context:
- fput_light(event_file, fput_needed2);
+ /*
+ * Drop the reference on the group_event after placing the
+ * new event on the sibling_list. This ensures destruction
+ * of the group leader will find the pointer to itself in
+ * perf_group_detach().
+ */
+ fput_light(group_file, fput_needed);
+ fd_install(event_fd, event_file);
+ return event_fd;
err_free_put_context:
- if (err < 0)
- free_event(event);
-
+ free_event(event);
err_put_context:
- if (err < 0)
- put_ctx(ctx);
-
fput_light(group_file, fput_needed);
-
+ put_ctx(ctx);
+err_fd:
+ put_unused_fd(event_fd);
return err;
}
@@ -5371,6 +5503,7 @@ static void perf_free_event(struct perf_event *event,
fput(parent->filp);
+ perf_group_detach(event);
list_del_event(event, ctx);
free_event(event);
}
diff --git a/kernel/pid.c b/kernel/pid.c
index aebb30d9c233..e9fd8c132d26 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -513,6 +513,13 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
+ /* bump default and minimum pid_max based on number of cpus */
+ pid_max = min(pid_max_max, max_t(int, pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+ pid_max_min = max_t(int, pid_max_min,
+ PIDS_PER_CPU_MIN * num_possible_cpus());
+ pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+
init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index f42d3f737a33..996a4dec5f96 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -48,59 +48,49 @@
* or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
* held, taken with _irqsave. One lock to rule them all
*/
-struct pm_qos_request_list {
- struct list_head list;
- union {
- s32 value;
- s32 usec;
- s32 kbps;
- };
- int pm_qos_class;
+enum pm_qos_type {
+ PM_QOS_MAX, /* return the largest value */
+ PM_QOS_MIN /* return the smallest value */
};
-static s32 max_compare(s32 v1, s32 v2);
-static s32 min_compare(s32 v1, s32 v2);
-
struct pm_qos_object {
- struct pm_qos_request_list requests;
+ struct plist_head requests;
struct blocking_notifier_head *notifiers;
struct miscdevice pm_qos_power_miscdev;
char *name;
s32 default_value;
- atomic_t target_value;
- s32 (*comparitor)(s32, s32);
+ enum pm_qos_type type;
};
+static DEFINE_SPINLOCK(pm_qos_lock);
+
static struct pm_qos_object null_pm_qos;
static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
static struct pm_qos_object cpu_dma_pm_qos = {
- .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
+ .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
.notifiers = &cpu_dma_lat_notifier,
.name = "cpu_dma_latency",
.default_value = 2000 * USEC_PER_SEC,
- .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
- .comparitor = min_compare
+ .type = PM_QOS_MIN,
};
static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
static struct pm_qos_object network_lat_pm_qos = {
- .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
+ .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
.notifiers = &network_lat_notifier,
.name = "network_latency",
.default_value = 2000 * USEC_PER_SEC,
- .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
- .comparitor = min_compare
+ .type = PM_QOS_MIN
};
static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
static struct pm_qos_object network_throughput_pm_qos = {
- .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
+ .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
.notifiers = &network_throughput_notifier,
.name = "network_throughput",
.default_value = 0,
- .target_value = ATOMIC_INIT(0),
- .comparitor = max_compare
+ .type = PM_QOS_MAX,
};
@@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
&network_throughput_pm_qos
};
-static DEFINE_SPINLOCK(pm_qos_lock);
-
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos);
static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = {
.release = pm_qos_power_release,
};
-/* static helper functions */
-static s32 max_compare(s32 v1, s32 v2)
+/* unlocked internal variant: callers in this file hold pm_qos_lock */
+static inline int pm_qos_get_value(struct pm_qos_object *o)
{
- return max(v1, v2);
-}
+ if (plist_head_empty(&o->requests))
+ return o->default_value;
-static s32 min_compare(s32 v1, s32 v2)
-{
- return min(v1, v2);
-}
+ switch (o->type) {
+ case PM_QOS_MIN:
+ return plist_first(&o->requests)->prio; /* plist is ascending: first is smallest */
+ case PM_QOS_MAX:
+ return plist_last(&o->requests)->prio; /* plist is ascending: last is largest */
-static void update_target(int pm_qos_class)
+ default:
+ /* runtime check: o->type is not a valid enum pm_qos_type value */
+ BUG();
+ }
+}
+
+static void update_target(struct pm_qos_object *o, struct plist_node *node,
+ int del, int value)
{
- s32 extreme_value;
- struct pm_qos_request_list *node;
unsigned long flags;
- int call_notifier = 0;
+ int prev_value, curr_value;
spin_lock_irqsave(&pm_qos_lock, flags);
- extreme_value = pm_qos_array[pm_qos_class]->default_value;
- list_for_each_entry(node,
- &pm_qos_array[pm_qos_class]->requests.list, list) {
- extreme_value = pm_qos_array[pm_qos_class]->comparitor(
- extreme_value, node->value);
- }
- if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
- extreme_value) {
- call_notifier = 1;
- atomic_set(&pm_qos_array[pm_qos_class]->target_value,
- extreme_value);
- pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
- atomic_read(&pm_qos_array[pm_qos_class]->target_value));
+ prev_value = pm_qos_get_value(o);
+ /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
+ if (value != PM_QOS_DEFAULT_VALUE) {
+ /*
+ * to change the list, we atomically remove, reinit
+ * with new value and add, then see if the extremal
+ * changed
+ */
+ plist_del(node, &o->requests);
+ plist_node_init(node, value);
+ plist_add(node, &o->requests);
+ } else if (del) {
+ plist_del(node, &o->requests);
+ } else {
+ plist_add(node, &o->requests);
}
+ curr_value = pm_qos_get_value(o);
spin_unlock_irqrestore(&pm_qos_lock, flags);
- if (call_notifier)
- blocking_notifier_call_chain(
- pm_qos_array[pm_qos_class]->notifiers,
- (unsigned long) extreme_value, NULL);
+ if (prev_value != curr_value)
+ blocking_notifier_call_chain(o->notifiers,
+ (unsigned long)curr_value,
+ NULL);
}
static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -196,10 +193,23 @@ static int find_pm_qos_object_by_minor(int minor)
*/
int pm_qos_request(int pm_qos_class)
{
- return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
+ unsigned long flags;
+ int value;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ return value;
}
EXPORT_SYMBOL_GPL(pm_qos_request);
+int pm_qos_request_active(struct pm_qos_request_list *req)
+{
+ return req->pm_qos_class != 0;
+}
+EXPORT_SYMBOL_GPL(pm_qos_request_active);
+
/**
* pm_qos_add_request - inserts new qos request into the list
* @pm_qos_class: identifies which list of qos request to us
@@ -211,27 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request);
* element as a handle for use in updating and removal. Call needs to save
* this handle for later use.
*/
-struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
+void pm_qos_add_request(struct pm_qos_request_list *dep,
+ int pm_qos_class, s32 value)
{
- struct pm_qos_request_list *dep;
- unsigned long flags;
+ struct pm_qos_object *o = pm_qos_array[pm_qos_class];
+ int new_value;
- dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
- if (dep) {
- if (value == PM_QOS_DEFAULT_VALUE)
- dep->value = pm_qos_array[pm_qos_class]->default_value;
- else
- dep->value = value;
- dep->pm_qos_class = pm_qos_class;
-
- spin_lock_irqsave(&pm_qos_lock, flags);
- list_add(&dep->list,
- &pm_qos_array[pm_qos_class]->requests.list);
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- update_target(pm_qos_class);
+ if (pm_qos_request_active(dep)) {
+ WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
+ return;
}
-
- return dep;
+ if (value == PM_QOS_DEFAULT_VALUE)
+ new_value = o->default_value;
+ else
+ new_value = value;
+ plist_node_init(&dep->list, new_value);
+ dep->pm_qos_class = pm_qos_class;
+ update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
}
EXPORT_SYMBOL_GPL(pm_qos_add_request);
@@ -246,27 +252,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
* Attempts are made to make this code callable on hot code paths.
*/
void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
- s32 new_value)
+ s32 new_value)
{
- unsigned long flags;
- int pending_update = 0;
s32 temp;
+ struct pm_qos_object *o;
+
+ if (!pm_qos_req) /*guard against callers passing in null */
+ return;
- if (pm_qos_req) { /*guard against callers passing in null */
- spin_lock_irqsave(&pm_qos_lock, flags);
- if (new_value == PM_QOS_DEFAULT_VALUE)
- temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
- else
- temp = new_value;
-
- if (temp != pm_qos_req->value) {
- pending_update = 1;
- pm_qos_req->value = temp;
- }
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- if (pending_update)
- update_target(pm_qos_req->pm_qos_class);
+ if (!pm_qos_request_active(pm_qos_req)) {
+ WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+ return;
}
+
+ o = pm_qos_array[pm_qos_req->pm_qos_class];
+
+ if (new_value == PM_QOS_DEFAULT_VALUE)
+ temp = o->default_value;
+ else
+ temp = new_value;
+
+ if (temp != pm_qos_req->list.prio)
+ update_target(o, &pm_qos_req->list, 0, temp);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
@@ -280,19 +287,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request);
*/
void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
{
- unsigned long flags;
- int qos_class;
+ struct pm_qos_object *o;
if (pm_qos_req == NULL)
return;
/* silent return to keep pcm code cleaner */
- qos_class = pm_qos_req->pm_qos_class;
- spin_lock_irqsave(&pm_qos_lock, flags);
- list_del(&pm_qos_req->list);
- kfree(pm_qos_req);
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- update_target(qos_class);
+ if (!pm_qos_request_active(pm_qos_req)) {
+ WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+ return;
+ }
+
+ o = pm_qos_array[pm_qos_req->pm_qos_class];
+ update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
+ memset(pm_qos_req, 0, sizeof(*pm_qos_req));
}
EXPORT_SYMBOL_GPL(pm_qos_remove_request);
@@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
if (pm_qos_class >= 0) {
- filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
- PM_QOS_DEFAULT_VALUE);
+ struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+ /* req is stored in the file handle; pm_qos_power_release() frees it */
+ pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
+ filp->private_data = req;
if (filp->private_data)
return 0;
@@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
{
struct pm_qos_request_list *req;
- req = (struct pm_qos_request_list *)filp->private_data;
+ req = filp->private_data;
pm_qos_remove_request(req);
+ kfree(req);
return 0;
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 00bb252f29a2..9829646d399c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -363,7 +363,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
}
} else {
read_lock(&tasklist_lock);
- if (thread_group_leader(p) && p->signal) {
+ if (thread_group_leader(p) && p->sighand) {
error =
cpu_clock_sample_group(which_clock,
p, &rtn);
@@ -439,7 +439,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
if (likely(p != NULL)) {
read_lock(&tasklist_lock);
- if (unlikely(p->signal == NULL)) {
+ if (unlikely(p->sighand == NULL)) {
/*
* We raced with the reaping of the task.
* The deletion should have cleared us off the list.
@@ -691,10 +691,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
read_lock(&tasklist_lock);
/*
* We need the tasklist_lock to protect against reaping that
- * clears p->signal. If p has just been reaped, we can no
+ * clears p->sighand. If p has just been reaped, we can no
* longer get any information about it at all.
*/
- if (unlikely(p->signal == NULL)) {
+ if (unlikely(p->sighand == NULL)) {
read_unlock(&tasklist_lock);
put_task_struct(p);
timer->it.cpu.task = NULL;
@@ -863,7 +863,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
clear_dead = p->exit_state;
} else {
read_lock(&tasklist_lock);
- if (unlikely(p->signal == NULL)) {
+ if (unlikely(p->sighand == NULL)) {
/*
* The process has been reaped.
* We can't even collect a sample any more.
@@ -1199,7 +1199,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
spin_lock(&p->sighand->siglock);
} else {
read_lock(&tasklist_lock);
- if (unlikely(p->signal == NULL)) {
+ if (unlikely(p->sighand == NULL)) {
/*
* The process has been reaped.
* We can't even collect a sample any more.
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 00d1fda58ab6..ad723420acc3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -559,14 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->it_overrun = -1;
- error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
- if (error)
- goto out;
- /*
- * return the timer_id now. The next step is hard to
- * back out if there is an error.
- */
if (copy_to_user(created_timer_id,
&new_timer_id, sizeof (new_timer_id))) {
error = -EFAULT;
@@ -597,6 +590,10 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->sigq->info.si_tid = new_timer->it_id;
new_timer->sigq->info.si_code = SI_TIMER;
+ error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
+ if (error)
+ goto out;
+
spin_lock_irq(&current->sighand->siglock);
new_timer->it_signal = current->signal;
list_add(&new_timer->list, &current->signal->posix_timers);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5c36ea9d55d2..ca6066a6952e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG
depends on PM_ADVANCED_DEBUG
default n
+config SUSPEND_NVS
+ bool
+
config SUSPEND
bool "Suspend to RAM and standby"
depends on PM && ARCH_SUSPEND_POSSIBLE
+ select SUSPEND_NVS if HAS_IOMEM
default y
---help---
Allow the system to enter sleep states in which main memory is
@@ -130,13 +134,10 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
-config HIBERNATION_NVS
- bool
-
config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
- select HIBERNATION_NVS if HAS_IOMEM
+ select SUSPEND_NVS if HAS_IOMEM
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 524e058dcf06..f9063c6b185d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -10,6 +10,6 @@ obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
block_io.o
-obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
+obj-$(CONFIG_SUSPEND_NVS) += nvs.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..d26f04e92743 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
goto Enable_irqs;
}
- if (hibernation_test(TEST_CORE))
+ if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
goto Power_up;
in_suspend = 1;
@@ -288,8 +288,10 @@ static int create_image(int platform_mode)
error);
/* Restore control flow magically appears here */
restore_processor_state();
- if (!in_suspend)
+ if (!in_suspend) {
+ events_check_enabled = false;
platform_leave(platform_mode);
+ }
Power_up:
sysdev_resume();
@@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode)
error = platform_begin(platform_mode);
if (error)
- return error;
+ goto Close;
/* Preallocate image memory before shutting down devices. */
error = hibernate_preallocate_memory();
@@ -511,18 +513,24 @@ int hibernation_platform_enter(void)
local_irq_disable();
sysdev_suspend(PMSG_HIBERNATE);
+ if (!pm_check_wakeup_events()) {
+ error = -EAGAIN;
+ goto Power_up;
+ }
+
hibernation_ops->enter();
/* We should never get here */
while (1);
- /*
- * We don't need to reenable the nonboot CPUs or resume consoles, since
- * the system is going to be halted anyway.
- */
+ Power_up:
+ sysdev_resume();
+ local_irq_enable();
+ enable_nonboot_cpus();
+
Platform_finish:
hibernation_ops->finish();
- dpm_suspend_noirq(PMSG_RESTORE);
+ dpm_resume_noirq(PMSG_RESTORE);
Resume_devices:
entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..62b0bc6e4983 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(state);
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The 'wakeup_count' attribute, along with the functions defined in
+ * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
+ * handled in a non-racy way.
+ *
+ * If a wakeup event occurs when the system is in a sleep state, it simply is
+ * woken up. In turn, if an event that would wake the system up from a sleep
+ * state occurs when it is undergoing a transition to that sleep state, the
+ * transition should be aborted. Moreover, if such an event occurs when the
+ * system is in the working state, an attempt to start a transition to the
+ * given sleep state should fail during certain period after the detection of
+ * the event. Using the 'state' attribute alone is not sufficient to satisfy
+ * these requirements, because a wakeup event may occur exactly when 'state'
+ * is being written to and may be delivered to user space right before it is
+ * frozen, so the event will remain only partially processed until the system is
+ * woken up by another event. In particular, it won't cause the transition to
+ * a sleep state to be aborted.
+ *
+ * This difficulty may be overcome if user space uses 'wakeup_count' before
+ * writing to 'state'. It first should read from 'wakeup_count' and store
+ * the read value. Then, after carrying out its own preparations for the system
+ * transition to a sleep state, it should write the stored value to
+ * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
+ * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
+ * is allowed to write to 'state', but the transition will be aborted if there
+ * are any wakeup events detected after 'wakeup_count' was written to.
+ */
+
+static ssize_t wakeup_count_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ unsigned long val;
+
+ return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
+}
+
+static ssize_t wakeup_count_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (sscanf(buf, "%lu", &val) == 1) {
+ if (pm_save_wakeup_count(val))
+ return n;
+ }
+ return -EINVAL;
+}
+
+power_attr(wakeup_count);
+#endif /* CONFIG_PM_SLEEP */
+
#ifdef CONFIG_PM_TRACE
int pm_trace_enabled;
@@ -236,6 +290,7 @@ static struct attribute * g[] = {
#endif
#ifdef CONFIG_PM_SLEEP
&pm_async_attr.attr,
+ &wakeup_count_attr.attr,
#ifdef CONFIG_PM_DEBUG
&pm_test_attr.attr,
#endif
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c
index fdcad9ed5a7b..1836db60bbb6 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/nvs.c
@@ -15,7 +15,7 @@
/*
* Platforms, like ACPI, may want us to save some memory used by them during
- * hibernation and to restore the contents of this memory during the subsequent
+ * suspend and to restore the contents of this memory during the subsequent
* resume. The code below implements a mechanism allowing us to do that.
*/
@@ -30,7 +30,7 @@ struct nvs_page {
static LIST_HEAD(nvs_list);
/**
- * hibernate_nvs_register - register platform NVS memory region to save
+ * suspend_nvs_register - register platform NVS memory region to save
* @start - physical address of the region
* @size - size of the region
*
@@ -38,7 +38,7 @@ static LIST_HEAD(nvs_list);
* things so that the data from page-aligned addresses in this region will
* be copied into separate RAM pages.
*/
-int hibernate_nvs_register(unsigned long start, unsigned long size)
+int suspend_nvs_register(unsigned long start, unsigned long size)
{
struct nvs_page *entry, *next;
@@ -68,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size)
}
/**
- * hibernate_nvs_free - free data pages allocated for saving NVS regions
+ * suspend_nvs_free - free data pages allocated for saving NVS regions
*/
-void hibernate_nvs_free(void)
+void suspend_nvs_free(void)
{
struct nvs_page *entry;
@@ -86,16 +86,16 @@ void hibernate_nvs_free(void)
}
/**
- * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
+ * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
*/
-int hibernate_nvs_alloc(void)
+int suspend_nvs_alloc(void)
{
struct nvs_page *entry;
list_for_each_entry(entry, &nvs_list, node) {
entry->data = (void *)__get_free_page(GFP_KERNEL);
if (!entry->data) {
- hibernate_nvs_free();
+ suspend_nvs_free();
return -ENOMEM;
}
}
@@ -103,9 +103,9 @@ int hibernate_nvs_alloc(void)
}
/**
- * hibernate_nvs_save - save NVS memory regions
+ * suspend_nvs_save - save NVS memory regions
*/
-void hibernate_nvs_save(void)
+void suspend_nvs_save(void)
{
struct nvs_page *entry;
@@ -119,12 +119,12 @@ void hibernate_nvs_save(void)
}
/**
- * hibernate_nvs_restore - restore NVS memory regions
+ * suspend_nvs_restore - restore NVS memory regions
*
* This function is going to be called with interrupts disabled, so it
* cannot iounmap the virtual addresses used to access the NVS region.
*/
-void hibernate_nvs_restore(void)
+void suspend_nvs_restore(void)
{
struct nvs_page *entry;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 56e7dbb8b996..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -16,6 +16,12 @@
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/gfp.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
#include "power.h"
@@ -130,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
if (suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
- return error;
+ goto Platform_finish;
}
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
- goto Platfrom_finish;
+ goto Platform_finish;
}
if (suspend_ops->prepare_late) {
error = suspend_ops->prepare_late();
if (error)
- goto Power_up_devices;
+ goto Platform_wake;
}
if (suspend_test(TEST_PLATFORM))
@@ -157,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
error = sysdev_suspend(PMSG_SUSPEND);
if (!error) {
- if (!suspend_test(TEST_CORE))
+ if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
error = suspend_ops->enter(state);
+ events_check_enabled = false;
+ }
sysdev_resume();
}
@@ -172,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
if (suspend_ops->wake)
suspend_ops->wake();
- Power_up_devices:
dpm_resume_noirq(PMSG_RESUME);
- Platfrom_finish:
+ Platform_finish:
if (suspend_ops->finish)
suspend_ops->finish();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b0bb21778391..7c3ae83e41d7 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -32,7 +32,7 @@
/*
* The swap map is a data structure used for keeping track of each page
* written to a swap partition. It consists of many swap_map_page
- * structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
* These structures are stored on the swap and linked together with the
* help of the .next_swap member.
*
@@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap)
/**
* free_all_swap_pages - free swap pages allocated for saving image data.
- * It also frees the extents used to register which swap entres had been
+ * It also frees the extents used to register which swap entries had been
* allocated.
*/
diff --git a/kernel/profile.c b/kernel/profile.c
index dfadc5b729f1..b22a899934cc 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -365,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- node = cpu_to_node(cpu);
+ node = cpu_to_mem(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
- return NOTIFY_BAD;
+ return notifier_from_errno(-ENOMEM);
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
@@ -388,7 +388,7 @@ out_free:
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
per_cpu(cpu_profile_hits, cpu)[1] = NULL;
__free_page(page);
- return NOTIFY_BAD;
+ return notifier_from_errno(-ENOMEM);
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
if (prof_cpu_mask != NULL)
@@ -567,7 +567,7 @@ static int create_hash_tables(void)
int cpu;
for_each_online_cpu(cpu) {
- int node = cpu_to_node(cpu);
+ int node = cpu_to_mem(cpu);
struct page *page;
page = alloc_pages_exact_node(node,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6af9cdd558b7..74a3d693c196 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -594,6 +594,32 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_detach(child, data);
break;
+#ifdef CONFIG_BINFMT_ELF_FDPIC
+ case PTRACE_GETFDPIC: {
+ struct mm_struct *mm = get_task_mm(child);
+ unsigned long tmp = 0;
+
+ ret = -ESRCH;
+ if (!mm)
+ break;
+
+ switch (addr) {
+ case PTRACE_GETFDPIC_EXEC:
+ tmp = mm->context.exec_fdpic_loadmap;
+ break;
+ case PTRACE_GETFDPIC_INTERP:
+ tmp = mm->context.interp_fdpic_loadmap;
+ break;
+ default:
+ break;
+ }
+ mmput(mm);
+
+ ret = put_user(tmp, (unsigned long __user *) data);
+ break;
+ }
+#endif
+
#ifdef PTRACE_SINGLESTEP
case PTRACE_SINGLESTEP:
#endif
diff --git a/kernel/relay.c b/kernel/relay.c
index 4268287148c1..c7cf397fb929 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
"relay_hotcpu_callback: cpu %d buffer "
"creation failed\n", hotcpu);
mutex_unlock(&relay_channels_mutex);
- return NOTIFY_BAD;
+ return notifier_from_errno(-ENOMEM);
}
}
mutex_unlock(&relay_channels_mutex);
diff --git a/kernel/sched.c b/kernel/sched.c
index 054a6012de99..f52a8801b7a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -306,52 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
*/
struct task_group init_task_group;
-/* return group to which a task belongs */
-static inline struct task_group *task_group(struct task_struct *p)
-{
- struct task_group *tg;
-
-#ifdef CONFIG_CGROUP_SCHED
- tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
- struct task_group, css);
-#else
- tg = &init_task_group;
-#endif
- return tg;
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
- /*
- * Strictly speaking this rcu_read_lock() is not needed since the
- * task_group is tied to the cgroup, which in turn can never go away
- * as long as there are tasks attached to it.
- *
- * However since task_group() uses task_subsys_state() which is an
- * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
- */
- rcu_read_lock();
-#ifdef CONFIG_FAIR_GROUP_SCHED
- p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
- p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
- p->rt.rt_rq = task_group(p)->rt_rq[cpu];
- p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
- rcu_read_unlock();
-}
-
-#else
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
- return NULL;
-}
-
#endif /* CONFIG_CGROUP_SCHED */
/* CFS-related fields in a runqueue */
@@ -544,6 +498,8 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
+ unsigned long cpu_power;
+
unsigned char idle_at_tick;
/* For active balancing */
int post_schedule;
@@ -642,6 +598,49 @@ static inline int cpu_of(struct rq *rq)
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this task belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification
+ * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * holds that lock for each task it moves into the cgroup. Therefore
+ * by holding that lock, we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ struct cgroup_subsys_state *css;
+
+ css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+ lockdep_is_held(&task_rq(p)->lock));
+ return container_of(css, struct task_group, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+ p->se.parent = task_group(p)->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ p->rt.rt_rq = task_group(p)->rt_rq[cpu];
+ p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
inline void update_rq_clock(struct rq *rq)
{
if (!rq->skip_clock_update)
@@ -969,14 +968,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
-void task_rq_unlock_wait(struct task_struct *p)
-{
- struct rq *rq = task_rq(p);
-
- smp_mb(); /* spin-unlock-wait is not a full memory barrier */
- raw_spin_unlock_wait(&rq->lock);
-}
-
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
@@ -1263,6 +1254,12 @@ static void sched_avg_update(struct rq *rq)
s64 period = sched_avg_period();
while ((s64)(rq->clock - rq->age_stamp) > period) {
+ /*
+ * Inline assembly required to prevent the compiler
+ * optimising this loop into a divmod call.
+ * See __iter_div_u64_rem() for another example of this.
+ */
+ asm("" : "+rm" (rq->age_stamp));
rq->age_stamp += period;
rq->rt_avg /= 2;
}
@@ -1507,24 +1504,9 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static struct sched_group *group_of(int cpu)
-{
- struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
-
- if (!sd)
- return NULL;
-
- return sd->groups;
-}
-
static unsigned long power_of(int cpu)
{
- struct sched_group *group = group_of(cpu);
-
- if (!group)
- return SCHED_LOAD_SCALE;
-
- return group->cpu_power;
+ return cpu_rq(cpu)->cpu_power;
}
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1681,9 +1663,6 @@ static void update_shares(struct sched_domain *sd)
static void update_h_load(long cpu)
{
- if (root_task_group_empty())
- return;
-
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
@@ -1862,8 +1841,8 @@ static void dec_nr_running(struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
if (task_has_rt_policy(p)) {
- p->se.load.weight = prio_to_weight[0] * 2;
- p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+ p->se.load.weight = 0;
+ p->se.load.inv_weight = WMULT_CONST;
return;
}
@@ -2515,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
+ /*
+ * The child is not yet in the pid-hash so no cgroup attach races,
+ * and the cgroup is pinned to this child because cgroup_fork()
+ * runs before sched_fork().
+ *
+ * Silence PROVE_RCU.
+ */
+ rcu_read_lock();
set_task_cpu(p, cpu);
+ rcu_read_unlock();
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
@@ -2885,9 +2873,9 @@ unsigned long nr_iowait(void)
return sum;
}
-unsigned long nr_iowait_cpu(void)
+unsigned long nr_iowait_cpu(int cpu)
{
- struct rq *this = this_rq();
+ struct rq *this = cpu_rq(cpu);
return atomic_read(&this->nr_iowait);
}
@@ -4062,6 +4050,23 @@ int __sched wait_for_completion_killable(struct completion *x)
EXPORT_SYMBOL(wait_for_completion_killable);
/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ */
+unsigned long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
* try_wait_for_completion - try to decrement a completion without blocking
* @x: completion structure
*
@@ -4469,16 +4474,6 @@ recheck:
}
if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0)
- return -EPERM;
-#endif
-
retval = security_task_setscheduler(p, policy, param);
if (retval)
return retval;
@@ -4494,6 +4489,22 @@ recheck:
* runqueue lock must be held.
*/
rq = __task_rq_lock(p);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (user) {
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0) {
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return -EPERM;
+ }
+ }
+#endif
+
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
@@ -7596,6 +7607,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
+ rq->cpu_power = SCHED_LOAD_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87a330a7185f..35565395d00d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -381,15 +381,9 @@ __initcall(init_sched_debug_procfs);
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
- unsigned long flags;
- int num_threads = 1;
-
- if (lock_task_sighand(p, &flags)) {
- num_threads = atomic_read(&p->signal->count);
- unlock_task_sighand(p, &flags);
- }
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
+ get_nr_threads(p));
SEQ_printf(m,
"---------------------------------------------------------\n");
#define __P(F) \
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 217e4a9393e4..a878b5332daa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
- unsigned int imbalance;
struct task_group *tg;
unsigned long weight;
int balanced;
@@ -1241,6 +1240,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* effect of the currently running task from the load
* of the current CPU:
*/
+ rcu_read_lock();
if (sync) {
tg = task_group(current);
weight = current->se.load.weight;
@@ -1252,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
tg = task_group(p);
weight = p->se.load.weight;
- imbalance = 100 + (sd->imbalance_pct - 100) / 2;
-
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped this_load to 0, we'll
@@ -1263,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- balanced = !this_load ||
- 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
- imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
+ if (this_load) {
+ unsigned long this_eff_load, prev_eff_load;
+
+ this_eff_load = 100;
+ this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= this_load +
+ effective_load(tg, this_cpu, weight, weight);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+
+ balanced = this_eff_load <= prev_eff_load;
+ } else
+ balanced = true;
+ rcu_read_unlock();
/*
* If the currently running task will sleep within
@@ -2298,6 +2309,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
if (!power)
power = 1;
+ cpu_rq(cpu)->cpu_power = power;
sdg->cpu_power = power;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 825a3f24ad76..bded65187780 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,12 +637,12 @@ static inline bool si_fromuser(const struct siginfo *info)
/*
* Bad permissions for sending the signal
- * - the caller must hold at least the RCU read lock
+ * - the caller must hold the RCU read lock
*/
static int check_kill_permission(int sig, struct siginfo *info,
struct task_struct *t)
{
- const struct cred *cred = current_cred(), *tcred;
+ const struct cred *cred, *tcred;
struct pid *sid;
int error;
@@ -656,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info,
if (error)
return error;
+ cred = current_cred();
tcred = __task_cred(t);
- if ((cred->euid ^ tcred->suid) &&
+ if (!same_thread_group(current, t) &&
+ (cred->euid ^ tcred->suid) &&
(cred->euid ^ tcred->uid) &&
(cred->uid ^ tcred->suid) &&
(cred->uid ^ tcred->uid) &&
@@ -1083,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
/*
* Nuke all other threads in the group.
*/
-void zap_other_threads(struct task_struct *p)
+int zap_other_threads(struct task_struct *p)
{
- struct task_struct *t;
+ struct task_struct *t = p;
+ int count = 0;
p->signal->group_stop_count = 0;
- for (t = next_thread(p); t != p; t = next_thread(t)) {
- /*
- * Don't bother with already dead threads
- */
+ while_each_thread(p, t) {
+ count++;
+
+ /* Don't bother with already dead threads */
if (t->exit_state)
continue;
-
- /* SIGKILL will be handled before any pending SIGSTOP */
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
}
+
+ return count;
}
struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
@@ -1124,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
/*
* send signal info to all the members of a group
- * - the caller must hold the RCU read lock at least
*/
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
- int ret = check_kill_permission(sig, info, p);
+ int ret;
+
+ rcu_read_lock();
+ ret = check_kill_permission(sig, info, p);
+ rcu_read_unlock();
if (!ret && sig)
ret = do_send_sig_info(sig, info, p, true);
diff --git a/kernel/smp.c b/kernel/smp.c
index 3fc697336183..75c970c715d3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_UP_PREPARE_FROZEN:
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
- return NOTIFY_BAD;
+ return notifier_from_errno(-ENOMEM);
break;
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0db913a5c60f..07b4f1b1a73a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -808,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
- return NOTIFY_BAD;
+ return notifier_from_errno(PTR_ERR(p));
}
kthread_bind(p, hotcpu);
per_cpu(ksoftirqd, hotcpu) = p;
@@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void)
void *cpu = (void *)(long)smp_processor_id();
int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- BUG_ON(err == NOTIFY_BAD);
+ BUG_ON(err != NOTIFY_OK);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
return 0;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4e7431e7c78..70f8d90331e9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
- case CPU_DEAD:
+ case CPU_POST_DEAD:
{
struct cpu_stop_work *work;
diff --git a/kernel/sys.c b/kernel/sys.c
index 0d36d889c74d..e83ddbbaf89d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1632,9 +1632,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
-static void argv_cleanup(char **argv, char **envp)
+static void argv_cleanup(struct subprocess_info *info)
{
- argv_free(argv);
+ argv_free(info->argv);
}
/**
@@ -1668,7 +1668,7 @@ int orderly_poweroff(bool force)
goto out;
}
- call_usermodehelper_setcleanup(info, argv_cleanup);
+ call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 997080f00e0b..d24f761f4876 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = {
},
#endif
{
- .procname = "pipe-max-pages",
- .data = &pipe_max_pages,
+ .procname = "pipe-max-size",
+ .data = &pipe_max_size,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .extra1 = &two,
+ .proc_handler = &pipe_proc_fn,
+ .extra1 = &pipe_min_size,
},
/*
* NOTE: do not add new entries to this table unless you have read
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc1c034..813993b5fb61 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now)
* Updates the per cpu time idle statistics counters
*/
static void
-update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
ktime_t delta;
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- if (nr_iowait_cpu() > 0)
+ if (nr_iowait_cpu(cpu) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
ts->idle_entrytime = now;
}
@@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- update_ts_time_stats(ts, now, NULL);
+ update_ts_time_stats(cpu, ts, now, NULL);
ts->idle_active = 0;
sched_clock_idle_wakeup_event(0);
}
-static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
{
ktime_t now;
now = ktime_get();
- update_ts_time_stats(ts, now, NULL);
+ update_ts_time_stats(cpu, ts, now, NULL);
ts->idle_entrytime = now;
ts->idle_active = 1;
@@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
if (!tick_nohz_enabled)
return -1;
- update_ts_time_stats(ts, ktime_get(), last_update_time);
+ update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
return ktime_to_us(ts->idle_sleeptime);
}
@@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
if (!tick_nohz_enabled)
return -1;
- update_ts_time_stats(ts, ktime_get(), last_update_time);
+ update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
return ktime_to_us(ts->iowait_sleeptime);
}
@@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle)
*/
ts->inidle = 1;
- now = tick_nohz_start_idle(ts);
+ now = tick_nohz_start_idle(cpu, ts);
/*
* If this cpu is offline and it is the one which updates
@@ -315,9 +315,6 @@ void tick_nohz_stop_sched_tick(int inidle)
goto end;
}
- if (nohz_ratelimit(cpu))
- goto end;
-
ts->idle_calls++;
/* Read jiffies and the time when jiffies were updated last */
do {
@@ -328,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
} while (read_seqretry(&xtime_lock, seq));
if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
- arch_needs_cpu(cpu)) {
+ arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
} else {
diff --git a/kernel/timer.c b/kernel/timer.c
index be394af5bc22..efde11e197c4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -577,6 +577,19 @@ static void __init_timer(struct timer_list *timer,
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
+void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
+ const char *name,
+ struct lock_class_key *key,
+ void (*function)(unsigned long),
+ unsigned long data)
+{
+ timer->function = function;
+ timer->data = data;
+ init_timer_on_stack_key(timer, name, key);
+ timer_set_deferrable(timer);
+}
+EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
+
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
@@ -752,11 +765,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
expires_limit = expires;
- if (timer->slack > -1)
+ if (timer->slack >= 0) {
expires_limit = expires + timer->slack;
- else if (time_after(expires, jiffies)) /* auto slack: use 0.4% */
- expires_limit = expires + (expires - jiffies)/256;
+ } else {
+ unsigned long now = jiffies;
+ /* No slack if already expired; otherwise auto slack of 0.4% */
+ if (time_after(expires, now))
+ expires_limit = expires + (expires - now)/256;
+ }
mask = expires ^ expires_limit;
if (mask == 0)
return expires;
@@ -1680,11 +1697,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
+ int err;
+
switch(action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- if (init_timers_cpu(cpu) < 0)
- return NOTIFY_BAD;
+ err = init_timers_cpu(cpu);
+ if (err < 0)
+ return notifier_from_errno(err);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
@@ -1710,7 +1730,7 @@ void __init init_timers(void)
init_timer_stats();
- BUG_ON(err == NOTIFY_BAD);
+ BUG_ON(err != NOTIFY_OK);
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3bc91a3f510..638711c17504 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -675,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
}
}
-static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_abort(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_ABORT);
}
-static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_insert(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_INSERT);
}
-static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_issue(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
}
-static void blk_add_trace_rq_requeue(struct request_queue *q,
+static void blk_add_trace_rq_requeue(void *ignore,
+ struct request_queue *q,
struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
}
-static void blk_add_trace_rq_complete(struct request_queue *q,
+static void blk_add_trace_rq_complete(void *ignore,
+ struct request_queue *q,
struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -724,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
}
-static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_bounce(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
}
-static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_complete(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
}
-static void blk_add_trace_bio_backmerge(struct request_queue *q,
+static void blk_add_trace_bio_backmerge(void *ignore,
+ struct request_queue *q,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
}
-static void blk_add_trace_bio_frontmerge(struct request_queue *q,
+static void blk_add_trace_bio_frontmerge(void *ignore,
+ struct request_queue *q,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
}
-static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_queue(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
}
-static void blk_add_trace_getrq(struct request_queue *q,
+static void blk_add_trace_getrq(void *ignore,
+ struct request_queue *q,
struct bio *bio, int rw)
{
if (bio)
@@ -765,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
}
-static void blk_add_trace_sleeprq(struct request_queue *q,
+static void blk_add_trace_sleeprq(void *ignore,
+ struct request_queue *q,
struct bio *bio, int rw)
{
if (bio)
@@ -779,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
}
}
-static void blk_add_trace_plug(struct request_queue *q)
+static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -787,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
-static void blk_add_trace_unplug_io(struct request_queue *q)
+static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -800,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
}
}
-static void blk_add_trace_unplug_timer(struct request_queue *q)
+static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -813,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
}
}
-static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+static void blk_add_trace_split(void *ignore,
+ struct request_queue *q, struct bio *bio,
unsigned int pdu)
{
struct blk_trace *bt = q->blk_trace;
@@ -829,6 +842,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
/**
* blk_add_trace_remap - Add a trace for a remap operation
+ * @ignore: trace callback data parameter (not used)
* @q: queue the io is for
* @bio: the source bio
* @dev: target device
@@ -839,8 +853,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
* it spans a stripe (or similar). Add a trace for that action.
*
**/
-static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+static void blk_add_trace_remap(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ dev_t dev, sector_t from)
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
@@ -859,6 +874,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
/**
* blk_add_trace_rq_remap - Add a trace for a request-remap operation
+ * @ignore: trace callback data parameter (not used)
* @q: queue the io is for
* @rq: the source request
* @dev: target device
@@ -869,7 +885,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
* Add a trace for that action.
*
**/
-static void blk_add_trace_rq_remap(struct request_queue *q,
+static void blk_add_trace_rq_remap(void *ignore,
+ struct request_queue *q,
struct request *rq, dev_t dev,
sector_t from)
{
@@ -921,64 +938,64 @@ static void blk_register_tracepoints(void)
{
int ret;
- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+ ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+ ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+ ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+ ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+ ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+ ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+ ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+ ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+ ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+ ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
WARN_ON(ret);
- ret = register_trace_block_getrq(blk_add_trace_getrq);
+ ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+ ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
WARN_ON(ret);
- ret = register_trace_block_plug(blk_add_trace_plug);
+ ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+ ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+ ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
WARN_ON(ret);
- ret = register_trace_block_split(blk_add_trace_split);
+ ret = register_trace_block_split(blk_add_trace_split, NULL);
WARN_ON(ret);
- ret = register_trace_block_remap(blk_add_trace_remap);
+ ret = register_trace_block_remap(blk_add_trace_remap, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
+ ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
WARN_ON(ret);
}
static void blk_unregister_tracepoints(void)
{
- unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
- unregister_trace_block_remap(blk_add_trace_remap);
- unregister_trace_block_split(blk_add_trace_split);
- unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
- unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
- unregister_trace_block_plug(blk_add_trace_plug);
- unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
- unregister_trace_block_getrq(blk_add_trace_getrq);
- unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
- unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
- unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
- unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
- unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
- unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
- unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
- unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
- unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+ unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
+ unregister_trace_block_remap(blk_add_trace_remap, NULL);
+ unregister_trace_block_split(blk_add_trace_split, NULL);
+ unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
+ unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
+ unregister_trace_block_plug(blk_add_trace_plug, NULL);
+ unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
+ unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
+ unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
+ unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
+ unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
+ unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
+ unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
+ unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
+ unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+ unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
+ unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
+ unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
tracepoint_synchronize_unregister();
}
@@ -1321,7 +1338,7 @@ out:
}
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
return print_one_line(iter, false);
}
@@ -1343,7 +1360,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
}
static enum print_line_t
-blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return blk_trace_synthesize_old_trace(iter) ?
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1381,12 +1399,16 @@ static struct tracer blk_tracer __read_mostly = {
.set_flag = blk_tracer_set_flag,
};
-static struct trace_event trace_blk_event = {
- .type = TRACE_BLK,
+static struct trace_event_functions trace_blk_event_funcs = {
.trace = blk_trace_event_print,
.binary = blk_trace_event_print_binary,
};
+static struct trace_event trace_blk_event = {
+ .type = TRACE_BLK,
+ .funcs = &trace_blk_event_funcs,
+};
+
static int __init init_blk_tracer(void)
{
if (!register_ftrace_event(&trace_blk_event)) {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 32837e19e3bd..6d2cb14f9449 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3234,7 +3234,8 @@ free:
}
static void
-ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next)
+ftrace_graph_probe_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
{
unsigned long long timestamp;
int index;
@@ -3288,7 +3289,7 @@ static int start_graph_tracing(void)
} while (ret == -EAGAIN);
if (!ret) {
- ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
+ ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
if (ret)
pr_info("ftrace_graph: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
@@ -3364,7 +3365,7 @@ void unregister_ftrace_graph(void)
ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
- unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
out:
mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index a91da69f153a..bbfc1bb1660b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
trace_wake_up();
}
-static void kmemtrace_kmalloc(unsigned long call_site,
+static void kmemtrace_kmalloc(void *ignore,
+ unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
bytes_req, bytes_alloc, gfp_flags, -1);
}
-static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
+static void kmemtrace_kmem_cache_alloc(void *ignore,
+ unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
bytes_req, bytes_alloc, gfp_flags, -1);
}
-static void kmemtrace_kmalloc_node(unsigned long call_site,
+static void kmemtrace_kmalloc_node(void *ignore,
+ unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
bytes_req, bytes_alloc, gfp_flags, node);
}
-static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
+static void kmemtrace_kmem_cache_alloc_node(void *ignore,
+ unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
bytes_req, bytes_alloc, gfp_flags, node);
}
-static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
+static void
+kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
{
kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
}
-static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
+static void kmemtrace_kmem_cache_free(void *ignore,
+ unsigned long call_site, const void *ptr)
{
kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
}
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
{
int err;
- err = register_trace_kmalloc(kmemtrace_kmalloc);
+ err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
if (err)
return err;
- err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+ err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
if (err)
return err;
- err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
+ err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
if (err)
return err;
- err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+ err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
if (err)
return err;
- err = register_trace_kfree(kmemtrace_kfree);
+ err = register_trace_kfree(kmemtrace_kfree, NULL);
if (err)
return err;
- err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+ err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
return err;
}
static void kmemtrace_stop_probes(void)
{
- unregister_trace_kmalloc(kmemtrace_kmalloc);
- unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
- unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
- unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
- unregister_trace_kfree(kmemtrace_kfree);
- unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+ unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
+ unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
+ unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
+ unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
+ unregister_trace_kfree(kmemtrace_kfree, NULL);
+ unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
}
static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
};
static enum print_line_t
-kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
+kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
}
static enum print_line_t
-kmemtrace_print_free(struct trace_iterator *iter, int flags)
+kmemtrace_print_free(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
}
static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
+kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
}
static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
+kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
}
}
-static struct trace_event kmem_trace_alloc = {
- .type = TRACE_KMEM_ALLOC,
+static struct trace_event_functions kmem_trace_alloc_funcs = {
.trace = kmemtrace_print_alloc,
.binary = kmemtrace_print_alloc_user,
};
-static struct trace_event kmem_trace_free = {
- .type = TRACE_KMEM_FREE,
+static struct trace_event kmem_trace_alloc = {
+ .type = TRACE_KMEM_ALLOC,
+ .funcs = &kmem_trace_alloc_funcs,
+};
+
+static struct trace_event_functions kmem_trace_free_funcs = {
.trace = kmemtrace_print_free,
.binary = kmemtrace_print_free_user,
};
+static struct trace_event kmem_trace_free = {
+ .type = TRACE_KMEM_FREE,
+ .funcs = &kmem_trace_free_funcs,
+};
+
static struct tracer kmem_tracer __read_mostly = {
.name = "kmemtrace",
.init = kmem_trace_init,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f6059c5aa94..1da7b6ea8b85 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1768,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* must fill the old tail_page with padding.
*/
if (tail >= BUF_PAGE_SIZE) {
+ /*
+ * If the page was filled, then we still need
+ * to update the real_end. Reset it to zero
+ * and the reader will ignore it.
+ */
+ if (tail == BUF_PAGE_SIZE)
+ tail_page->real_end = 0;
+
local_sub(length, &tail_page->write);
return;
}
@@ -3894,12 +3902,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
ret = read;
cpu_buffer->lost_events = 0;
+
+ commit = local_read(&bpage->commit);
/*
* Set a flag in the commit field if we lost events
*/
if (missed_events) {
- commit = local_read(&bpage->commit);
-
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
@@ -3907,10 +3915,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
+ commit += sizeof(missed_events);
}
local_add(RB_MISSED_EVENTS, &bpage->commit);
}
+ /*
+ * This page may be off to user land. Zero it out here.
+ */
+ if (commit < BUF_PAGE_SIZE)
+ memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+
out_unlock:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a76339a9e65..086d36316805 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1936,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
}
if (event)
- return event->trace(iter, sym_flags);
+ return event->funcs->trace(iter, sym_flags, event);
if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
goto partial;
@@ -1962,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (event)
- return event->raw(iter, 0);
+ return event->funcs->raw(iter, 0, event);
if (!trace_seq_printf(s, "%d ?\n", entry->type))
goto partial;
@@ -1989,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (event) {
- enum print_line_t ret = event->hex(iter, 0);
+ enum print_line_t ret = event->funcs->hex(iter, 0, event);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
@@ -2014,7 +2014,8 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
}
event = ftrace_find_event(entry->type);
- return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
+ return event ? event->funcs->binary(iter, 0, event) :
+ TRACE_TYPE_HANDLED;
}
int trace_empty(struct trace_iterator *iter)
@@ -3665,7 +3666,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct ftrace_buffer_info *info = filp->private_data;
- unsigned int pos;
ssize_t ret;
size_t size;
@@ -3692,11 +3692,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (ret < 0)
return 0;
- pos = ring_buffer_page_len(info->spare);
-
- if (pos < PAGE_SIZE)
- memset(info->spare + pos, 0, PAGE_SIZE - pos);
-
read:
size = PAGE_SIZE - info->read;
if (size > count)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d1ce0bec1b3f..2cd96399463f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -405,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
#else
-static inline void ftrace_trace_stack(struct trace_array *tr,
+static inline void ftrace_trace_stack(struct ring_buffer *buffer,
unsigned long flags, int skip, int pc)
{
}
-static inline void ftrace_trace_userstack(struct trace_array *tr,
+static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
unsigned long flags, int pc)
{
}
@@ -778,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
extern int filter_assign_type(const char *type);
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call);
+
static inline int
filter_check_discard(struct ftrace_event_call *call, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
- if (unlikely(call->filter_active) &&
+ if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
!filter_match_preds(call->filter, rec)) {
ring_buffer_discard_commit(buffer, event);
return 1;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index b9bc4d470177..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
}
static enum print_line_t trace_branch_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct trace_branch *field;
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
" |\n");
}
+static struct trace_event_functions trace_branch_funcs = {
+ .trace = trace_branch_print,
+};
+
static struct trace_event trace_branch_event = {
.type = TRACE_BRANCH,
- .trace = trace_branch_print,
+ .funcs = &trace_branch_funcs,
};
static struct tracer branch_trace __read_mostly =
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0565bb42566f..8a2b73f7c068 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,13 +9,9 @@
#include <linux/kprobes.h>
#include "trace.h"
-DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
-EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
-
EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
-static char *perf_trace_buf;
-static char *perf_trace_buf_nmi;
+static char *perf_trace_buf[4];
/*
* Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -27,57 +23,84 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
/* Count the events in use (per event id, not per instance) */
static int total_ref_count;
-static int perf_trace_event_enable(struct ftrace_event_call *event)
+static int perf_trace_event_init(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
{
- char *buf;
+ struct hlist_head *list;
int ret = -ENOMEM;
+ int cpu;
- if (event->perf_refcount++ > 0)
+ p_event->tp_event = tp_event;
+ if (tp_event->perf_refcount++ > 0)
return 0;
- if (!total_ref_count) {
- buf = (char *)alloc_percpu(perf_trace_t);
- if (!buf)
- goto fail_buf;
+ list = alloc_percpu(struct hlist_head);
+ if (!list)
+ goto fail;
- rcu_assign_pointer(perf_trace_buf, buf);
+ for_each_possible_cpu(cpu)
+ INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
- buf = (char *)alloc_percpu(perf_trace_t);
- if (!buf)
- goto fail_buf_nmi;
+ tp_event->perf_events = list;
- rcu_assign_pointer(perf_trace_buf_nmi, buf);
- }
+ if (!total_ref_count) {
+ char *buf;
+ int i;
- ret = event->perf_event_enable(event);
- if (!ret) {
- total_ref_count++;
- return 0;
+ for (i = 0; i < 4; i++) {
+ buf = (char *)alloc_percpu(perf_trace_t);
+ if (!buf)
+ goto fail;
+
+ perf_trace_buf[i] = buf;
+ }
}
-fail_buf_nmi:
+ if (tp_event->class->reg)
+ ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
+ else
+ ret = tracepoint_probe_register(tp_event->name,
+ tp_event->class->perf_probe,
+ tp_event);
+
+ if (ret)
+ goto fail;
+
+ total_ref_count++;
+ return 0;
+
+fail:
if (!total_ref_count) {
- free_percpu(perf_trace_buf_nmi);
- free_percpu(perf_trace_buf);
- perf_trace_buf_nmi = NULL;
- perf_trace_buf = NULL;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ free_percpu(perf_trace_buf[i]);
+ perf_trace_buf[i] = NULL;
+ }
+ }
+
+ if (!--tp_event->perf_refcount) {
+ free_percpu(tp_event->perf_events);
+ tp_event->perf_events = NULL;
}
-fail_buf:
- event->perf_refcount--;
return ret;
}
-int perf_trace_enable(int event_id)
+int perf_trace_init(struct perf_event *p_event)
{
- struct ftrace_event_call *event;
+ struct ftrace_event_call *tp_event;
+ int event_id = p_event->attr.config;
int ret = -EINVAL;
mutex_lock(&event_mutex);
- list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id && event->perf_event_enable &&
- try_module_get(event->mod)) {
- ret = perf_trace_event_enable(event);
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ if (tp_event->event.type == event_id &&
+ tp_event->class &&
+ (tp_event->class->perf_probe ||
+ tp_event->class->reg) &&
+ try_module_get(tp_event->mod)) {
+ ret = perf_trace_event_init(tp_event, p_event);
break;
}
}
@@ -86,90 +109,87 @@ int perf_trace_enable(int event_id)
return ret;
}
-static void perf_trace_event_disable(struct ftrace_event_call *event)
+int perf_trace_enable(struct perf_event *p_event)
{
- char *buf, *nmi_buf;
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct hlist_head *list;
- if (--event->perf_refcount > 0)
- return;
+ list = tp_event->perf_events;
+ if (WARN_ON_ONCE(!list))
+ return -EINVAL;
- event->perf_event_disable(event);
-
- if (!--total_ref_count) {
- buf = perf_trace_buf;
- rcu_assign_pointer(perf_trace_buf, NULL);
+ list = this_cpu_ptr(list);
+ hlist_add_head_rcu(&p_event->hlist_entry, list);
- nmi_buf = perf_trace_buf_nmi;
- rcu_assign_pointer(perf_trace_buf_nmi, NULL);
-
- /*
- * Ensure every events in profiling have finished before
- * releasing the buffers
- */
- synchronize_sched();
+ return 0;
+}
- free_percpu(buf);
- free_percpu(nmi_buf);
- }
+void perf_trace_disable(struct perf_event *p_event)
+{
+ hlist_del_rcu(&p_event->hlist_entry);
}
-void perf_trace_disable(int event_id)
+void perf_trace_destroy(struct perf_event *p_event)
{
- struct ftrace_event_call *event;
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ int i;
mutex_lock(&event_mutex);
- list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id) {
- perf_trace_event_disable(event);
- module_put(event->mod);
- break;
+ if (--tp_event->perf_refcount > 0)
+ goto out;
+
+ if (tp_event->class->reg)
+ tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
+ else
+ tracepoint_probe_unregister(tp_event->name,
+ tp_event->class->perf_probe,
+ tp_event);
+
+ /*
+ * Ensure our callback won't be called anymore. See
+ * tracepoint_probe_unregister() and __DO_TRACE().
+ */
+ synchronize_sched();
+
+ free_percpu(tp_event->perf_events);
+ tp_event->perf_events = NULL;
+
+ if (!--total_ref_count) {
+ for (i = 0; i < 4; i++) {
+ free_percpu(perf_trace_buf[i]);
+ perf_trace_buf[i] = NULL;
}
}
+out:
mutex_unlock(&event_mutex);
}
__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
- int *rctxp, unsigned long *irq_flags)
+ struct pt_regs *regs, int *rctxp)
{
struct trace_entry *entry;
- char *trace_buf, *raw_data;
- int pc, cpu;
+ unsigned long flags;
+ char *raw_data;
+ int pc;
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
pc = preempt_count();
- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(*irq_flags);
-
*rctxp = perf_swevent_get_recursion_context();
if (*rctxp < 0)
- goto err_recursion;
-
- cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference_sched(perf_trace_buf);
-
- if (!trace_buf)
- goto err;
+ return NULL;
- raw_data = per_cpu_ptr(trace_buf, cpu);
+ raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
/* zero the dead bytes from align to not leak stack to user */
memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
entry = (struct trace_entry *)raw_data;
- tracing_generic_entry_update(entry, *irq_flags, pc);
+ local_save_flags(flags);
+ tracing_generic_entry_update(entry, flags, pc);
entry->type = type;
return raw_data;
-err:
- perf_swevent_put_recursion_context(*rctxp);
-err_recursion:
- local_irq_restore(*irq_flags);
- return NULL;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c697c7043349..53cffc0b0801 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -29,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
LIST_HEAD(ftrace_events);
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call)
+{
+ if (!event_call->class->get_fields)
+ return &event_call->class->fields;
+ return event_call->class->get_fields(event_call);
+}
+
int trace_define_field(struct ftrace_event_call *call, const char *type,
const char *name, int offset, int size, int is_signed,
int filter_type)
{
struct ftrace_event_field *field;
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
field = kzalloc(sizeof(*field), GFP_KERNEL);
if (!field)
@@ -56,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
field->size = size;
field->is_signed = is_signed;
- list_add(&field->link, &call->fields);
+ head = trace_get_fields(call);
+ list_add(&field->link, head);
return 0;
@@ -94,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
void trace_destroy_fields(struct ftrace_event_call *call)
{
struct ftrace_event_field *field, *next;
+ struct list_head *head;
- list_for_each_entry_safe(field, next, &call->fields, link) {
+ head = trace_get_fields(call);
+ list_for_each_entry_safe(field, next, head, link) {
list_del(&field->link);
kfree(field->type);
kfree(field->name);
@@ -107,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
{
int id;
- id = register_ftrace_event(call->event);
+ id = register_ftrace_event(&call->event);
if (!id)
return -ENODEV;
- call->id = id;
- INIT_LIST_HEAD(&call->fields);
return 0;
}
@@ -124,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
switch (enable) {
case 0:
- if (call->enabled) {
- call->enabled = 0;
+ if (call->flags & TRACE_EVENT_FL_ENABLED) {
+ call->flags &= ~TRACE_EVENT_FL_ENABLED;
tracing_stop_cmdline_record();
- call->unregfunc(call);
+ if (call->class->reg)
+ call->class->reg(call, TRACE_REG_UNREGISTER);
+ else
+ tracepoint_probe_unregister(call->name,
+ call->class->probe,
+ call);
}
break;
case 1:
- if (!call->enabled) {
+ if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
tracing_start_cmdline_record();
- ret = call->regfunc(call);
+ if (call->class->reg)
+ ret = call->class->reg(call, TRACE_REG_REGISTER);
+ else
+ ret = tracepoint_probe_register(call->name,
+ call->class->probe,
+ call);
if (ret) {
tracing_stop_cmdline_record();
pr_info("event trace: Could not enable event "
"%s\n", call->name);
break;
}
- call->enabled = 1;
+ call->flags |= TRACE_EVENT_FL_ENABLED;
}
break;
}
@@ -171,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->name || !call->regfunc)
+ if (!call->name || !call->class ||
+ (!call->class->probe && !call->class->reg))
continue;
if (match &&
strcmp(match, call->name) != 0 &&
- strcmp(match, call->system) != 0)
+ strcmp(match, call->class->system) != 0)
continue;
- if (sub && strcmp(sub, call->system) != 0)
+ if (sub && strcmp(sub, call->class->system) != 0)
continue;
if (event && strcmp(event, call->name) != 0)
@@ -297,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
- if (call->regfunc)
+ if (call->class && (call->class->probe || call->class->reg))
return call;
}
@@ -328,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
(*pos)++;
list_for_each_entry_continue(call, &ftrace_events, list) {
- if (call->enabled)
+ if (call->flags & TRACE_EVENT_FL_ENABLED)
return call;
}
@@ -355,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
{
struct ftrace_event_call *call = v;
- if (strcmp(call->system, TRACE_SYSTEM) != 0)
- seq_printf(m, "%s:", call->system);
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+ seq_printf(m, "%s:", call->class->system);
seq_printf(m, "%s\n", call->name);
return 0;
@@ -387,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
struct ftrace_event_call *call = filp->private_data;
char *buf;
- if (call->enabled)
+ if (call->flags & TRACE_EVENT_FL_ENABLED)
buf = "1\n";
else
buf = "0\n";
@@ -450,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->name || !call->regfunc)
+ if (!call->name || !call->class ||
+ (!call->class->probe && !call->class->reg))
continue;
- if (system && strcmp(call->system, system) != 0)
+ if (system && strcmp(call->class->system, system) != 0)
continue;
/*
@@ -461,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
* or if all events or cleared, or if we have
* a mixture.
*/
- set |= (1 << !!call->enabled);
+ set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
/*
* If we have a mixture, no need to look further.
@@ -525,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
{
struct ftrace_event_call *call = filp->private_data;
struct ftrace_event_field *field;
+ struct list_head *head;
struct trace_seq *s;
int common_field_count = 5;
char *buf;
@@ -540,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
trace_seq_printf(s, "name: %s\n", call->name);
- trace_seq_printf(s, "ID: %d\n", call->id);
+ trace_seq_printf(s, "ID: %d\n", call->event.type);
trace_seq_printf(s, "format:\n");
- list_for_each_entry_reverse(field, &call->fields, link) {
+ head = trace_get_fields(call);
+ list_for_each_entry_reverse(field, head, link) {
/*
* Smartly shows the array type(except dynamic array).
* Normal:
@@ -613,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return -ENOMEM;
trace_seq_init(s);
- trace_seq_printf(s, "%d\n", call->id);
+ trace_seq_printf(s, "%d\n", call->event.type);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, s->len);
@@ -919,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
const struct file_operations *filter,
const struct file_operations *format)
{
+ struct list_head *head;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
* then the system would be called "TRACE_SYSTEM".
*/
- if (strcmp(call->system, TRACE_SYSTEM) != 0)
- d_events = event_subsystem_dir(call->system, d_events);
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+ d_events = event_subsystem_dir(call->class->system, d_events);
call->dir = debugfs_create_dir(call->name, d_events);
if (!call->dir) {
@@ -935,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return -1;
}
- if (call->regfunc)
+ if (call->class->probe || call->class->reg)
trace_create_file("enable", 0644, call->dir, call,
enable);
- if (call->id && call->perf_event_enable)
+#ifdef CONFIG_PERF_EVENTS
+ if (call->event.type && (call->class->perf_probe || call->class->reg))
trace_create_file("id", 0444, call->dir, call,
id);
+#endif
- if (call->define_fields) {
- ret = trace_define_common_fields(call);
- if (!ret)
- ret = call->define_fields(call);
- if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", call->name);
- return ret;
+ if (call->class->define_fields) {
+ /*
+ * Other events may have the same class. Only update
+ * the fields if they are not already defined.
+ */
+ head = trace_get_fields(call);
+ if (list_empty(head)) {
+ ret = trace_define_common_fields(call);
+ if (!ret)
+ ret = call->class->define_fields(call);
+ if (ret < 0) {
+ pr_warning("Could not initialize trace point"
+ " events/%s\n", call->name);
+ return ret;
+ }
}
trace_create_file("filter", 0644, call->dir, call,
filter);
@@ -970,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
if (!call->name)
return -EINVAL;
- if (call->raw_init) {
- ret = call->raw_init(call);
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
if (ret < 0) {
if (ret != -ENOSYS)
pr_warning("Could not initialize trace "
@@ -1035,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
static void __trace_remove_event_call(struct ftrace_event_call *call)
{
ftrace_event_enable_disable(call, 0);
- if (call->event)
- __unregister_ftrace_event(call->event);
+ if (call->event.funcs)
+ __unregister_ftrace_event(&call->event);
debugfs_remove_recursive(call->dir);
list_del(&call->list);
trace_destroy_fields(call);
destroy_preds(call);
- remove_subsystem_dir(call->system);
+ remove_subsystem_dir(call->class->system);
}
/* Remove an event_call */
@@ -1132,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
/* The linker may leave blanks */
if (!call->name)
continue;
- if (call->raw_init) {
- ret = call->raw_init(call);
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
if (ret < 0) {
if (ret != -ENOSYS)
pr_warning("Could not initialize trace "
@@ -1286,8 +1323,8 @@ static __init int event_trace_init(void)
/* The linker may leave blanks */
if (!call->name)
continue;
- if (call->raw_init) {
- ret = call->raw_init(call);
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
if (ret < 0) {
if (ret != -ENOSYS)
pr_warning("Could not initialize trace "
@@ -1388,8 +1425,8 @@ static __init void event_trace_self_tests(void)
list_for_each_entry(call, &ftrace_events, list) {
- /* Only test those that have a regfunc */
- if (!call->regfunc)
+ /* Only test those that have a probe */
+ if (!call->class || !call->class->probe)
continue;
/*
@@ -1399,8 +1436,8 @@ static __init void event_trace_self_tests(void)
* syscalls as we test.
*/
#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
- if (call->system &&
- strcmp(call->system, "syscalls") == 0)
+ if (call->class->system &&
+ strcmp(call->class->system, "syscalls") == 0)
continue;
#endif
@@ -1410,7 +1447,7 @@ static __init void event_trace_self_tests(void)
* If an event is already enabled, someone is using
* it and the self test should not be on.
*/
- if (call->enabled) {
+ if (call->flags & TRACE_EVENT_FL_ENABLED) {
pr_warning("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 58092d844a1f..57bb1bb32999 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -500,8 +500,10 @@ static struct ftrace_event_field *
find_event_field(struct ftrace_event_call *call, char *name)
{
struct ftrace_event_field *field;
+ struct list_head *head;
- list_for_each_entry(field, &call->fields, link) {
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
if (!strcmp(field->name, name))
return field;
}
@@ -545,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
struct event_filter *filter = call->filter;
int i;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
filter->n_preds = 0;
for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -572,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
{
__free_preds(call->filter);
call->filter = NULL;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
}
static struct event_filter *__alloc_preds(void)
@@ -611,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
if (call->filter)
return 0;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
call->filter = __alloc_preds();
if (IS_ERR(call->filter))
return PTR_ERR(call->filter);
@@ -625,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
int err;
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->define_fields)
+ if (!call->class || !call->class->define_fields)
continue;
- if (strcmp(call->system, system->name) != 0)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
err = init_preds(call);
@@ -644,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
struct ftrace_event_call *call;
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->define_fields)
+ if (!call->class || !call->class->define_fields)
continue;
- if (strcmp(call->system, system->name) != 0)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
filter_disable_preds(call);
@@ -1249,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
list_for_each_entry(call, &ftrace_events, list) {
struct event_filter *filter = call->filter;
- if (!call->define_fields)
+ if (!call->class || !call->class->define_fields)
continue;
- if (strcmp(call->system, system->name) != 0)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
/* try to see if the filter can be applied */
@@ -1266,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
if (err)
filter_disable_preds(call);
else {
- call->filter_active = 1;
+ call->flags |= TRACE_EVENT_FL_FILTERED;
replace_filter_string(filter, filter_string);
}
fail = false;
@@ -1315,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
if (err)
append_filter_err(ps, call->filter);
else
- call->filter_active = 1;
+ call->flags |= TRACE_EVENT_FL_FILTERED;
out:
filter_opstack_clear(ps);
postfix_clear(ps);
@@ -1393,7 +1395,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (call->id == event_id)
+ if (call->event.type == event_id)
break;
}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e091f64ba6ce..8536e2a65969 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
static int ftrace_raw_init_event(struct ftrace_event_call *call)
{
- INIT_LIST_HEAD(&call->fields);
+ INIT_LIST_HEAD(&call->class->fields);
return 0;
}
@@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
#define F_printk(fmt, args...) #fmt ", " __stringify(args)
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
+#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
+ \
+struct ftrace_event_class event_class_ftrace_##call = { \
+ .system = __stringify(TRACE_SYSTEM), \
+ .define_fields = ftrace_define_fields_##call, \
+ .raw_init = ftrace_raw_init_event, \
+}; \
\
struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
- .id = type, \
- .system = __stringify(TRACE_SYSTEM), \
- .raw_init = ftrace_raw_init_event, \
+ .event.type = etype, \
+ .class = &event_class_ftrace_##call, \
.print_fmt = print, \
- .define_fields = ftrace_define_fields_##call, \
}; \
#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index dd11c830eb84..79f4bac99a94 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1025,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (!event)
return TRACE_TYPE_UNHANDLED;
- ret = event->trace(iter, sym_flags);
+ ret = event->funcs->trace(iter, sym_flags, event);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
@@ -1112,7 +1112,8 @@ print_graph_function(struct trace_iterator *iter)
}
static enum print_line_t
-print_graph_function_event(struct trace_iterator *iter, int flags)
+print_graph_function_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return print_graph_function(iter);
}
@@ -1225,14 +1226,18 @@ void graph_trace_close(struct trace_iterator *iter)
}
}
+static struct trace_event_functions graph_functions = {
+ .trace = print_graph_function_event,
+};
+
static struct trace_event graph_trace_entry_event = {
.type = TRACE_GRAPH_ENT,
- .trace = print_graph_function_event,
+ .funcs = &graph_functions,
};
static struct trace_event graph_trace_ret_event = {
.type = TRACE_GRAPH_RET,
- .trace = print_graph_function_event,
+ .funcs = &graph_functions
};
static struct tracer graph_trace __read_mostly = {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a7514326052b..f52b5f50299d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -324,8 +324,8 @@ struct trace_probe {
unsigned long nhit;
unsigned int flags; /* For TP_FLAG_* */
const char *symbol; /* symbol name */
+ struct ftrace_event_class class;
struct ftrace_event_call call;
- struct trace_event event;
ssize_t size; /* trace entry size */
unsigned int nr_args;
struct probe_arg args[];
@@ -404,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
goto error;
}
+ tp->call.class = &tp->class;
tp->call.name = kstrdup(event, GFP_KERNEL);
if (!tp->call.name)
goto error;
@@ -413,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
goto error;
}
- tp->call.system = kstrdup(group, GFP_KERNEL);
- if (!tp->call.system)
+ tp->class.system = kstrdup(group, GFP_KERNEL);
+ if (!tp->class.system)
goto error;
INIT_LIST_HEAD(&tp->list);
@@ -443,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
for (i = 0; i < tp->nr_args; i++)
free_probe_arg(&tp->args[i]);
- kfree(tp->call.system);
+ kfree(tp->call.class->system);
kfree(tp->call.name);
kfree(tp->symbol);
kfree(tp);
@@ -456,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
list_for_each_entry(tp, &probe_list, list)
if (strcmp(tp->call.name, event) == 0 &&
- strcmp(tp->call.system, group) == 0)
+ strcmp(tp->call.class->system, group) == 0)
return tp;
return NULL;
}
@@ -481,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
mutex_lock(&probe_lock);
/* register as an event */
- old_tp = find_probe_event(tp->call.name, tp->call.system);
+ old_tp = find_probe_event(tp->call.name, tp->call.class->system);
if (old_tp) {
/* delete old event */
unregister_trace_probe(old_tp);
@@ -904,7 +905,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
int i;
seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
- seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
+ seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
if (!tp->symbol)
seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -1061,8 +1062,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
size = sizeof(*entry) + tp->size;
- event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
- irq_flags, pc);
+ event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+ size, irq_flags, pc);
if (!event)
return;
@@ -1094,8 +1095,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
size = sizeof(*entry) + tp->size;
- event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
- irq_flags, pc);
+ event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+ size, irq_flags, pc);
if (!event)
return;
@@ -1112,18 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
/* Event entry printers */
enum print_line_t
-print_kprobe_event(struct trace_iterator *iter, int flags)
+print_kprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct kprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
- struct trace_event *event;
struct trace_probe *tp;
u8 *data;
int i;
field = (struct kprobe_trace_entry_head *)iter->ent;
- event = ftrace_find_event(field->ent.type);
- tp = container_of(event, struct trace_probe, event);
+ tp = container_of(event, struct trace_probe, call.event);
if (!trace_seq_printf(s, "%s: (", tp->call.name))
goto partial;
@@ -1149,18 +1149,17 @@ partial:
}
enum print_line_t
-print_kretprobe_event(struct trace_iterator *iter, int flags)
+print_kretprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct kretprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
- struct trace_event *event;
struct trace_probe *tp;
u8 *data;
int i;
field = (struct kretprobe_trace_entry_head *)iter->ent;
- event = ftrace_find_event(field->ent.type);
- tp = container_of(event, struct trace_probe, event);
+ tp = container_of(event, struct trace_probe, call.event);
if (!trace_seq_printf(s, "%s: (", tp->call.name))
goto partial;
@@ -1217,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
static int probe_event_raw_init(struct ftrace_event_call *event_call)
{
- INIT_LIST_HEAD(&event_call->fields);
-
return 0;
}
@@ -1341,9 +1338,9 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct ftrace_event_call *call = &tp->call;
struct kprobe_trace_entry_head *entry;
+ struct hlist_head *head;
u8 *data;
int size, __size, i;
- unsigned long irq_flags;
int rctx;
__size = sizeof(*entry) + tp->size;
@@ -1353,7 +1350,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
"profile buffer not large enough"))
return;
- entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+ entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
if (!entry)
return;
@@ -1362,7 +1359,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
for (i = 0; i < tp->nr_args; i++)
call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
- perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
+ head = this_cpu_ptr(call->perf_events);
+ perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
}
/* Kretprobe profile handler */
@@ -1372,9 +1370,9 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct ftrace_event_call *call = &tp->call;
struct kretprobe_trace_entry_head *entry;
+ struct hlist_head *head;
u8 *data;
int size, __size, i;
- unsigned long irq_flags;
int rctx;
__size = sizeof(*entry) + tp->size;
@@ -1384,7 +1382,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
"profile buffer not large enough"))
return;
- entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+ entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
if (!entry)
return;
@@ -1394,8 +1392,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
for (i = 0; i < tp->nr_args; i++)
call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
- perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
- irq_flags, regs);
+ head = this_cpu_ptr(call->perf_events);
+ perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
}
static int probe_perf_enable(struct ftrace_event_call *call)
@@ -1425,6 +1423,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
}
#endif /* CONFIG_PERF_EVENTS */
+static __kprobes
+int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return probe_event_enable(event);
+ case TRACE_REG_UNREGISTER:
+ probe_event_disable(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return probe_perf_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ probe_perf_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
static __kprobes
int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1454,6 +1472,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
return 0; /* We don't tweek kernel, so just return 0 */
}
+static struct trace_event_functions kretprobe_funcs = {
+ .trace = print_kretprobe_event
+};
+
+static struct trace_event_functions kprobe_funcs = {
+ .trace = print_kprobe_event
+};
+
static int register_probe_event(struct trace_probe *tp)
{
struct ftrace_event_call *call = &tp->call;
@@ -1461,36 +1487,31 @@ static int register_probe_event(struct trace_probe *tp)
/* Initialize ftrace_event_call */
if (probe_is_return(tp)) {
- tp->event.trace = print_kretprobe_event;
- call->raw_init = probe_event_raw_init;
- call->define_fields = kretprobe_event_define_fields;
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &kretprobe_funcs;
+ call->class->raw_init = probe_event_raw_init;
+ call->class->define_fields = kretprobe_event_define_fields;
} else {
- tp->event.trace = print_kprobe_event;
- call->raw_init = probe_event_raw_init;
- call->define_fields = kprobe_event_define_fields;
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &kprobe_funcs;
+ call->class->raw_init = probe_event_raw_init;
+ call->class->define_fields = kprobe_event_define_fields;
}
if (set_print_fmt(tp) < 0)
return -ENOMEM;
- call->event = &tp->event;
- call->id = register_ftrace_event(&tp->event);
- if (!call->id) {
+ ret = register_ftrace_event(&call->event);
+ if (!ret) {
kfree(call->print_fmt);
return -ENODEV;
}
- call->enabled = 0;
- call->regfunc = probe_event_enable;
- call->unregfunc = probe_event_disable;
-
-#ifdef CONFIG_PERF_EVENTS
- call->perf_event_enable = probe_perf_enable;
- call->perf_event_disable = probe_perf_disable;
-#endif
+ call->flags = 0;
+ call->class->reg = kprobe_register;
call->data = tp;
ret = trace_add_event_call(call);
if (ret) {
pr_info("Failed to register kprobe event: %s\n", call->name);
kfree(call->print_fmt);
- unregister_ftrace_event(&tp->event);
+ unregister_ftrace_event(&call->event);
}
return ret;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ab13d7008061..57c1b4596470 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -742,6 +742,9 @@ int register_ftrace_event(struct trace_event *event)
if (WARN_ON(!event))
goto out;
+ if (WARN_ON(!event->funcs))
+ goto out;
+
INIT_LIST_HEAD(&event->list);
if (!event->type) {
@@ -774,14 +777,14 @@ int register_ftrace_event(struct trace_event *event)
goto out;
}
- if (event->trace == NULL)
- event->trace = trace_nop_print;
- if (event->raw == NULL)
- event->raw = trace_nop_print;
- if (event->hex == NULL)
- event->hex = trace_nop_print;
- if (event->binary == NULL)
- event->binary = trace_nop_print;
+ if (event->funcs->trace == NULL)
+ event->funcs->trace = trace_nop_print;
+ if (event->funcs->raw == NULL)
+ event->funcs->raw = trace_nop_print;
+ if (event->funcs->hex == NULL)
+ event->funcs->hex = trace_nop_print;
+ if (event->funcs->binary == NULL)
+ event->funcs->binary = trace_nop_print;
key = event->type & (EVENT_HASHSIZE - 1);
@@ -823,13 +826,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
* Standard events
*/
-enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return TRACE_TYPE_HANDLED;
}
/* TRACE_FN */
-static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -856,7 +861,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
-static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
@@ -870,7 +876,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -883,7 +890,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -896,14 +904,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_fn_event = {
- .type = TRACE_FN,
+static struct trace_event_functions trace_fn_funcs = {
.trace = trace_fn_trace,
.raw = trace_fn_raw,
.hex = trace_fn_hex,
.binary = trace_fn_bin,
};
+static struct trace_event trace_fn_event = {
+ .type = TRACE_FN,
+ .funcs = &trace_fn_funcs,
+};
+
/* TRACE_CTX an TRACE_WAKE */
static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
char *delim)
@@ -932,13 +944,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_print(iter, "==>");
}
static enum print_line_t trace_wake_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
return trace_ctxwake_print(iter, " +");
}
@@ -966,12 +979,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_raw(iter, 0);
}
-static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_raw(iter, '+');
}
@@ -1000,18 +1015,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_hex(iter, 0);
}
-static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_hex(iter, '+');
}
static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct ctx_switch_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1028,25 +1045,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_ctx_event = {
- .type = TRACE_CTX,
+static struct trace_event_functions trace_ctx_funcs = {
.trace = trace_ctx_print,
.raw = trace_ctx_raw,
.hex = trace_ctx_hex,
.binary = trace_ctxwake_bin,
};
-static struct trace_event trace_wake_event = {
- .type = TRACE_WAKE,
+static struct trace_event trace_ctx_event = {
+ .type = TRACE_CTX,
+ .funcs = &trace_ctx_funcs,
+};
+
+static struct trace_event_functions trace_wake_funcs = {
.trace = trace_wake_print,
.raw = trace_wake_raw,
.hex = trace_wake_hex,
.binary = trace_ctxwake_bin,
};
+static struct trace_event trace_wake_event = {
+ .type = TRACE_WAKE,
+ .funcs = &trace_wake_funcs,
+};
+
/* TRACE_SPECIAL */
static enum print_line_t trace_special_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct special_entry *field;
@@ -1062,7 +1087,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
}
static enum print_line_t trace_special_hex(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct special_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1077,7 +1102,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
}
static enum print_line_t trace_special_bin(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct special_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1091,18 +1116,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_special_event = {
- .type = TRACE_SPECIAL,
+static struct trace_event_functions trace_special_funcs = {
.trace = trace_special_print,
.raw = trace_special_print,
.hex = trace_special_hex,
.binary = trace_special_bin,
};
+static struct trace_event trace_special_event = {
+ .type = TRACE_SPECIAL,
+ .funcs = &trace_special_funcs,
+};
+
/* TRACE_STACK */
static enum print_line_t trace_stack_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct stack_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1130,17 +1159,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
-static struct trace_event trace_stack_event = {
- .type = TRACE_STACK,
+static struct trace_event_functions trace_stack_funcs = {
.trace = trace_stack_print,
.raw = trace_special_print,
.hex = trace_special_hex,
.binary = trace_special_bin,
};
+static struct trace_event trace_stack_event = {
+ .type = TRACE_STACK,
+ .funcs = &trace_stack_funcs,
+};
+
/* TRACE_USER_STACK */
static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct userstack_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1159,17 +1192,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
-static struct trace_event trace_user_stack_event = {
- .type = TRACE_USER_STACK,
+static struct trace_event_functions trace_user_stack_funcs = {
.trace = trace_user_stack_print,
.raw = trace_special_print,
.hex = trace_special_hex,
.binary = trace_special_bin,
};
+static struct trace_event trace_user_stack_event = {
+ .type = TRACE_USER_STACK,
+ .funcs = &trace_user_stack_funcs,
+};
+
/* TRACE_BPRINT */
static enum print_line_t
-trace_bprint_print(struct trace_iterator *iter, int flags)
+trace_bprint_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
@@ -1194,7 +1232,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
static enum print_line_t
-trace_bprint_raw(struct trace_iterator *iter, int flags)
+trace_bprint_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct bprint_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1213,16 +1252,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
+static struct trace_event_functions trace_bprint_funcs = {
+ .trace = trace_bprint_print,
+ .raw = trace_bprint_raw,
+};
static struct trace_event trace_bprint_event = {
.type = TRACE_BPRINT,
- .trace = trace_bprint_print,
- .raw = trace_bprint_raw,
+ .funcs = &trace_bprint_funcs,
};
/* TRACE_PRINT */
static enum print_line_t trace_print_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1241,7 +1283,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
-static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct print_entry *field;
@@ -1256,12 +1299,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
-static struct trace_event trace_print_event = {
- .type = TRACE_PRINT,
+static struct trace_event_functions trace_print_funcs = {
.trace = trace_print_print,
.raw = trace_print_raw,
};
+static struct trace_event trace_print_event = {
+ .type = TRACE_PRINT,
+ .funcs = &trace_print_funcs,
+};
+
static struct trace_event *events[] __initdata = {
&trace_fn_event,
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
extern struct trace_event *ftrace_find_event(int type);
extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
- int flags);
+ int flags, struct trace_event *event);
extern int
trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a55fccfede5d..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,7 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
}
static void
-probe_sched_switch(struct task_struct *prev, struct task_struct *next)
+probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
unsigned long flags;
@@ -108,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
}
static void
-probe_sched_wakeup(struct task_struct *wakee, int success)
+probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
{
struct trace_array_cpu *data;
unsigned long flags;
@@ -138,21 +138,21 @@ static int tracing_sched_register(void)
{
int ret;
- ret = register_trace_sched_wakeup(probe_sched_wakeup);
+ ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return ret;
}
- ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
+ ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = register_trace_sched_switch(probe_sched_switch);
+ ret = register_trace_sched_switch(probe_sched_switch, NULL);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
@@ -161,17 +161,17 @@ static int tracing_sched_register(void)
return ret;
fail_deprobe_wake_new:
- unregister_trace_sched_wakeup_new(probe_sched_wakeup);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
fail_deprobe:
- unregister_trace_sched_wakeup(probe_sched_wakeup);
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
return ret;
}
static void tracing_sched_unregister(void)
{
- unregister_trace_sched_switch(probe_sched_switch);
- unregister_trace_sched_wakeup_new(probe_sched_wakeup);
- unregister_trace_sched_wakeup(probe_sched_wakeup);
+ unregister_trace_sched_switch(probe_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}
static void tracing_start_sched_switch(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 8052446ceeaa..0e73bc2ef8c5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
return 1;
}
-static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
+static void
+probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
{
if (task != wakeup_task)
return;
@@ -107,7 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
}
static void notrace
-probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
+probe_wakeup_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
cycle_t T0, T1, delta;
@@ -199,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
}
static void
-probe_wakeup(struct task_struct *p, int success)
+probe_wakeup(void *ignore, struct task_struct *p, int success)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
@@ -263,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
{
int ret;
- ret = register_trace_sched_wakeup(probe_wakeup);
+ ret = register_trace_sched_wakeup(probe_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return;
}
- ret = register_trace_sched_wakeup_new(probe_wakeup);
+ ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = register_trace_sched_switch(probe_wakeup_sched_switch);
+ ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
goto fail_deprobe_wake_new;
}
- ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
+ ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_migrate_task\n");
@@ -311,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
return;
fail_deprobe_wake_new:
- unregister_trace_sched_wakeup_new(probe_wakeup);
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
- unregister_trace_sched_wakeup(probe_wakeup);
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
}
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
- unregister_trace_sched_switch(probe_wakeup_sched_switch);
- unregister_trace_sched_wakeup_new(probe_wakeup);
- unregister_trace_sched_wakeup(probe_wakeup);
- unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
+ unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
}
static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4d6d711717f2..34e35804304b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -15,6 +15,54 @@ static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type);
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type);
+
+static int syscall_enter_define_fields(struct ftrace_event_call *call);
+static int syscall_exit_define_fields(struct ftrace_event_call *call);
+
+static struct list_head *
+syscall_get_enter_fields(struct ftrace_event_call *call)
+{
+ struct syscall_metadata *entry = call->data;
+
+ return &entry->enter_fields;
+}
+
+static struct list_head *
+syscall_get_exit_fields(struct ftrace_event_call *call)
+{
+ struct syscall_metadata *entry = call->data;
+
+ return &entry->exit_fields;
+}
+
+struct trace_event_functions enter_syscall_print_funcs = {
+ .trace = print_syscall_enter,
+};
+
+struct trace_event_functions exit_syscall_print_funcs = {
+ .trace = print_syscall_exit,
+};
+
+struct ftrace_event_class event_class_syscall_enter = {
+ .system = "syscalls",
+ .reg = syscall_enter_register,
+ .define_fields = syscall_enter_define_fields,
+ .get_fields = syscall_get_enter_fields,
+ .raw_init = init_syscall_trace,
+};
+
+struct ftrace_event_class event_class_syscall_exit = {
+ .system = "syscalls",
+ .reg = syscall_exit_register,
+ .define_fields = syscall_exit_define_fields,
+ .get_fields = syscall_get_exit_fields,
+ .raw_init = init_syscall_trace,
+};
+
extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];
@@ -53,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
}
enum print_line_t
-print_syscall_enter(struct trace_iterator *iter, int flags)
+print_syscall_enter(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -68,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
if (!entry)
goto end;
- if (entry->enter_event->id != ent->type) {
+ if (entry->enter_event->event.type != ent->type) {
WARN_ON_ONCE(1);
goto end;
}
@@ -105,7 +154,8 @@ end:
}
enum print_line_t
-print_syscall_exit(struct trace_iterator *iter, int flags)
+print_syscall_exit(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -123,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
- if (entry->exit_event->id != ent->type) {
+ if (entry->exit_event->event.type != ent->type) {
WARN_ON_ONCE(1);
return TRACE_TYPE_UNHANDLED;
}
@@ -205,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
kfree(call->print_fmt);
}
-int syscall_enter_define_fields(struct ftrace_event_call *call)
+static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
@@ -228,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
return ret;
}
-int syscall_exit_define_fields(struct ftrace_event_call *call)
+static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_exit trace;
int ret;
@@ -243,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
return ret;
}
-void ftrace_syscall_enter(struct pt_regs *regs, long id)
+void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
@@ -265,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->enter_event->id, size, 0, 0);
+ sys_data->enter_event->event.type, size, 0, 0);
if (!event)
return;
@@ -278,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-void ftrace_syscall_exit(struct pt_regs *regs, long ret)
+void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
@@ -297,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
return;
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->exit_event->id, sizeof(*entry), 0, 0);
+ sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
if (!event)
return;
@@ -320,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_enter)
- ret = register_trace_sys_enter(ftrace_syscall_enter);
+ ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
if (!ret) {
set_bit(num, enabled_enter_syscalls);
sys_refcount_enter++;
@@ -340,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
sys_refcount_enter--;
clear_bit(num, enabled_enter_syscalls);
if (!sys_refcount_enter)
- unregister_trace_sys_enter(ftrace_syscall_enter);
+ unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
mutex_unlock(&syscall_trace_lock);
}
@@ -354,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_exit)
- ret = register_trace_sys_exit(ftrace_syscall_exit);
+ ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
if (!ret) {
set_bit(num, enabled_exit_syscalls);
sys_refcount_exit++;
@@ -374,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
sys_refcount_exit--;
clear_bit(num, enabled_exit_syscalls);
if (!sys_refcount_exit)
- unregister_trace_sys_exit(ftrace_syscall_exit);
+ unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
mutex_unlock(&syscall_trace_lock);
}
@@ -434,11 +484,11 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
-static void perf_syscall_enter(struct pt_regs *regs, long id)
+static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
- unsigned long flags;
+ struct hlist_head *head;
int syscall_nr;
int rctx;
int size;
@@ -461,14 +511,16 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
return;
rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
- sys_data->enter_event->id, &rctx, &flags);
+ sys_data->enter_event->event.type, regs, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}
int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -480,7 +532,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
mutex_lock(&syscall_trace_lock);
if (!sys_perf_refcount_enter)
- ret = register_trace_sys_enter(perf_syscall_enter);
+ ret = register_trace_sys_enter(perf_syscall_enter, NULL);
if (ret) {
pr_info("event trace: Could not activate"
"syscall entry trace point");
@@ -502,15 +554,15 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
sys_perf_refcount_enter--;
clear_bit(num, enabled_perf_enter_syscalls);
if (!sys_perf_refcount_enter)
- unregister_trace_sys_enter(perf_syscall_enter);
+ unregister_trace_sys_enter(perf_syscall_enter, NULL);
mutex_unlock(&syscall_trace_lock);
}
-static void perf_syscall_exit(struct pt_regs *regs, long ret)
+static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
- unsigned long flags;
+ struct hlist_head *head;
int syscall_nr;
int rctx;
int size;
@@ -536,14 +588,15 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
return;
rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
- sys_data->exit_event->id, &rctx, &flags);
+ sys_data->exit_event->event.type, regs, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}
int perf_sysexit_enable(struct ftrace_event_call *call)
@@ -555,7 +608,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
mutex_lock(&syscall_trace_lock);
if (!sys_perf_refcount_exit)
- ret = register_trace_sys_exit(perf_syscall_exit);
+ ret = register_trace_sys_exit(perf_syscall_exit, NULL);
if (ret) {
pr_info("event trace: Could not activate"
"syscall exit trace point");
@@ -577,9 +630,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
sys_perf_refcount_exit--;
clear_bit(num, enabled_perf_exit_syscalls);
if (!sys_perf_refcount_exit)
- unregister_trace_sys_exit(perf_syscall_exit);
+ unregister_trace_sys_exit(perf_syscall_exit, NULL);
mutex_unlock(&syscall_trace_lock);
}
#endif /* CONFIG_PERF_EVENTS */
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_enter(event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_enter(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysenter_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysenter_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_exit(event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_exit(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysexit_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysexit_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cc2d2faa7d9e..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
/* Insertion of a work */
static void
-probe_workqueue_insertion(struct task_struct *wq_thread,
+probe_workqueue_insertion(void *ignore,
+ struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -70,7 +71,8 @@ found:
/* Execution of a work */
static void
-probe_workqueue_execution(struct task_struct *wq_thread,
+probe_workqueue_execution(void *ignore,
+ struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -90,7 +92,8 @@ found:
}
/* Creation of a cpu workqueue thread */
-static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
+static void probe_workqueue_creation(void *ignore,
+ struct task_struct *wq_thread, int cpu)
{
struct cpu_workqueue_stats *cws;
unsigned long flags;
@@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
}
/* Destruction of a cpu workqueue thread */
-static void probe_workqueue_destruction(struct task_struct *wq_thread)
+static void
+probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
{
/* Workqueue only execute on one cpu */
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void)
{
int ret, cpu;
- ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
+ ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
if (ret)
goto out;
- ret = register_trace_workqueue_execution(probe_workqueue_execution);
+ ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
if (ret)
goto no_insertion;
- ret = register_trace_workqueue_creation(probe_workqueue_creation);
+ ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
if (ret)
goto no_execution;
- ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
+ ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
if (ret)
goto no_creation;
@@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void)
return 0;
no_creation:
- unregister_trace_workqueue_creation(probe_workqueue_creation);
+ unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
no_execution:
- unregister_trace_workqueue_execution(probe_workqueue_execution);
+ unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
no_insertion:
- unregister_trace_workqueue_insertion(probe_workqueue_insertion);
+ unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
out:
pr_warning("trace_workqueue: unable to trace workqueues\n");
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
*/
struct tracepoint_entry {
struct hlist_node hlist;
- void **funcs;
+ struct tracepoint_func *funcs;
int refcount; /* Number of times armed. 0 if disarmed. */
char name[0];
};
@@ -64,12 +64,12 @@ struct tp_probes {
struct rcu_head rcu;
struct list_head list;
} u;
- void *probes[0];
+ struct tracepoint_func probes[0];
};
static inline void *allocate_probes(int count)
{
- struct tp_probes *p = kmalloc(count * sizeof(void *)
+ struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
+ sizeof(struct tp_probes), GFP_KERNEL);
return p == NULL ? NULL : p->probes;
}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
kfree(container_of(head, struct tp_probes, u.rcu));
}
-static inline void release_probes(void *old)
+static inline void release_probes(struct tracepoint_func *old)
{
if (old) {
struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
if (!tracepoint_debug || !entry->funcs)
return;
- for (i = 0; entry->funcs[i]; i++)
- printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
+ for (i = 0; entry->funcs[i].func; i++)
+ printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
}
-static void *
-tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
+static struct tracepoint_func *
+tracepoint_entry_add_probe(struct tracepoint_entry *entry,
+ void *probe, void *data)
{
int nr_probes = 0;
- void **old, **new;
+ struct tracepoint_func *old, *new;
WARN_ON(!probe);
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
old = entry->funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
- for (nr_probes = 0; old[nr_probes]; nr_probes++)
- if (old[nr_probes] == probe)
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ if (old[nr_probes].func == probe &&
+ old[nr_probes].data == data)
return ERR_PTR(-EEXIST);
}
/* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
if (new == NULL)
return ERR_PTR(-ENOMEM);
if (old)
- memcpy(new, old, nr_probes * sizeof(void *));
- new[nr_probes] = probe;
- new[nr_probes + 1] = NULL;
+ memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
+ new[nr_probes].func = probe;
+ new[nr_probes].data = data;
+ new[nr_probes + 1].func = NULL;
entry->refcount = nr_probes + 1;
entry->funcs = new;
debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
}
static void *
-tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
+tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
+ void *probe, void *data)
{
int nr_probes = 0, nr_del = 0, i;
- void **old, **new;
+ struct tracepoint_func *old, *new;
old = entry->funcs;
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
debug_print_probes(entry);
/* (N -> M), (N > 1, M >= 0) probes */
- for (nr_probes = 0; old[nr_probes]; nr_probes++) {
- if ((!probe || old[nr_probes] == probe))
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if (!probe ||
+ (old[nr_probes].func == probe &&
+ old[nr_probes].data == data))
nr_del++;
}
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
new = allocate_probes(nr_probes - nr_del + 1);
if (new == NULL)
return ERR_PTR(-ENOMEM);
- for (i = 0; old[i]; i++)
- if ((probe && old[i] != probe))
+ for (i = 0; old[i].func; i++)
+ if (probe &&
+ (old[i].func != probe || old[i].data != data))
new[j++] = old[i];
- new[nr_probes - nr_del] = NULL;
+ new[nr_probes - nr_del].func = NULL;
entry->refcount = nr_probes - nr_del;
entry->funcs = new;
}
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
module_update_tracepoints();
}
-static void *tracepoint_add_probe(const char *name, void *probe)
+static struct tracepoint_func *
+tracepoint_add_probe(const char *name, void *probe, void *data)
{
struct tracepoint_entry *entry;
- void *old;
+ struct tracepoint_func *old;
entry = get_tracepoint(name);
if (!entry) {
entry = add_tracepoint(name);
if (IS_ERR(entry))
- return entry;
+ return (struct tracepoint_func *)entry;
}
- old = tracepoint_entry_add_probe(entry, probe);
+ old = tracepoint_entry_add_probe(entry, probe, data);
if (IS_ERR(old) && !entry->refcount)
remove_tracepoint(entry);
return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
* Returns 0 if ok, error value on error.
* The probe address must at least be aligned on the architecture pointer size.
*/
-int tracepoint_probe_register(const char *name, void *probe)
+int tracepoint_probe_register(const char *name, void *probe, void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe);
+ old = tracepoint_add_probe(name, probe, data);
mutex_unlock(&tracepoints_mutex);
if (IS_ERR(old))
return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
-static void *tracepoint_remove_probe(const char *name, void *probe)
+static struct tracepoint_func *
+tracepoint_remove_probe(const char *name, void *probe, void *data)
{
struct tracepoint_entry *entry;
- void *old;
+ struct tracepoint_func *old;
entry = get_tracepoint(name);
if (!entry)
return ERR_PTR(-ENOENT);
- old = tracepoint_entry_remove_probe(entry, probe);
+ old = tracepoint_entry_remove_probe(entry, probe, data);
if (IS_ERR(old))
return old;
if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
* itself uses stop_machine(), which insures that every preempt disabled section
* have finished.
*/
-int tracepoint_probe_unregister(const char *name, void *probe)
+int tracepoint_probe_unregister(const char *name, void *probe, void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe);
+ old = tracepoint_remove_probe(name, probe, data);
mutex_unlock(&tracepoints_mutex);
if (IS_ERR(old))
return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
*
* caller must call tracepoint_probe_update_all()
*/
-int tracepoint_probe_register_noupdate(const char *name, void *probe)
+int tracepoint_probe_register_noupdate(const char *name, void *probe,
+ void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe);
+ old = tracepoint_add_probe(name, probe, data);
if (IS_ERR(old)) {
mutex_unlock(&tracepoints_mutex);
return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
*
* caller must call tracepoint_probe_update_all()
*/
-int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
+int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
+ void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe);
+ old = tracepoint_remove_probe(name, probe, data);
if (IS_ERR(old)) {
mutex_unlock(&tracepoints_mutex);
return PTR_ERR(old);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b2d70d38dff4..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/highuid.h>
#include <linux/cred.h>
/*
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
schedule_work(&ns->destroyer);
}
EXPORT_SYMBOL(free_user_ns);
+
+uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
+{
+ struct user_namespace *tmp;
+ /*
+ * Pick the uid value to report in namespace @to for @uid held by @cred.
+ * If @cred's user already belongs to @to, @uid is returned unchanged.
+ */
+ if (likely(to == cred->user->user_ns))
+ return uid;
+
+
+ /* Is cred->user the creator of the target user_ns
+ * or the creator of one of its parents?
+ * If so, uid 0 is returned. The walk follows ->creator->user_ns
+ * and stops once it reaches init_user_ns.
+ */
+ for ( tmp = to; tmp != &init_user_ns;
+ tmp = tmp->creator->user_ns ) {
+ if (cred->user == tmp->creator) {
+ return (uid_t)0;
+ }
+ }
+
+ /* No useful relationship so no mapping: overflowuid is returned */
+ return overflowuid;
+}
+
+gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
+{
+ struct user_namespace *tmp;
+ /*
+ * Pick the gid value to report in namespace @to for @gid held by @cred.
+ * If @cred's user already belongs to @to, @gid is returned unchanged.
+ */
+ if (likely(to == cred->user->user_ns))
+ return gid;
+
+ /* Is cred->user the creator of the target user_ns
+ * or the creator of one of its parents?
+ * If so, gid 0 is returned. The walk follows ->creator->user_ns
+ * and stops once it reaches init_user_ns.
+ */
+ for ( tmp = to; tmp != &init_user_ns;
+ tmp = tmp->creator->user_ns ) {
+ if (cred->user == tmp->creator) {
+ return (gid_t)0;
+ }
+ }
+
+ /* No useful relationship so no mapping: overflowgid is returned */
+ return overflowgid;
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 77dabbf64b8f..327d2deb4451 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1110,7 +1110,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
unsigned int cpu = (unsigned long)hcpu;
struct cpu_workqueue_struct *cwq;
struct workqueue_struct *wq;
- int ret = NOTIFY_OK;
+ int err = 0;
action &= ~CPU_TASKS_FROZEN;
@@ -1124,12 +1124,13 @@ undo:
switch (action) {
case CPU_UP_PREPARE:
- if (!create_workqueue_thread(cwq, cpu))
+ err = create_workqueue_thread(cwq, cpu);
+ if (!err)
break;
printk(KERN_ERR "workqueue [%s] for %i failed\n",
wq->name, cpu);
action = CPU_UP_CANCELED;
- ret = NOTIFY_BAD;
+ err = -ENOMEM;
goto undo;
case CPU_ONLINE:
@@ -1150,7 +1151,7 @@ undo:
cpumask_clear_cpu(cpu, cpu_populated_map);
}
- return ret;
+ return notifier_from_errno(err);
}
#ifdef CONFIG_SMP