From d631097577f6fe027f4903f62eabced6445d19bf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 31 Oct 2011 11:07:42 +0200 Subject: tracing: fix event_subsystem ref counting Fix a bug introduced by e9dbfae5, which prevents event_subsystem from ever being released. Ref_count was added to keep track of subsystem users, not for counting events. Subsystem is created with ref_count = 1, so there is no need to increment it for every event, we have nr_events for that. Fix this by touching ref_count only when we actually have a new user - subsystem_open(). Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Link: http://lkml.kernel.org/r/1320052062-7846-1-git-send-email-idryomov@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 581876f9f387..c212a7f934ec 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { - __get_system(system); system->nr_events++; return system->entry; } -- cgit From ed0449af5373abd766c79fbf83254bebc996bd23 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 1 Nov 2011 09:09:35 +0800 Subject: tracing: Restore system filter behavior Though not all events have field 'prev_pid', it was allowed to do this: # echo 'prev_pid == 100' > events/sched/filter but commit 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 (tracing/filter: Swap entire filter of events) broke it without any reason. Link: http://lkml.kernel.org/r/4EAF46CF.8040408@cn.fujitsu.com Signed-off-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 816d3d074979..86040d9c1acc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1649,7 +1649,9 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - goto fail; + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; } list_for_each_entry(call, &ftrace_events, list) { @@ -1658,6 +1660,9 @@ static int replace_system_preds(struct event_subsystem *system, if (strcmp(call->class->system, system->name) != 0) continue; + if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + continue; + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); if (!filter_item) goto fail_mem; -- cgit From dcfce4a095932e6e95d83ad982be3609947963bc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 11 Oct 2011 17:11:08 +0200 Subject: oprofile, x86: Reimplement nmi timer mode using perf event The legacy x86 nmi watchdog code was removed with the implementation of the perf based nmi watchdog. This broke Oprofile's nmi timer mode. To run nmi timer mode we relied on a continuous ticking nmi source which the nmi watchdog provided. The nmi tick was no longer available and current watchdog can not be used anymore since it runs with very long periods in the range of seconds. This patch reimplements the nmi timer mode using a perf counter nmi source. V2: * removing pr_info() * fix undefined reference to `__udivdi3' for 32 bit build * fix section mismatch of .cpuinit.data:nmi_timer_cpu_nb * removed nmi timer setup in arch/x86 * implemented function stubs for op_nmi_init/exit() * made code more readable in oprofile_init() V3: * fix architectural initialization in oprofile_init() * fix CONFIG_OPROFILE_NMI_TIMER dependencies Acked-by: Peter Zijlstra Signed-off-by: Robert Richter --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d1a1bee35228..d2e28bdd523a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1322,6 +1322,7 @@ retry: } raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, @@ -1806,6 +1807,7 @@ retry: out: raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_enable); int perf_event_refresh(struct perf_event *event, int refresh) { -- cgit From 49aa29513ec995f201664cf6eee36e5326ed38bf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Nov 2011 16:46:46 -0400 Subject: tracing: Add boiler plate for subsystem filter The system filter can be used to set multiple event filters that exist within the system. But currently it displays the last filter written that does not necessarily correspond to the filters within the system. The system filter itself is not used to filter any events. The system filter is just a means to set filters of the events within it. Because this causes an ambiguous state when the system filter reads a filter string but the events within the system have different strings it is best to just show a boiler plate: ### global filter ### # Use this to set filters for multiple events. # Only events with the given fields will be affected. # If no events are modified, an error message will be displayed here. If an error occurs while writing to the system filter, the system filter will replace the boiler plate with the error message as it currently does. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 86040d9c1acc..fdc6d22d406b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -27,6 +27,12 @@ #include "trace.h" #include "trace_output.h" +#define DEFAULT_SYS_FILTER_MESSAGE \ + "### global filter ###\n" \ + "# Use this to set filters for multiple events.\n" \ + "# Only events with the given fields will be affected.\n" \ + "# If no events are modified, an error message will be displayed here" + enum filter_op_ids { OP_OR, @@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, "none\n"); + trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } @@ -1838,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!filter) goto out; - replace_filter_string(filter, filter_string); + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + /* * No event actually uses the system filter * we can free it without synchronize_sched(). @@ -1848,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); - if (err) { - append_filter_err(ps, system->filter); - goto out; - } + if (err) + goto err_filter; err = replace_system_preds(system, ps, filter_string); if (err) - append_filter_err(ps, system->filter); + goto err_filter; out: filter_opstack_clear(ps); @@ -1865,6 +1872,11 @@ out_unlock: mutex_unlock(&event_mutex); return err; + +err_filter: + replace_filter_string(filter, filter_string); + append_filter_err(ps, system->filter); + goto out; } #ifdef CONFIG_PERF_EVENTS -- cgit From e5e78d08f3ab3094783b8df08a5b6d1d1a56a58f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Nov 2011 20:24:16 -0400 Subject: lockdep: Show subclass in pretty print of lockdep output The pretty print of the lockdep debug splat uses just the lock name to show how the locking scenario happens. But when it comes to nesting locks, the output becomes confusing which takes away the point of the pretty printing of the lock scenario. Without displaying the subclass info, we get the following output: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(slock-AF_INET); lock(slock-AF_INET); lock(slock-AF_INET); lock(slock-AF_INET); *** DEADLOCK *** The above looks more of a A->A locking bug than a A->B B->A. By adding the subclass to the output, we can see what really happened: other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(slock-AF_INET); lock(slock-AF_INET/1); lock(slock-AF_INET); lock(slock-AF_INET/1); *** DEADLOCK *** This bug was discovered while tracking down a real bug caught by lockdep. Link: http://lkml.kernel.org/r/20111025202049.GB25043@hostway.ca Cc: Peter Zijlstra Reported-by: Thomas Gleixner Tested-by: Simon Kirby Signed-off-by: Steven Rostedt --- kernel/lockdep.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 91d67ce3a8d5..6bd915df5fd3 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -490,36 +490,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static int __print_lock_name(struct lock_class *class) +static void __print_lock_name(struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; - name = class->name; - if (!name) - name = __get_key_name(class->key, str); - - return printk("%s", name); -} - -static void print_lock_name(struct lock_class *class) -{ - char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; - const char *name; - - get_usage_chars(class, usage); - name = class->name; if (!name) { name = __get_key_name(class->key, str); - printk(" (%s", name); + printk("%s", name); } else { - printk(" (%s", name); + printk("%s", name); if (class->name_version > 1) printk("#%d", class->name_version); if (class->subclass) printk("/%d", class->subclass); } +} + +static void print_lock_name(struct lock_class *class) +{ + char usage[LOCK_USAGE_CHARS]; + + get_usage_chars(class, usage); + + printk(" ("); + __print_lock_name(class); printk("){%s}", usage); } -- cgit From 8ee3c92b7f2751c392be2d8fc360a410480b8757 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Nov 2011 10:45:23 -0400 Subject: ftrace: Remove force undef config value left for testing A forced undef of a config value was used for testing and was accidently left in during the final commit. This causes x86 to run slower than needed while running function tracing as well as causes the function graph selftest to fail when DYNMAIC_FTRACE is not set. This is because the code in MCOUNT expects the ftrace code to be processed with the config value set that happened to be forced not set. The forced config option was left in by: commit 6331c28c962561aee59e5a493b7556a4bb585957 ftrace: Fix dynamic selftest failure on some archs Link: http://lkml.kernel.org/r/20111102150255.GA6973@debian Cc: stable@vger.kernel.org Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 077d85387908..0fcc6caead1c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,7 +151,6 @@ void clear_ftrace_function(void) ftrace_pid_function = ftrace_stub; } -#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their -- cgit From c8452afb7426f7e21388492f40227582e3e83879 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 18 Oct 2011 19:55:51 +0200 Subject: jump_label: jump_label_inc may return before the code is patched If cpu A calls jump_label_inc() just after atomic_add_return() is called by cpu B, atomic_inc_not_zero() will return value greater then zero and jump_label_inc() will return to a caller before jump_label_update() finishes its job on cpu B. Link: http://lkml.kernel.org/r/20111018175551.GH17571@redhat.com Cc: stable@vger.kernel.org Cc: Peter Zijlstra Acked-by: Jason Baron Signed-off-by: Gleb Natapov Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a8ce45097f3d..e6f1f24ad577 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -66,8 +66,9 @@ void jump_label_inc(struct jump_label_key *key) return; jump_label_lock(); - if (atomic_add_return(1, &key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) jump_label_update(key, JUMP_LABEL_ENABLE); + atomic_inc(&key->enabled); jump_label_unlock(); } -- cgit From d4d34b981a5327eec956c6cb4cce397ce6f57279 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Nov 2011 20:32:39 -0400 Subject: ftrace: Fix hash record accounting bug If the set_ftrace_filter is cleared by writing just whitespace to it, then the filter hash refcounts will be decremented but not updated. This causes two bugs: 1) No functions will be enabled for tracing when they all should be 2) If the users clears the set_ftrace_filter twice, it will crash ftrace: ------------[ cut here ]------------ WARNING: at /home/rostedt/work/git/linux-trace.git/kernel/trace/ftrace.c:1384 __ftrace_hash_rec_update.part.27+0x157/0x1a7() Modules linked in: Pid: 2330, comm: bash Not tainted 3.1.0-test+ #32 Call Trace: [] warn_slowpath_common+0x83/0x9b [] warn_slowpath_null+0x1a/0x1c [] __ftrace_hash_rec_update.part.27+0x157/0x1a7 [] ? ftrace_regex_release+0xa7/0x10f [] ? kfree+0xe5/0x115 [] ftrace_hash_move+0x2e/0x151 [] ftrace_regex_release+0xba/0x10f [] fput+0xfd/0x1c2 [] filp_close+0x6d/0x78 [] sys_dup3+0x197/0x1c1 [] sys_dup2+0x4f/0x54 [] system_call_fastpath+0x16/0x1b ---[ end trace 77a3a7ee73794a02 ]--- Link: http://lkml.kernel.org/r/20111101141420.GA4918@debian Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fcc6caead1c..7caa4505508d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1210,7 +1210,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, if (!src->count) { free_ftrace_hash_rcu(*dst); rcu_assign_pointer(*dst, EMPTY_HASH); - return 0; + /* still need to update the function records */ + ret = 0; + goto out; } /* -- cgit From 7e9a49ef542610609144d1afcd516dc3fafac4d6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Nov 2011 16:08:49 +0100 Subject: tracing/latency: Fix header output for latency tracers In case the the graph tracer (CONFIG_FUNCTION_GRAPH_TRACER) or even the function tracer (CONFIG_FUNCTION_TRACER) are not set, the latency tracers do not display proper latency header. The involved/fixed latency tracers are: wakeup_rt wakeup preemptirqsoff preemptoff irqsoff The patch adds proper handling of tracer configuration options for latency tracers, and displaying correct header info accordingly. * The current output (for wakeup tracer) with both graph and function tracers disabled is: # tracer: wakeup # -0 0d.h5 1us+: 0:120:R + [000] 7: 0:R watchdog/0 -0 0d.h5 3us+: ttwu_do_activate.clone.1 <-try_to_wake_up ... * The fixed output is: # tracer: wakeup # # wakeup latency trace v1.1.5 on 3.1.0-tip+ # -------------------------------------------------------------------- # latency: 55 us, #4/4, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: migration/0-6 (uid:0 nice:0 policy:1 rt_prio:99) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / delay # cmd pid ||||| time | caller # \ / ||||| \ | / cat-1129 0d..4 1us : 1129:120:R + [000] 6: 0:R migration/0 cat-1129 0d..4 2us+: ttwu_do_activate.clone.1 <-try_to_wake_up * The current output (for wakeup tracer) with only function tracer enabled is: # tracer: wakeup # cat-1140 0d..4 1us+: 1140:120:R + [000] 6: 0:R migration/0 cat-1140 0d..4 2us : ttwu_do_activate.clone.1 <-try_to_wake_up * The fixed output is: # tracer: wakeup # # wakeup latency trace v1.1.5 on 3.1.0-tip+ # -------------------------------------------------------------------- # latency: 207 us, #109/109, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2) # ----------------- # | task: watchdog/1-12 (uid:0 nice:0 policy:1 rt_prio:99) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / delay # cmd pid ||||| time | caller # \ / ||||| \ | / -0 1d.h5 1us+: 0:120:R + [001] 12: 0:R watchdog/1 -0 1d.h5 3us : ttwu_do_activate.clone.1 <-try_to_wake_up Link: http://lkml.kernel.org/r/20111107150849.GE1807@m.brq.redhat.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 15 +++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_irqsoff.c | 13 ++++++++++++- kernel/trace/trace_sched_wakeup.c | 13 ++++++++++++- 4 files changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b24a72d35008..b296186eb93a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2140,6 +2140,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } +void trace_latency_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_trace_header(m, iter); + + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); +} + void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 092e1f8d18dc..f8ec2291b522 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a1a3359996a7..a248c686b2b8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) } static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } -static void irqsoff_print_header(struct seq_file *s) { } static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void irqsoff_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e4a70c0c71b6..ff791ea48b57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) } static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } -static void wakeup_print_header(struct seq_file *s) { } static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void wakeup_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void wakeup_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* -- cgit From 9251f904f95175b4a1d8cbc0449e748f9edd7629 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 16 Oct 2011 17:15:04 +0200 Subject: perf: Carve out callchain functionality Split the callchain code from the perf events core into a new kernel/events/callchain.c file. This simplifies a bit the big core.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian [keep ctx recursion handling inline and use internal headers] Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1318778104-17152-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/Makefile | 2 +- kernel/events/callchain.c | 191 ++++++++++++++++++++++++++++++++++++++++++ kernel/events/core.c | 209 ---------------------------------------------- kernel/events/internal.h | 39 ++++++++- 4 files changed, 230 insertions(+), 211 deletions(-) create mode 100644 kernel/events/callchain.c (limited to 'kernel') diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 89e5e8aa4c36..22d901f9caf4 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y := core.o ring_buffer.o +obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c new file mode 100644 index 000000000000..057e24b665cf --- /dev/null +++ b/kernel/events/callchain.c @@ -0,0 +1,191 @@ +/* + * Performance events callchain code, extracted from core.c: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. + * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include "internal.h" + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +static struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e8457da6f95..eadac69265fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2569,215 +2569,6 @@ static u64 perf_event_read(struct perf_event *event) return perf_event_count(event); } -/* - * Callchain support - */ - -struct callchain_cpus_entries { - struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; -}; - -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); -static atomic_t nr_callchain_events; -static DEFINE_MUTEX(callchain_mutex); -struct callchain_cpus_entries *callchain_cpus_entries; - - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static void release_callchain_buffers_rcu(struct rcu_head *head) -{ - struct callchain_cpus_entries *entries; - int cpu; - - entries = container_of(head, struct callchain_cpus_entries, rcu_head); - - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - - kfree(entries); -} - -static void release_callchain_buffers(void) -{ - struct callchain_cpus_entries *entries; - - entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); - call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); -} - -static int alloc_callchain_buffers(void) -{ - int cpu; - int size; - struct callchain_cpus_entries *entries; - - /* - * We can't use the percpu allocation API for data that can be - * accessed from NMI. Use a temporary manual per cpu allocation - * until that gets sorted out. - */ - size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); - - entries = kzalloc(size, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; - - for_each_possible_cpu(cpu) { - entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, - cpu_to_node(cpu)); - if (!entries->cpu_entries[cpu]) - goto fail; - } - - rcu_assign_pointer(callchain_cpus_entries, entries); - - return 0; - -fail: - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - kfree(entries); - - return -ENOMEM; -} - -static int get_callchain_buffers(void) -{ - int err = 0; - int count; - - mutex_lock(&callchain_mutex); - - count = atomic_inc_return(&nr_callchain_events); - if (WARN_ON_ONCE(count < 1)) { - err = -EINVAL; - goto exit; - } - - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - goto exit; - } - - err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); -exit: - mutex_unlock(&callchain_mutex); - - return err; -} - -static void put_callchain_buffers(void) -{ - if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { - release_callchain_buffers(); - mutex_unlock(&callchain_mutex); - } -} - -static int get_recursion_context(int *recursion) -{ - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (recursion[rctx]) - return -1; - - recursion[rctx]++; - barrier(); - - return rctx; -} - -static inline void put_recursion_context(int *recursion, int rctx) -{ - barrier(); - recursion[rctx]--; -} - -static struct perf_callchain_entry *get_callchain_entry(int *rctx) -{ - int cpu; - struct callchain_cpus_entries *entries; - - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); - if (*rctx == -1) - return NULL; - - entries = rcu_dereference(callchain_cpus_entries); - if (!entries) - return NULL; - - cpu = smp_processor_id(); - - return &entries->cpu_entries[cpu][*rctx]; -} - -static void -put_callchain_entry(int rctx) -{ - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); -} - -static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - int rctx; - struct perf_callchain_entry *entry; - - - entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - - if (!entry) - goto exit_put; - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - -exit_put: - put_callchain_entry(rctx); - - return entry; -} - /* * Initialize the perf_event context in a task_struct: */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09097dd8116c..be4a43f6de4f 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -1,6 +1,10 @@ #ifndef _KERNEL_EVENTS_INTERNAL_H #define _KERNEL_EVENTS_INTERNAL_H +#include + +/* Buffer handling */ + #define RING_BUFFER_WRITABLE 0x01 struct ring_buffer { @@ -64,7 +68,7 @@ static inline int page_order(struct ring_buffer *rb) } #endif -static unsigned long perf_data_size(struct ring_buffer *rb) +static inline unsigned long perf_data_size(struct ring_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } @@ -93,4 +97,37 @@ __output_copy(struct perf_output_handle *handle, } while (len); } +/* Callchain handling */ +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern int get_callchain_buffers(void); +extern void put_callchain_buffers(void); + +static inline int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + #endif /* _KERNEL_EVENTS_INTERNAL_H */ -- cgit From 5d81e5cfb37a174e8ddc0413e2e70cdf05807ace Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Mon, 7 Nov 2011 15:54:12 +0300 Subject: events: Don't divide events if it has field period This patch solves the following problem: Now some samples may be lost due to throttling. The number of samples is restricted by sysctl_perf_event_sample_rate/HZ. A trace event is divided on some samples according to event's period. I don't sure, that we should generate more than one sample on each trace event. I think the better way to use SAMPLE_PERIOD. E.g.: I want to trace when a process sleeps. I created a process, which sleeps for 1ms and for 4ms. perf got 100 events in both cases. swapper 0 [000] 1141.371830: sched_stat_sleep: comm=foo pid=1801 delay=1386750 [ns] swapper 0 [000] 1141.369444: sched_stat_sleep: comm=foo pid=1801 delay=4499585 [ns] In the first case a kernel want to send 4499585 events and in the second case it wants to send 1386750 events. perf-reports shows that process sleeps in both places equal time. It's bug. With this patch kernel generates one event on each "sleep" and the time slice is saved in the field "period". Perf knows how handle it. Signed-off-by: Andrew Vagin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1320670457-2633428-3-git-send-email-avagin@openvz.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eadac69265fc..8d9dea56c262 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4528,7 +4528,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct hw_perf_event *hwc = &event->hw; int throttle = 0; - data->period = event->hw.last_period; if (!overflow) overflow = perf_swevent_set_period(event); @@ -4562,6 +4561,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!is_sampling_event(event)) return; + if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { + data->period = nr; + return perf_swevent_overflow(event, 1, data, regs); + } else + data->period = event->hw.last_period; + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); -- cgit From 77271ce4b2c0df0a76ad1cbb6a95b07e1f88c1ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Nov 2011 09:34:33 -0500 Subject: tracing: Add irq, preempt-count and need resched info to default trace output People keep asking how to get the preempt count, irq, and need resched info and we keep telling them to enable the latency format. Some developers think that traces without this info is completely useless, and for a lot of tasks it is useless. The first option was to enable the latency trace as the default format, but the header for the latency format is pretty useless for most tracers and it also does the timestamp in straight microseconds from the time the trace started. This is sometimes more difficult to read as the default trace is seconds from the start of boot up. Latency format: # tracer: nop # # nop latency trace v1.1.5 on 3.2.0-rc1-test+ # -------------------------------------------------------------------- # latency: 0 us, #159771/64234230, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / delay # cmd pid ||||| time | caller # \ / ||||| \ | / migratio-6 0...2 41778231us+: rcu_note_context_switch <-__schedule migratio-6 0...2 41778233us : trace_rcu_utilization <-rcu_note_context_switch migratio-6 0...2 41778235us+: rcu_sched_qs <-rcu_note_context_switch migratio-6 0d..2 41778236us+: rcu_preempt_qs <-rcu_note_context_switch migratio-6 0...2 41778238us : trace_rcu_utilization <-rcu_note_context_switch migratio-6 0...2 41778239us+: debug_lockdep_rcu_enabled <-__schedule default format: # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | migration/0-6 [000] 50.025810: rcu_note_context_switch <-__schedule migration/0-6 [000] 50.025812: trace_rcu_utilization <-rcu_note_context_switch migration/0-6 [000] 50.025813: rcu_sched_qs <-rcu_note_context_switch migration/0-6 [000] 50.025815: rcu_preempt_qs <-rcu_note_context_switch migration/0-6 [000] 50.025817: trace_rcu_utilization <-rcu_note_context_switch migration/0-6 [000] 50.025818: debug_lockdep_rcu_enabled <-__schedule migration/0-6 [000] 50.025820: debug_lockdep_rcu_enabled <-__schedule The latency format header has latency information that is pretty meaningless for most tracers. Although some of the header is useful, and we can add that later to the default format as well. What is really useful with the latency format is the irqs-off, need-resched hard/softirq context and the preempt count. This commit adds the option irq-info which is on by default that adds this information: # tracer: nop # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | -0 [000] d..2 49.309305: cpuidle_get_driver <-cpuidle_idle_call -0 [000] d..2 49.309307: mwait_idle <-cpu_idle -0 [000] d..2 49.309309: need_resched <-mwait_idle -0 [000] d..2 49.309310: test_ti_thread_flag <-need_resched -0 [000] d..2 49.309312: trace_power_start.constprop.13 <-mwait_idle -0 [000] d..2 49.309313: trace_cpu_idle <-mwait_idle -0 [000] d..2 49.309315: need_resched <-mwait_idle If a user wants the old format, they can disable the 'irq-info' option: # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | -0 [000] 49.309305: cpuidle_get_driver <-cpuidle_idle_call -0 [000] 49.309307: mwait_idle <-cpu_idle -0 [000] 49.309309: need_resched <-mwait_idle -0 [000] 49.309310: test_ti_thread_flag <-need_resched -0 [000] 49.309312: trace_power_start.constprop.13 <-mwait_idle -0 [000] 49.309313: trace_cpu_idle <-mwait_idle -0 [000] 49.309315: need_resched <-mwait_idle Requested-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++++++++++++++---- kernel/trace/trace.h | 1 + kernel/trace/trace_output.c | 16 ++++++++++++++-- 3 files changed, 35 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e158cc84ced..fa5cee639962 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | + TRACE_ITER_IRQ_INFO; static int trace_stop_count; static DEFINE_RAW_SPINLOCK(tracing_start_lock); @@ -426,6 +427,7 @@ static const char *trace_options[] = { "record-cmd", "overwrite", "disable_on_free", + "irq-info", NULL }; @@ -1857,10 +1859,20 @@ static void print_lat_help_header(struct seq_file *m) static void print_func_help_header(struct seq_file *m) { - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } +static void print_func_help_header_irq(struct seq_file *m) +{ + seq_puts(m, "# _-----=> irqs-off\n"); + seq_puts(m, "# / _----=> need-resched\n"); + seq_puts(m, "# | / _---=> hardirq/softirq\n"); + seq_puts(m, "# || / _--=> preempt-depth\n"); + seq_puts(m, "# ||| / delay\n"); + seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | |||| | |\n"); +} void print_trace_header(struct seq_file *m, struct trace_iterator *iter) @@ -2170,8 +2182,12 @@ void trace_default_header(struct seq_file *m) if (!(trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_func_help_header(m); + if (!(trace_flags & TRACE_ITER_VERBOSE)) { + if (trace_flags & TRACE_ITER_IRQ_INFO) + print_func_help_header_irq(m); + else + print_func_help_header(m); + } } } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f8ec2291b522..2c2657462ac3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -655,6 +655,7 @@ enum trace_iterator_flags { TRACE_ITER_RECORD_CMD = 0x100000, TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, + TRACE_ITER_IRQ_INFO = 0x800000, }; /* diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 51999309a6cf..0d6ff3555942 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter) unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; char comm[TASK_COMM_LEN]; + int ret; trace_find_cmdline(entry->pid, comm); - return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", - comm, entry->pid, iter->cpu, secs, usec_rem); + ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", + comm, entry->pid, iter->cpu); + if (!ret) + return 0; + + if (trace_flags & TRACE_ITER_IRQ_INFO) { + ret = trace_print_lat_fmt(s, entry); + if (!ret) + return 0; + } + + return trace_seq_printf(s, " %5lu.%06lu: ", + secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) -- cgit From 39eaf7ef884dcc44f7ff1bac803ca2a1dcf43544 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Nov 2011 10:35:16 -0500 Subject: tracing: Add entries in buffer and total entries to default output header Knowing the number of event entries in the ring buffer compared to the total number that were written is useful information. The latency format gives this information and there's no reason that the default format does not. This information is now added to the default header, along with the number of online CPUs: # tracer: nop # # entries-in-buffer/entries-written: 159836/64690869 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | -0 [000] ...2 49.442971: local_touch_nmi <-cpu_idle -0 [000] d..2 49.442973: enter_idle <-cpu_idle -0 [000] d..2 49.442974: atomic_notifier_call_chain <-enter_idle -0 [000] d..2 49.442976: __atomic_notifier_call_chain <-atomic_notifier The above shows that the trace contains 159836 entries, but 64690869 were written. One could figure out that there were 64531033 entries that were dropped. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 72 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa5cee639962..7392070ffc39 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1845,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p) trace_event_read_unlock(); } +static void +get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +{ + unsigned long count; + int cpu; + + *total = 0; + *entries = 0; + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(tr->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (tr->data[cpu]->skipped_entries) { + count -= tr->data[cpu]->skipped_entries; + /* total is the same as the entries */ + *total += count; + } else + *total += count + + ring_buffer_overrun_cpu(tr->buffer, cpu); + *entries += count; + } +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1857,14 +1884,27 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_func_help_header(struct seq_file *m) +static void print_event_info(struct trace_array *tr, struct seq_file *m) { + unsigned long total; + unsigned long entries; + + get_total_entries(tr, &total, &entries); + seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", + entries, total, num_online_cpus()); + seq_puts(m, "#\n"); +} + +static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +{ + print_event_info(tr, m); seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void print_func_help_header_irq(struct seq_file *m) +static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) { + print_event_info(tr, m); seq_puts(m, "# _-----=> irqs-off\n"); seq_puts(m, "# / _----=> need-resched\n"); seq_puts(m, "# | / _---=> hardirq/softirq\n"); @@ -1881,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long entries = 0; - unsigned long total = 0; - unsigned long count; + unsigned long entries; + unsigned long total; const char *name = "preemption"; - int cpu; if (type) name = type->name; - - for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; - /* total is the same as the entries */ - total += count; - } else - total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); - entries += count; - } + get_total_entries(tr, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -2184,9 +2206,9 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(m); + print_func_help_header_irq(iter->tr, m); else - print_func_help_header(m); + print_func_help_header(iter->tr, m); } } } -- cgit From 0f5a2601284237e2ba089389fd75d67f77626cef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Nov 2011 14:38:16 +0100 Subject: perf: Avoid a useless pmu_disable() in the perf-tick Gleb writes: > Currently pmu is disabled and re-enabled on each timer interrupt even > when no rotation or frequency adjustment is needed. On Intel CPU this > results in two writes into PERF_GLOBAL_CTRL MSR per tick. On bare metal > it does not cause significant slowdown, but when running perf in a virtual > machine it leads to 20% slowdown on my machine. Cure this by keeping a perf_event_context::nr_freq counter that counts the number of active events that require frequency adjustments and use this in a similar fashion to the already existing nr_events != nr_active test in perf_rotate_context(). By being able to exclude both rotation and frequency adjustments a-priory for the common case we can avoid the otherwise superfluous PMU disable. Suggested-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-515yhoatehd3gza7we9fapaa@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 48 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a355ffb0b28f..b3fed52aaf20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1130,6 +1130,8 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; ctx->nr_active--; + if (event->attr.freq && event->attr.sample_freq) + ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -1407,6 +1409,8 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; + if (event->attr.freq && event->attr.sample_freq) + ctx->nr_freq++; if (event->attr.exclusive) cpuctx->exclusive = 1; @@ -2329,6 +2333,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) u64 interrupts, now; s64 delta; + if (!ctx->nr_freq) + return; + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2384,12 +2391,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0, remove = 1, freq = 0; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; + if (cpuctx->ctx.nr_freq) + freq = 1; } ctx = cpuctx->task_ctx; @@ -2397,33 +2406,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; + if (ctx->nr_freq) + freq = 1; } + if (!rotate && !freq) + goto done; + perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - if (!rotate) - goto done; + if (freq) { + perf_ctx_adjust_freq(&cpuctx->ctx, interval); + if (ctx) + perf_ctx_adjust_freq(ctx, interval); + } - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (rotate) { + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + perf_event_sched_in(cpuctx, ctx, current); + } - perf_event_sched_in(cpuctx, ctx, current); + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); done: if (remove) list_del_init(&cpuctx->rotation_list); - - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } void perf_event_task_tick(void) -- cgit From 1d9b482e78d3b16f97f85a82849f82db7eed3102 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Nov 2011 12:34:20 +0100 Subject: perf: Remove superfluous arguments Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-yv4o74vh90suyghccgykbnry@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b3fed52aaf20..eeda5403590c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1667,8 +1667,7 @@ retry: * Note: this works for group members as well as group leaders * since the non-leader members' sibling_lists will be empty. */ -static void __perf_event_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) +static void __perf_event_mark_enabled(struct perf_event *event) { struct perf_event *sub; u64 tstamp = perf_event_time(event); @@ -1706,7 +1705,7 @@ static int __perf_event_enable(void *info) */ perf_cgroup_set_timestamp(current, ctx); - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); if (!event_filter_match(event)) { if (is_cgroup_event(event)) @@ -1787,7 +1786,7 @@ void perf_event_enable(struct perf_event *event) retry: if (!ctx->is_active) { - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); goto out; } @@ -2466,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); return 1; } -- cgit From b79387ef185af2323594920923cecba5753c3817 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Nov 2011 11:25:43 +0100 Subject: perf: Fix enable_on_exec for sibling events Deng-Cheng Zhu reported that sibling events that were created disabled with enable_on_exec would never get enabled. Iterate all events instead of the group lists. Reported-by: Deng-Cheng Zhu Tested-by: Deng-Cheng Zhu Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1322048382.14799.41.camel@twins Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eeda5403590c..3c1541d7a53d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2497,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + list_for_each_entry(event, &ctx->event_list, event_entry) { ret = event_enable_on_exec(event, ctx); if (ret) enabled = 1; -- cgit From b202952075f62603bea9bfb6ebc6b0420db11949 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 27 Nov 2011 17:59:09 +0200 Subject: perf, core: Rate limit perf_sched_events jump_label patching jump_lable patching is very expensive operation that involves pausing all cpus. The patching of perf_sched_events jump_label is easily controllable from userspace by unprivileged user. When te user runs a loop like this: "while true; do perf stat -e cycles true; done" ... the performance of my test application that just increments a counter for one second drops by 4%. This is on a 16 cpu box with my test application using only one of them. An impact on a real server doing real work will be worse. Performance of KVM PMU drops nearly 50% due to jump_lable for "perf record" since KVM PMU implementation creates and destroys perf event frequently. This patch introduces a way to rate limit jump_label patching and uses it to fix the above problem. I believe that as jump_label use will spread the problem will become more common and thus solving it in a generic code is appropriate. Also fixing it in the perf code would result in moving jump_label accounting logic to perf code with all the ifdefs in case of JUMP_LABEL=n kernel. With this patch all details are nicely hidden inside jump_label code. Signed-off-by: Gleb Natapov Acked-by: Jason Baron Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111127155909.GO2557@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 ++++++++----- kernel/jump_label.c | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3c1541d7a53d..3a3b1a18f490 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key perf_sched_events __read_mostly; +struct jump_label_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -2748,7 +2748,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_sched_events); + jump_label_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2759,7 +2759,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec(&perf_sched_events); + jump_label_dec_deferred(&perf_sched_events); } } @@ -5784,7 +5784,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events); + jump_label_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6022,7 +6022,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events); + jump_label_inc(&perf_sched_events.key); } /* @@ -6868,6 +6868,9 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); + + /* do not patch jump label more than once per second */ + jump_label_rate_limit(&perf_sched_events, HZ); } static int __init perf_event_sysfs_init(void) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 66ff7109f697..51a175ab0a03 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -72,15 +72,46 @@ void jump_label_inc(struct jump_label_key *key) jump_label_unlock(); } -void jump_label_dec(struct jump_label_key *key) +static void __jump_label_dec(struct jump_label_key *key, + unsigned long rate_limit, struct delayed_work *work) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) return; - jump_label_update(key, JUMP_LABEL_DISABLE); + if (rate_limit) { + atomic_inc(&key->enabled); + schedule_delayed_work(work, rate_limit); + } else + jump_label_update(key, JUMP_LABEL_DISABLE); + jump_label_unlock(); } +static void jump_label_update_timeout(struct work_struct *work) +{ + struct jump_label_key_deferred *key = + container_of(work, struct jump_label_key_deferred, work.work); + __jump_label_dec(&key->key, 0, NULL); +} + +void jump_label_dec(struct jump_label_key *key) +{ + __jump_label_dec(key, 0, NULL); +} + +void jump_label_dec_deferred(struct jump_label_key_deferred *key) +{ + __jump_label_dec(&key->key, key->timeout, &key->work); +} + + +void jump_label_rate_limit(struct jump_label_key_deferred *key, + unsigned long rl) +{ + key->timeout = rl; + INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); +} + static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (entry->code <= (unsigned long)end && -- cgit From 9cdbe1cbac4ec318037297175587a0080acc9d11 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Dec 2011 17:27:29 +0100 Subject: jump_label, x86: Fix section mismatch WARNING: arch/x86/kernel/built-in.o(.text+0x4c71): Section mismatch in reference from the function arch_jump_label_transform_static() to the function .init.text:text_poke_early() The function arch_jump_label_transform_static() references the function __init text_poke_early(). This is often because arch_jump_label_transform_static lacks a __init annotation or the annotation of text_poke_early is wrong. Signed-off-by: Peter Zijlstra Cc: Jason Baron Link: http://lkml.kernel.org/n/tip-9lefe89mrvurrwpqw5h8xm8z@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 51a175ab0a03..3fb7b79c86fd 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -142,7 +142,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, * running code can override this to make the non-live update case * cheaper. */ -void __weak arch_jump_label_transform_static(struct jump_entry *entry, +void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { arch_jump_label_transform(entry, type); -- cgit From ac99b862fb98a36929831791da31714f709c2aa8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jul 2011 14:20:14 +0200 Subject: jump_label: Provide jump_label_key initializers Provide two initializers for jump_label_key that initialize it enabled or disabled. Also modify all jump_label code to allow for jump_labels to be initialized enabled. Signed-off-by: Peter Zijlstra Cc: Jason Baron Link: http://lkml.kernel.org/n/tip-p40e3yj21b68y03z1yv825e7@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 3fb7b79c86fd..30c3c7708132 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -248,8 +248,13 @@ void jump_label_apply_nops(struct module *mod) if (iter_start == iter_stop) return; - for (iter = iter_start; iter < iter_stop; iter++) - arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); + for (iter = iter_start; iter < iter_stop; iter++) { + struct jump_label_key *iterk; + + iterk = (struct jump_label_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? + JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + } } static int jump_label_add_module(struct module *mod) @@ -289,8 +294,7 @@ static int jump_label_add_module(struct module *mod) key->next = jlm; if (jump_label_enabled(key)) - __jump_label_update(key, iter, iter_stop, - JUMP_LABEL_ENABLE); + __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } return 0; -- cgit