diff options
Diffstat (limited to 'kernel/trace/trace_osnoise.c')
| -rw-r--r-- | kernel/trace/trace_osnoise.c | 1030 |
1 files changed, 886 insertions, 144 deletions
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 870a08da5b48..827104d00bc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -24,6 +24,7 @@ #include <linux/sched/clock.h> #include <uapi/linux/sched/types.h> #include <linux/sched.h> +#include <linux/string.h> #include "trace.h" #ifdef CONFIG_X86_LOCAL_APIC @@ -49,6 +50,28 @@ #define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */ /* + * osnoise/options entries. + */ +enum osnoise_options_index { + OSN_DEFAULTS = 0, + OSN_WORKLOAD, + OSN_PANIC_ON_STOP, + OSN_PREEMPT_DISABLE, + OSN_IRQ_DISABLE, + OSN_MAX +}; + +static const char * const osnoise_options_str[OSN_MAX] = { + "DEFAULTS", + "OSNOISE_WORKLOAD", + "PANIC_ON_STOP", + "OSNOISE_PREEMPT_DISABLE", + "OSNOISE_IRQ_DISABLE" }; + +#define OSN_DEFAULT_OPTIONS 0x2 +static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; + +/* * trace_array of the enabled osnoise/timerlat instances. */ struct osnoise_instance { @@ -125,9 +148,8 @@ static void osnoise_unregister_instance(struct trace_array *tr) * register/unregister serialization is provided by trace's * trace_types_lock. */ - lockdep_assert_held(&trace_types_lock); - - list_for_each_entry_rcu(inst, &osnoise_instances, list) { + list_for_each_entry_rcu(inst, &osnoise_instances, list, + lockdep_is_held(&trace_types_lock)) { if (inst->tr == tr) { list_del_rcu(&inst->list); found = 1; @@ -138,7 +160,7 @@ static void osnoise_unregister_instance(struct trace_array *tr) if (!found) return; - kvfree_rcu(inst); + kvfree_rcu_mightsleep(inst); } /* @@ -160,6 +182,7 @@ struct osn_irq { #define IRQ_CONTEXT 0 #define THREAD_CONTEXT 1 +#define THREAD_URET 2 /* * sofirq runtime info. */ @@ -196,7 +219,7 @@ struct osnoise_variables { /* * Per-cpu runtime information. */ -DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var); +static DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var); /* * this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU @@ -206,6 +229,11 @@ static inline struct osnoise_variables *this_cpu_osn_var(void) return this_cpu_ptr(&per_cpu_osnoise_var); } +/* + * Protect the interface. + */ +static struct mutex interface_lock; + #ifdef CONFIG_TIMERLAT_TRACER /* * Runtime information for the timer mode. @@ -217,9 +245,10 @@ struct timerlat_variables { u64 abs_period; bool tracing_thread; u64 count; + bool uthread_migrate; }; -DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); +static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); /* * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU @@ -236,14 +265,20 @@ static inline void tlat_var_reset(void) { struct timerlat_variables *tlat_var; int cpu; + + /* Synchronize with the timerlat interfaces */ + mutex_lock(&interface_lock); /* * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + if (tlat_var->kthread) + hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); } + mutex_unlock(&interface_lock); } #else /* CONFIG_TIMERLAT_TRACER */ #define tlat_var_reset() do {} while (0) @@ -261,7 +296,7 @@ static inline void osn_var_reset(void) * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); memset(osn_var, 0, sizeof(*osn_var)); } @@ -282,38 +317,6 @@ static inline void osn_var_reset_all(void) bool trace_osnoise_callback_enabled; /* - * osnoise sample structure definition. Used to store the statistics of a - * sample run. - */ -struct osnoise_sample { - u64 runtime; /* runtime */ - u64 noise; /* noise */ - u64 max_sample; /* max single noise sample */ - int hw_count; /* # HW (incl. hypervisor) interference */ - int nmi_count; /* # NMIs during this sample */ - int irq_count; /* # IRQs during this sample */ - int softirq_count; /* # softirqs during this sample */ - int thread_count; /* # threads during this sample */ -}; - -#ifdef CONFIG_TIMERLAT_TRACER -/* - * timerlat sample structure definition. Used to store the statistics of - * a sample run. - */ -struct timerlat_sample { - u64 timer_latency; /* timer_latency */ - unsigned int seqnum; /* unique sequence */ - int context; /* timer context */ -}; -#endif - -/* - * Protect the interface. - */ -struct mutex interface_lock; - -/* * Tracer data. */ static struct osnoise_data { @@ -326,7 +329,7 @@ static struct osnoise_data { u64 print_stack; /* print IRQ stack if total > */ int timerlat_tracer; /* timerlat tracer */ #endif - bool tainted; /* infor users and developers about a problem */ + bool tainted; /* info users and developers about a problem */ } osnoise_data = { .sample_period = DEFAULT_SAMPLE_PERIOD, .sample_runtime = DEFAULT_SAMPLE_RUNTIME, @@ -468,9 +471,8 @@ static void print_osnoise_headers(struct seq_file *s) * Record an osnoise_sample into the tracer buffer. */ static void -__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) +__record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -488,22 +490,23 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe entry->softirq_count = sample->softirq_count; entry->thread_count = sample->thread_count; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* - * Record an osnoise_sample on all osnoise instances. + * Record an osnoise_sample on all osnoise instances and fire trace event. */ -static void trace_osnoise_sample(struct osnoise_sample *sample) +static void record_osnoise_sample(struct osnoise_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_osnoise_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_osnoise_sample(sample, buffer); + __record_osnoise_sample(sample, buffer); } rcu_read_unlock(); } @@ -547,9 +550,8 @@ static void print_timerlat_headers(struct seq_file *s) #endif /* CONFIG_PREEMPT_RT */ static void -__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) +__record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -562,22 +564,23 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf entry->context = sample->context; entry->timer_latency = sample->timer_latency; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* * Record an timerlat_sample into the tracer buffer. */ -static void trace_timerlat_sample(struct timerlat_sample *sample) +static void record_timerlat_sample(struct timerlat_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_timerlat_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_timerlat_sample(sample, buffer); + __record_timerlat_sample(sample, buffer); } rcu_read_unlock(); } @@ -625,7 +628,6 @@ static void timerlat_save_stack(int skip) static void __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct stack_entry *entry; @@ -636,11 +638,10 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u entry = ring_buffer_event_data(event); - memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; + memcpy(&entry->caller, fstack->calls, size); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -737,7 +738,7 @@ cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration) /* * get_int_safe_duration - Get the duration of a window * - * The irq, softirq and thread varaibles need to have its duration without + * The irq, softirq and thread variables need to have its duration without * the interference from higher priority interrupts. Instead of keeping a * variable to discount the interrupt interference from these variables, the * starting time of these variables are pushed forward with the interrupt's @@ -917,7 +918,7 @@ void osnoise_trace_irq_entry(int id) void osnoise_trace_irq_exit(int id, const char *desc) { struct osnoise_variables *osn_var = this_cpu_osn_var(); - int duration; + s64 duration; if (!osn_var->sampling) return; @@ -1048,7 +1049,7 @@ static void trace_softirq_entry_callback(void *data, unsigned int vec_nr) static void trace_softirq_exit_callback(void *data, unsigned int vec_nr) { struct osnoise_variables *osn_var = this_cpu_osn_var(); - int duration; + s64 duration; if (!osn_var->sampling) return; @@ -1144,7 +1145,7 @@ thread_entry(struct osnoise_variables *osn_var, struct task_struct *t) static void thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) { - int duration; + s64 duration; if (!osn_var->sampling) return; @@ -1160,6 +1161,89 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) osn_var->thread.arrival_time = 0; } +#ifdef CONFIG_TIMERLAT_TRACER +/* + * osnoise_stop_exception - Stop tracing and the tracer. + */ +static __always_inline void osnoise_stop_exception(char *msg, int cpu) +{ + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + tracer_tracing_off(tr); + } + rcu_read_unlock(); +} + +/* + * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler + * + * his function is hooked to the sched:sched_migrate_task trace event, and monitors + * timerlat user-space thread migration. + */ +static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu) +{ + struct osnoise_variables *osn_var; + long cpu = task_cpu(p); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + if (osn_var->pid == p->pid && dest_cpu != cpu) { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user-thread migrated\n"); + osnoise_stop_exception("timerlat user-thread migrated", cpu); + } +} + +static bool monitor_enabled; + +static int register_migration_monitor(void) +{ + int ret = 0; + + /* + * Timerlat thread migration check is only required when running timerlat in user-space. + * Thus, enable callback only if timerlat is set with no workload. + */ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (WARN_ON_ONCE(monitor_enabled)) + return 0; + + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + if (!ret) + monitor_enabled = true; + } + + return ret; +} + +static void unregister_migration_monitor(void) +{ + if (!monitor_enabled) + return; + + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + monitor_enabled = false; +} +#else +static int register_migration_monitor(void) +{ + return 0; +} +static void unregister_migration_monitor(void) {} +#endif /* * trace_sched_switch - sched:sched_switch trace event handler * @@ -1167,20 +1251,23 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) * used to record the beginning and to report the end of a thread noise window. */ static void -trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, - struct task_struct *n) +trace_sched_switch_callback(void *data, bool preempt, + struct task_struct *p, + struct task_struct *n, + unsigned int prev_state) { struct osnoise_variables *osn_var = this_cpu_osn_var(); + int workload = test_bit(OSN_WORKLOAD, &osnoise_options); - if (p->pid != osn_var->pid) + if ((p->pid != osn_var->pid) || !workload) thread_exit(osn_var, p); - if (n->pid != osn_var->pid) + if ((n->pid != osn_var->pid) || !workload) thread_entry(osn_var, n); } /* - * hook_thread_events - Hook the insturmentation for thread noise + * hook_thread_events - Hook the instrumentation for thread noise * * Hook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1193,11 +1280,19 @@ static int hook_thread_events(void) if (ret) return -EINVAL; + ret = register_migration_monitor(); + if (ret) + goto out_unreg; + return 0; + +out_unreg: + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + return -EINVAL; } /* - * unhook_thread_events - *nhook the insturmentation for thread noise + * unhook_thread_events - unhook the instrumentation for thread noise * * Unook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1205,6 +1300,7 @@ static int hook_thread_events(void) static void unhook_thread_events(void) { unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + unregister_migration_monitor(); } /* @@ -1253,12 +1349,31 @@ static __always_inline void osnoise_stop_tracing(void) trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, "stop tracing hit on cpu %d\n", smp_processor_id()); + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit stop condition on CPU %d\n", smp_processor_id()); + tracer_tracing_off(tr); } rcu_read_unlock(); } /* + * osnoise_has_tracing_on - Check if there is at least one instance on + */ +static __always_inline int osnoise_has_tracing_on(void) +{ + struct osnoise_instance *inst; + int trace_is_on = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) + trace_is_on += tracer_tracing_is_on(inst->tr); + rcu_read_unlock(); + + return trace_is_on; +} + +/* * notify_new_max_latency - Notify a new max latency via fsnotify interface. */ static void notify_new_max_latency(u64 latency) @@ -1269,7 +1384,7 @@ static void notify_new_max_latency(u64 latency) rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { tr = inst->tr; - if (tr->max_latency < latency) { + if (tracer_tracing_is_on(tr) && tr->max_latency < latency) { tr->max_latency = latency; latency_fsnotify(tr); } @@ -1287,12 +1402,14 @@ static void notify_new_max_latency(u64 latency) */ static int run_osnoise(void) { + bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options); struct osnoise_variables *osn_var = this_cpu_osn_var(); u64 start, sample, last_sample; u64 last_int_count, int_count; s64 noise = 0, max_noise = 0; s64 total, last_total = 0; struct osnoise_sample s; + bool disable_preemption; unsigned int threshold; u64 runtime, stop_in; u64 sum_noise = 0; @@ -1300,6 +1417,12 @@ static int run_osnoise(void) int ret = -1; /* + * Disabling preemption is only required if IRQs are enabled, + * and the options is set on. + */ + disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options); + + /* * Considers the current thread as the workload. */ osn_var->pid = current->pid; @@ -1310,9 +1433,18 @@ static int run_osnoise(void) save_osn_sample_stats(osn_var, &s); /* - * if threshold is 0, use the default value of 5 us. + * if threshold is 0, use the default value of 1 us. */ - threshold = tracing_thresh ? : 5000; + threshold = tracing_thresh ? : 1000; + + /* + * Apply PREEMPT and IRQ disabled options. + */ + if (disable_irq) + local_irq_disable(); + + if (disable_preemption) + preempt_disable(); /* * Make sure NMIs see sampling first @@ -1328,7 +1460,7 @@ static int run_osnoise(void) stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC; /* - * Start timestemp + * Start timestamp */ start = time_get(); @@ -1387,10 +1519,33 @@ static int run_osnoise(void) } /* + * In some cases, notably when running on a nohz_full CPU with + * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to + * account for QSs. This will eventually cause unwarranted + * noise as RCU forces preemption as the means of ending the + * current grace period. We avoid this by calling + * rcu_momentary_eqs(), which performs a zero duration EQS + * allowing RCU to end the current grace period. This call + * shouldn't be wrapped inside an RCU critical section. + * + * Normally QSs for other cases are handled through cond_resched(). + * For simplicity, however, we call rcu_momentary_eqs() for all + * configurations here. + */ + if (!disable_irq) + local_irq_disable(); + + rcu_momentary_eqs(); + + if (!disable_irq) + local_irq_enable(); + + /* * For the non-preemptive kernel config: let threads runs, if - * they so wish. + * they so wish, unless set not do to so. */ - cond_resched(); + if (!disable_irq && !disable_preemption) + cond_resched(); last_sample = sample; last_int_count = int_count; @@ -1410,6 +1565,15 @@ static int run_osnoise(void) barrier(); /* + * Return to the preemptive state. + */ + if (disable_preemption) + preempt_enable(); + + if (disable_irq) + local_irq_enable(); + + /* * Save noise info. */ s.noise = time_to_us(sum_noise); @@ -1420,7 +1584,7 @@ static int run_osnoise(void) /* Save interference stats info */ diff_osn_sample_stats(osn_var, &s); - trace_osnoise_sample(&s); + record_osnoise_sample(&s); notify_new_max_latency(max_noise); @@ -1435,6 +1599,75 @@ out: static struct cpumask osnoise_cpumask; static struct cpumask save_cpumask; +static struct cpumask kthread_cpumask; + +/* + * osnoise_sleep - sleep until the next period + */ +static void osnoise_sleep(bool skip_period) +{ + u64 interval; + ktime_t wake_time; + + mutex_lock(&interface_lock); + if (skip_period) + interval = osnoise_data.sample_period; + else + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + /* + * differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (!interval) { + /* Let synchronize_rcu_tasks() make progress */ + cond_resched_tasks_rcu_qs(); + return; + } + + wake_time = ktime_add_us(ktime_get(), interval); + __set_current_state(TASK_INTERRUPTIBLE); + + while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) { + if (kthread_should_stop()) + break; + } +} + +/* + * osnoise_migration_pending - checks if the task needs to migrate + * + * osnoise/timerlat threads are per-cpu. If there is a pending request to + * migrate the thread away from the current CPU, something bad has happened. + * Play the good citizen and leave. + * + * Returns 0 if it is safe to continue, 1 otherwise. + */ +static inline int osnoise_migration_pending(void) +{ + if (!current->migration_pending) + return 0; + + /* + * If migration is pending, there is a task waiting for the + * tracer to enable migration. The tracer does not allow migration, + * thus: taint and leave to unblock the blocked thread. + */ + osnoise_taint("migration requested to osnoise threads, leaving."); + + /* + * Unset this thread from the threads managed by the interface. + * The tracers are responsible for cleaning their env before + * exiting. + */ + mutex_lock(&interface_lock); + this_cpu_osn_var()->kthread = NULL; + cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask); + mutex_unlock(&interface_lock); + + return 1; +} /* * osnoise_main - The osnoise detection kernel thread @@ -1444,32 +1677,35 @@ static struct cpumask save_cpumask; */ static int osnoise_main(void *data) { - u64 interval; - - while (!kthread_should_stop()) { + unsigned long flags; - run_osnoise(); - - mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; - mutex_unlock(&interface_lock); + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - do_div(interval, USEC_PER_MSEC); + while (!kthread_should_stop()) { + if (osnoise_migration_pending()) + break; - /* - * differently from hwlat_detector, the osnoise tracer can run - * without a pause because preemption is on. - */ - if (interval < 1) { - /* Let synchronize_rcu_tasks() make progress */ - cond_resched_tasks_rcu_qs(); + /* skip a period if tracing is off on all instances */ + if (!osnoise_has_tracing_on()) { + osnoise_sleep(true); continue; } - if (msleep_interruptible(interval)) - break; + run_osnoise(); + osnoise_sleep(false); } + migrate_enable(); return 0; } @@ -1543,13 +1779,31 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) s.timer_latency = diff; s.context = IRQ_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); + + if (osnoise_data.stop_tracing) { + if (time_to_us(diff) >= osnoise_data.stop_tracing) { - notify_new_max_latency(diff); + /* + * At this point, if stop_tracing is set and <= print_stack, + * print_stack is set and would be printed in the thread handler. + * + * Thus, print the stack trace as it is helpful to define the + * root cause of an IRQ latency. + */ + if (osnoise_data.stop_tracing <= osnoise_data.print_stack) { + timerlat_save_stack(0); + timerlat_dump_stack(time_to_us(diff)); + } - if (osnoise_data.stop_tracing) - if (time_to_us(diff) >= osnoise_data.stop_tracing) osnoise_stop_tracing(); + notify_new_max_latency(diff); + + wake_up_process(tlat->kthread); + + return HRTIMER_NORESTART; + } + } wake_up_process(tlat->kthread); @@ -1599,6 +1853,7 @@ static int timerlat_main(void *data) struct timerlat_variables *tlat = this_cpu_tmr_var(); struct timerlat_sample s; struct sched_param sp; + unsigned long flags; u64 now, diff; /* @@ -1607,15 +1862,26 @@ static int timerlat_main(void *data) sp.sched_priority = DEFAULT_TIMERLAT_PRIO; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + tlat->count = 0; tlat->tracing_thread = false; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); tlat->kthread = current; osn_var->pid = current->pid; /* - * Anotate the arrival time. + * Annotate the arrival time. */ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); @@ -1624,6 +1890,7 @@ static int timerlat_main(void *data) osn_var->sampling = 1; while (!kthread_should_stop()) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); diff = now - tlat->abs_period; @@ -1631,7 +1898,9 @@ static int timerlat_main(void *data) s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); + + notify_new_max_latency(diff); timerlat_dump_stack(time_to_us(diff)); @@ -1640,10 +1909,14 @@ static int timerlat_main(void *data) if (time_to_us(diff) >= osnoise_data.stop_tracing_total) osnoise_stop_tracing(); + if (osnoise_migration_pending()) + break; + wait_next_period(tlat); } hrtimer_cancel(&tlat->timer); + migrate_enable(); return 0; } #else /* CONFIG_TIMERLAT_TRACER */ @@ -1660,10 +1933,30 @@ static void stop_kthread(unsigned int cpu) { struct task_struct *kthread; - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; - if (kthread) - kthread_stop(kthread); - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); + if (kthread) { + if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) && + !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) { + kthread_stop(kthread); + } else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) { + /* + * This is a user thread waiting on the timerlat_fd. We need + * to close all users, and the best way to guarantee this is + * by killing the thread. NOTE: this is a purpose specific file. + */ + kill_pid(kthread->thread_pid, SIGKILL, 1); + put_task_struct(kthread); + } + } else { + /* if no workload, just return */ + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + /* + * This is set in the osnoise tracer case. + */ + per_cpu(per_cpu_osnoise_var, cpu).sampling = false; + barrier(); + } + } } /* @@ -1685,7 +1978,7 @@ static void stop_per_cpu_kthreads(void) } /* - * start_kthread - Start a workload tread + * start_kthread - Start a workload thread */ static int start_kthread(unsigned int cpu) { @@ -1693,10 +1986,20 @@ static int start_kthread(unsigned int cpu) void *main = osnoise_main; char comm[24]; + /* Do not start a new thread if it is already running */ + if (per_cpu(per_cpu_osnoise_var, cpu).kthread) + return 0; + if (timerlat_enabled()) { snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; } else { + /* if no workload, just return */ + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = true; + barrier(); + return 0; + } snprintf(comm, 24, "osnoise/%d", cpu); } @@ -1704,11 +2007,11 @@ static int start_kthread(unsigned int cpu) if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); - stop_per_cpu_kthreads(); return -ENOMEM; } per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; + cpumask_set_cpu(cpu, &kthread_cpumask); return 0; } @@ -1725,20 +2028,33 @@ static int start_per_cpu_kthreads(void) int retval = 0; int cpu; + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (timerlat_enabled()) + return 0; + } + cpus_read_lock(); /* * Run only on online CPUs in which osnoise is allowed to run. */ cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask); - for_each_possible_cpu(cpu) - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + for_each_possible_cpu(cpu) { + if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) { + struct task_struct *kthread; + + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); + if (!WARN_ON(!kthread)) + kthread_stop(kthread); + } + } for_each_cpu(cpu, current_mask) { retval = start_kthread(cpu); if (retval) { + cpus_read_unlock(); stop_per_cpu_kthreads(); - break; + return retval; } } @@ -1752,24 +2068,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) { unsigned int cpu = smp_processor_id(); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!osnoise_has_registered_instances()) - goto out_unlock_trace; + return; - mutex_lock(&interface_lock); - cpus_read_lock(); + guard(mutex)(&interface_lock); + guard(cpus_read_lock)(); + + if (!cpu_online(cpu)) + return; if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) - goto out_unlock; + return; start_kthread(cpu); - -out_unlock: - cpus_read_unlock(); - mutex_unlock(&interface_lock); -out_unlock_trace: - mutex_unlock(&trace_types_lock); } static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn); @@ -1811,6 +2124,150 @@ static void osnoise_init_hotplug_support(void) #endif /* CONFIG_HOTPLUG_CPU */ /* + * seq file functions for the osnoise/options file. + */ +static void *s_options_start(struct seq_file *s, loff_t *pos) +{ + int option = *pos; + + mutex_lock(&interface_lock); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static void *s_options_next(struct seq_file *s, void *v, loff_t *pos) +{ + int option = ++(*pos); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static int s_options_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + int option = *pos; + + if (option == OSN_DEFAULTS) { + if (osnoise_options == OSN_DEFAULT_OPTIONS) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + goto out; + } + + if (test_bit(option, &osnoise_options)) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + +out: + if (option != OSN_MAX) + seq_puts(s, " "); + + return 0; +} + +static void s_options_stop(struct seq_file *s, void *v) +{ + seq_puts(s, "\n"); + mutex_unlock(&interface_lock); +} + +static const struct seq_operations osnoise_options_seq_ops = { + .start = s_options_start, + .next = s_options_next, + .show = s_options_show, + .stop = s_options_stop +}; + +static int osnoise_options_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &osnoise_options_seq_ops); +}; + +/** + * osnoise_options_write - Write function for "options" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * Writing the option name sets the option, writing the "NO_" + * prefix in front of the option name disables it. + * + * Writing "DEFAULTS" resets the option values to the default ones. + */ +static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int running, option, enable, retval; + char buf[256], *option_str; + + if (cnt >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3)) { + option_str = strstrip(buf); + enable = true; + } else { + option_str = strstrip(&buf[3]); + enable = false; + } + + option = match_string(osnoise_options_str, OSN_MAX, option_str); + if (option < 0) + return -EINVAL; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop. + */ + mutex_lock(&trace_types_lock); + running = osnoise_has_registered_instances(); + if (running) + stop_per_cpu_kthreads(); + + mutex_lock(&interface_lock); + /* + * avoid CPU hotplug operations that might read options. + */ + cpus_read_lock(); + + retval = cnt; + + if (enable) { + if (option == OSN_DEFAULTS) + osnoise_options = OSN_DEFAULT_OPTIONS; + else + set_bit(option, &osnoise_options); + } else { + if (option == OSN_DEFAULTS) + retval = -EINVAL; + else + clear_bit(option, &osnoise_options); + } + + cpus_read_unlock(); + mutex_unlock(&interface_lock); + + if (running) + start_per_cpu_kthreads(); + mutex_unlock(&trace_types_lock); + + return retval; +} + +/* * osnoise_cpus_read - Read function for reading the "cpus" file * @filp: The active open file structure * @ubuf: The userspace provided buffer to read value into @@ -1823,31 +2280,22 @@ static ssize_t osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; - mutex_lock(&interface_lock); + guard(mutex)(&interface_lock); len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; mask_str = kmalloc(len, GFP_KERNEL); - if (!mask_str) { - count = -ENOMEM; - goto out_unlock; - } + if (!mask_str) + return -ENOMEM; len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_free; - } + if (len >= count) + return -EINVAL; count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); -out_free: - kfree(mask_str); -out_unlock: - mutex_unlock(&interface_lock); - return count; } @@ -1855,7 +2303,7 @@ out_unlock: * osnoise_cpus_write - Write function for "cpus" entry * @filp: The active open file structure * @ubuf: The user buffer that contains the value to write - * @cnt: The maximum number of bytes to write to "file" + * @count: The maximum number of bytes to write to "file" * @ppos: The current position in @file * * This function provides a write implementation for the "cpus" @@ -1873,13 +2321,14 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, { cpumask_var_t osnoise_cpumask_new; int running, err; - char buf[256]; + char *buf __free(kfree) = NULL; - if (count >= 256) - return -EINVAL; + if (count < 1) + return 0; - if (copy_from_user(buf, ubuf, count)) - return -EFAULT; + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) return -ENOMEM; @@ -1920,6 +2369,223 @@ err_free: return err; } +#ifdef CONFIG_TIMERLAT_TRACER +static int timerlat_fd_open(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + long cpu = (long) inode->i_cdev; + + mutex_lock(&interface_lock); + + /* + * This file is accessible only if timerlat is enabled, and + * NO_OSNOISE_WORKLOAD is set. + */ + if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) { + mutex_unlock(&interface_lock); + return -EINVAL; + } + + migrate_disable(); + + osn_var = this_cpu_osn_var(); + + /* + * The osn_var->pid holds the single access to this file. + */ + if (osn_var->pid) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EBUSY; + } + + /* + * timerlat tracer is a per-cpu tracer. Check if the user-space too + * is pinned to a single CPU. The tracer laters monitor if the task + * migrates and then disables tracer if it does. However, it is + * worth doing this basic acceptance test to avoid obviusly wrong + * setup. + */ + if (current->nr_cpus_allowed > 1 || cpu != smp_processor_id()) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EPERM; + } + + /* + * From now on, it is good to go. + */ + file->private_data = inode->i_cdev; + + get_task_struct(current); + + osn_var->kthread = current; + osn_var->pid = current->pid; + + /* + * Setup is done. + */ + mutex_unlock(&interface_lock); + + tlat = this_cpu_tmr_var(); + tlat->count = 0; + + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + + migrate_enable(); + return 0; +}; + +/* + * timerlat_fd_read - Read function for "timerlat_fd" file + * @file: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints 1 on timerlat, the number of interferences on osnoise, -1 on error. + */ +static ssize_t +timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, + loff_t *ppos) +{ + long cpu = (long) file->private_data; + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + struct timerlat_sample s; + s64 diff; + u64 now; + + migrate_disable(); + + tlat = this_cpu_tmr_var(); + + /* + * While in user-space, the thread is migratable. There is nothing + * we can do about it. + * So, if the thread is running on another CPU, stop the machinery. + */ + if (cpu == smp_processor_id()) { + if (tlat->uthread_migrate) { + migrate_enable(); + return -EINVAL; + } + } else { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user thread migrate\n"); + osnoise_stop_tracing(); + migrate_enable(); + return -EINVAL; + } + + osn_var = this_cpu_osn_var(); + + /* + * The timerlat in user-space runs in a different order: + * the read() starts from the execution of the previous occurrence, + * sleeping for the next occurrence. + * + * So, skip if we are entering on read() before the first wakeup + * from timerlat IRQ: + */ + if (likely(osn_var->sampling)) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_URET; + + record_timerlat_sample(&s); + + notify_new_max_latency(diff); + + tlat->tracing_thread = false; + if (osnoise_data.stop_tracing_total) + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + } else { + tlat->tracing_thread = false; + tlat->kthread = current; + + /* Annotate now to drift new period */ + tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); + + osn_var->sampling = 1; + } + + /* wait for the next period */ + wait_next_period(tlat); + + /* This is the wakeup from this cycle */ + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_CONTEXT; + + record_timerlat_sample(&s); + + if (osnoise_data.stop_tracing_total) { + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { + timerlat_dump_stack(time_to_us(diff)); + notify_new_max_latency(diff); + osnoise_stop_tracing(); + } + } + +out: + migrate_enable(); + return 0; +} + +static int timerlat_fd_release(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat_var; + long cpu = (long) file->private_data; + + migrate_disable(); + mutex_lock(&interface_lock); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + + if (tlat_var->kthread) + hrtimer_cancel(&tlat_var->timer); + memset(tlat_var, 0, sizeof(*tlat_var)); + + osn_var->sampling = 0; + osn_var->pid = 0; + + /* + * We are leaving, not being stopped... see stop_kthread(); + */ + if (osn_var->kthread) { + put_task_struct(osn_var->kthread); + osn_var->kthread = NULL; + } + + mutex_unlock(&interface_lock); + migrate_enable(); + return 0; +} +#endif + /* * osnoise/runtime_us: cannot be greater than the period. */ @@ -1975,14 +2641,21 @@ static struct trace_min_max_param osnoise_print_stack = { /* * osnoise/timerlat_period: min 100 us, max 1 s */ -u64 timerlat_min_period = 100; -u64 timerlat_max_period = 1000000; +static u64 timerlat_min_period = 100; +static u64 timerlat_max_period = 1000000; static struct trace_min_max_param timerlat_period = { .lock = &interface_lock, .val = &osnoise_data.timerlat_period, .max = &timerlat_max_period, .min = &timerlat_min_period, }; + +static const struct file_operations timerlat_fd_fops = { + .open = timerlat_fd_open, + .read = timerlat_fd_read, + .release = timerlat_fd_release, + .llseek = generic_file_llseek, +}; #endif static const struct file_operations cpus_fops = { @@ -1992,6 +2665,14 @@ static const struct file_operations cpus_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations osnoise_options_fops = { + .open = osnoise_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = osnoise_options_write +}; + #ifdef CONFIG_TIMERLAT_TRACER #ifdef CONFIG_STACKTRACE static int init_timerlat_stack_tracefs(struct dentry *top_dir) @@ -2012,18 +2693,63 @@ static int init_timerlat_stack_tracefs(struct dentry *top_dir) } #endif /* CONFIG_STACKTRACE */ +static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) +{ + struct dentry *timerlat_fd; + struct dentry *per_cpu; + struct dentry *cpu_dir; + char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */ + long cpu; + + /* + * Why not using tracing instance per_cpu/ dir? + * + * Because osnoise/timerlat have a single workload, having + * multiple files like these are waste of memory. + */ + per_cpu = tracefs_create_dir("per_cpu", top_dir); + if (!per_cpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + snprintf(cpu_str, 30, "cpu%ld", cpu); + cpu_dir = tracefs_create_dir(cpu_str, per_cpu); + if (!cpu_dir) + goto out_clean; + + timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ, + cpu_dir, NULL, &timerlat_fd_fops); + if (!timerlat_fd) + goto out_clean; + + /* Record the CPU */ + d_inode(timerlat_fd)->i_cdev = (void *)(cpu); + } + + return 0; + +out_clean: + tracefs_remove(per_cpu); + return -ENOMEM; +} + /* * init_timerlat_tracefs - A function to initialize the timerlat interface files */ static int init_timerlat_tracefs(struct dentry *top_dir) { struct dentry *tmp; + int retval; tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) return -ENOMEM; + retval = osnoise_create_cpu_timerlat_fd(top_dir); + if (retval) + return retval; + return init_timerlat_stack_tracefs(top_dir); } #else /* CONFIG_TIMERLAT_TRACER */ @@ -2078,6 +2804,11 @@ static int init_tracefs(void) if (!tmp) goto err; + tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL, + &osnoise_options_fops); + if (!tmp) + goto err; + ret = init_timerlat_tracefs(top_dir); if (ret) goto err; @@ -2189,6 +2920,17 @@ static void osnoise_workload_stop(void) if (osnoise_has_registered_instances()) return; + /* + * If callbacks were already disabled in a previous stop + * call, there is no need to disable then again. + * + * For instance, this happens when tracing is stopped via: + * echo 0 > tracing_on + * echo nop > current_tracer. + */ + if (!trace_osnoise_callback_enabled) + return; + trace_osnoise_callback_enabled = false; /* * Make sure that ftrace_nmi_enter/exit() see |
