From cc6795aeffea0a80d0baf9ad31ba926a6c42cef5 Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Thu, 10 Jan 2019 13:53:25 +0000 Subject: perf/core: Add PERF_PMU_CAP_NO_EXCLUDE for exclusion incapable PMUs Many PMU drivers do not have the capability to exclude counting events that occur in specific contexts such as idle, kernel, guest, etc. These drivers indicate this by returning an error in their event_init upon testing the event's attribute flags. This approach is error-prone and often inconsistent. Let's instead allow PMU drivers to advertise their inability to exclude based on context via a new capability: PERF_PMU_CAP_NO_EXCLUDE. This allows the perf core to reject requests for exclusion events where there is no support in the PMU. Signed-off-by: Andrew Murray Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Mark Rutland Cc: Matt Turner Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Richard Henderson Cc: Russell King Cc: Sascha Hauer Cc: Shawn Guo Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: robin.murphy@arm.com Cc: suzuki.poulose@arm.com Link: https://lkml.kernel.org/r/1547128414-50693-4-git-send-email-andrew.murray@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cd13a30f732..fbe59b793b36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9772,6 +9772,15 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); + if (!ret) { + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + if (event->destroy) + event->destroy(event); + ret = -EINVAL; + } + } + if (ret) module_put(pmu->module); -- cgit From 5620196951192f7cd2da0a04e7c0113f40bfc14e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 11 Jan 2019 13:20:20 -0300 Subject: perf: Make perf_event_output() propagate the output() return For the original mode of operation it isn't needed, since we report back errors via PERF_RECORD_LOST records in the ring buffer, but for use in bpf_perf_event_output() it is convenient to return the errors, basically -ENOSPC. Currently bpf_perf_event_output() returns an error indication, but the last thing it does, pushing the sample to the ring buffer, can fail, and if it does, that failure won't be reported back to its users; fix it.
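As an illustration of the new return path (this snippet is not part of either patch), a kernel-side caller can now tell when a sample did not fit, -ENOSPC being the error propagated from the output_begin() step; the wrapper function and the drop counter below are hypothetical:

#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/perf_event.h>

/* Hypothetical helper: emit a sample and count drops when the ring
 * buffer is full, using the value perf_event_output() now returns. */
static void emit_sample_counted(struct perf_event *event,
				struct perf_sample_data *data,
				struct pt_regs *regs,
				atomic64_t *dropped)
{
	if (perf_event_output(event, data, regs) == -ENOSPC)
		atomic64_inc(dropped);	/* no PERF_RECORD was written */
}

BPF programs get the same visibility: the bpf_perf_event_output() helper now returns whatever perf_event_output() returned, as the bpf_trace.c hunk below shows.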
Reported-by: Jamal Hadi Salim Tested-by: Jamal Hadi Salim Acked-by: Peter Zijlstra (Intel) Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/r/20190118150938.GN5823@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 11 +++++++---- kernel/trace/bpf_trace.c | 3 +-- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fbe59b793b36..bc525cd1615c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6489,7 +6489,7 @@ void perf_prepare_sample(struct perf_event_header *header, data->phys_addr = perf_virt_to_phys(data->addr); } -static __always_inline void +static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, @@ -6499,13 +6499,15 @@ __perf_event_output(struct perf_event *event, { struct perf_output_handle handle; struct perf_event_header header; + int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(&header, data, event, regs); - if (output_begin(&handle, event, header.size)) + err = output_begin(&handle, event, header.size); + if (err) goto exit; perf_output_sample(&handle, &header, data, event); @@ -6514,6 +6516,7 @@ __perf_event_output(struct perf_event *event, exit: rcu_read_unlock(); + return err; } void @@ -6532,12 +6535,12 @@ perf_event_output_backward(struct perf_event *event, __perf_event_output(event, data, regs, perf_output_begin_backward); } -void +int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - __perf_event_output(event, data, regs, perf_output_begin); + return __perf_event_output(event, data, regs, perf_output_begin); } /* diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8b068adb9da1..088c2032ceaf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -431,8 +431,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_event_output(event, sd, regs); - return 0; + return perf_event_output(event, sd, regs); } BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, -- cgit From 76193a94522f1d4edf2447a536f3f796ce56343b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:13 -0800 Subject: perf, bpf: Introduce PERF_RECORD_KSYMBOL For better performance analysis of dynamically JITed and loaded kernel functions, such as BPF programs, this patch introduces PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol register/unregister information to user space. The following data structure is used for PERF_RECORD_KSYMBOL. 
/* * struct { * struct perf_event_header header; * u64 addr; * u32 len; * u16 ksym_type; * u16 flags; * char name[]; * struct sample_id sample_id; * }; */ Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-2-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bc525cd1615c..e04ab5f325cf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -385,6 +385,7 @@ static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; +static atomic_t nr_ksymbol_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4235,7 +4236,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || - attr->task || + attr->task || attr->ksymbol || attr->context_switch) return true; return false; @@ -4305,6 +4306,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (has_branch_stack(event)) dec = true; + if (event->attr.ksymbol) + atomic_dec(&nr_ksymbol_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -7653,6 +7656,97 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +/* + * ksymbol register/unregister tracking + */ + +struct perf_ksymbol_event { + const char *name; + int name_len; + struct { + struct perf_event_header header; + u64 addr; + u32 len; + u16 ksym_type; + u16 flags; + } event_id; +}; + +static int perf_event_ksymbol_match(struct perf_event *event) +{ + return event->attr.ksymbol; +} + +static void perf_event_ksymbol_output(struct perf_event *event, void *data) +{ + struct perf_ksymbol_event *ksymbol_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_ksymbol_match(event)) + return; + + perf_event_header__init_id(&ksymbol_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + ksymbol_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, ksymbol_event->event_id); + __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, + const char *sym) +{ + struct perf_ksymbol_event ksymbol_event; + char name[KSYM_NAME_LEN]; + u16 flags = 0; + int name_len; + + if (!atomic_read(&nr_ksymbol_events)) + return; + + if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || + ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) + goto err; + + strlcpy(name, sym, KSYM_NAME_LEN); + name_len = strlen(name) + 1; + while (!IS_ALIGNED(name_len, sizeof(u64))) + name[name_len++] = '\0'; + BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); + + if (unregister) + flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; + + ksymbol_event = (struct perf_ksymbol_event){ + .name = name, + .name_len = name_len, + .event_id = { + .header = { + .type = PERF_RECORD_KSYMBOL, + .size = 
sizeof(ksymbol_event.event_id) + name_len, }, .addr = addr, .len = len, .ksym_type = ksym_type, .flags = flags, }, }; + + perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); + return; +err: + WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -9912,6 +10006,8 @@ static void account_event(struct perf_event *event) inc = true; if (is_cgroup_event(event)) inc = true; + if (event->attr.ksymbol) + atomic_inc(&nr_ksymbol_events); if (inc) { /* -- cgit From 6ee52e2a3fe4ea35520720736e6791df1fb67106 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:15 -0800 Subject: perf, bpf: Introduce PERF_RECORD_BPF_EVENT For better performance analysis of BPF programs, this patch introduces PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program load/unload information to user space. Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs. The following example shows kernel symbols for a BPF program with 7 sub programs: ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi When a BPF program is loaded, PERF_RECORD_KSYMBOL is generated for each of these sub programs. Therefore, PERF_RECORD_BPF_EVENT is not needed for simple profiling. For annotation, user space needs to listen to PERF_RECORD_BPF_EVENT and gather more information about these (sub) programs via sys_bpf.
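To make the consumer side concrete, here is a minimal user-space sketch (an illustration, not part of the series) that decodes one PERF_RECORD_KSYMBOL record taken from the perf mmap ring, following the layout quoted earlier; the PERF_RECORD_KSYMBOL and PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER constants come from the uapi side of the series, which this kernel/-only diff does not show:

#include <stdio.h>
#include <linux/perf_event.h>

/* Mirrors the documented PERF_RECORD_KSYMBOL layout: name[] is
 * NUL-terminated, padded to a u64 boundary, and followed by the
 * usual struct sample_id. */
struct ksymbol_record {
	struct perf_event_header header;
	__u64 addr;
	__u32 len;
	__u16 ksym_type;
	__u16 flags;
	char name[];
};

static void handle_ksymbol(const struct perf_event_header *hdr)
{
	const struct ksymbol_record *r = (const void *)hdr;
	int unreg = r->flags & PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;

	printf("%sregister ksym %s: addr=0x%llx len=%u type=%u\n",
	       unreg ? "un" : "", r->name,
	       (unsigned long long)r->addr, r->len, r->ksym_type);
}

A PERF_RECORD_BPF_EVENT record can be decoded the same way; its id field is what user space would then feed to sys_bpf (typically BPF_PROG_GET_FD_BY_ID followed by BPF_OBJ_GET_INFO_BY_FD) to gather the annotation data mentioned above.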
Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Acked-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Tested-by: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-4-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 2 + kernel/events/core.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f908b9356025..19c49313c709 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -495,7 +495,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, *symbol_end = addr + hdr->pages * PAGE_SIZE; } -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b155cd17c1bd..30ebd085790b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1211,6 +1211,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); @@ -1554,6 +1555,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } bpf_prog_kallsyms_add(prog); + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); return err; free_used_maps: diff --git a/kernel/events/core.c b/kernel/events/core.c index e04ab5f325cf..236bb8ddb7bc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -386,6 +386,7 @@ static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; +static atomic_t nr_bpf_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4308,6 +4309,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_dec(&nr_bpf_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -7747,6 +7750,116 @@ err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } +/* + * bpf program load/unload tracking + */ + +struct perf_bpf_event { + struct bpf_prog *prog; + struct { + struct perf_event_header header; + u16 type; + u16 flags; + u32 id; + u8 tag[BPF_TAG_SIZE]; + } event_id; +}; + +static int perf_event_bpf_match(struct perf_event *event) +{ + return event->attr.bpf_event; +} + +static void perf_event_bpf_output(struct perf_event *event, void *data) +{ + struct perf_bpf_event *bpf_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_bpf_match(event)) + return; + + perf_event_header__init_id(&bpf_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + bpf_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, bpf_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_bpf_emit_ksymbols(struct bpf_prog 
*prog, + enum perf_bpf_event_type type) +{ + bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; + char sym[KSYM_NAME_LEN]; + int i; + + if (prog->aux->func_cnt == 0) { + bpf_get_prog_name(prog, sym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, sym); + } else { + for (i = 0; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + bpf_get_prog_name(subprog, sym); + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, sym); + } + } +} + +void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) +{ + struct perf_bpf_event bpf_event; + + if (type <= PERF_BPF_EVENT_UNKNOWN || + type >= PERF_BPF_EVENT_MAX) + return; + + switch (type) { + case PERF_BPF_EVENT_PROG_LOAD: + case PERF_BPF_EVENT_PROG_UNLOAD: + if (atomic_read(&nr_ksymbol_events)) + perf_event_bpf_emit_ksymbols(prog, type); + break; + default: + break; + } + + if (!atomic_read(&nr_bpf_events)) + return; + + bpf_event = (struct perf_bpf_event){ + .prog = prog, + .event_id = { + .header = { + .type = PERF_RECORD_BPF_EVENT, + .size = sizeof(bpf_event.event_id), + }, + .type = type, + .flags = flags, + .id = prog->aux->id, + }, + }; + + BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); + + memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); + perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10008,6 +10121,8 @@ static void account_event(struct perf_event *event) inc = true; if (event->attr.ksymbol) atomic_inc(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_inc(&nr_bpf_events); if (inc) { /* -- cgit From 6934058d9fb6c058fb5e5b11cdcb19834e205c91 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:21 -0800 Subject: bpf: Add module name [bpf] to ksymbols for bpf programs With this patch, /proc/kallsyms will show BPF programs as <addr> t bpf_prog_<tag>_<name> [bpf] Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-10-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/kallsyms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f3a04994e063..14934afa9e68 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -494,7 +494,7 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { - iter->module_name[0] = '\0'; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, -- cgit From 8e86e01526764e8cdc77b80a8f24f33e6847b9e7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jan 2019 12:10:59 +0100 Subject: perf/core: Convert to SPDX license identifiers Use proper SPDX license identifiers instead of the bogus reference to kernel-base/COPYING.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Greg Kroah-Hartman Cc: Jiri Olsa Cc: Kate Stewart Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/20190116111308.012666937@linutronix.de Signed-off-by: Ingo Molnar --- kernel/events/callchain.c | 3 +-- kernel/events/core.c | 3 +-- kernel/events/ring_buffer.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 24a77c34e9ad..c2b41a263166 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING */ #include diff --git a/kernel/events/core.c b/kernel/events/core.c index 280a72b3a553..5b89de7918d0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING */ #include diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 309ef5a64af5..ed6409300ef5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events ring-buffer code: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING */ #include -- cgit From 469eb32eaf361971dfc8ad165af14ae3f2217487 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jan 2019 12:11:00 +0100 Subject: perf/hw_breakpoints: Convert to SPDX license identifier Replace the license boiler plate with a SPDX license identifier. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul McKenney Cc: Alan Stern Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Greg Kroah-Hartman Cc: Jiri Olsa Cc: Kate Stewart Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/20190116111308.105855650@linutronix.de Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 5befb338a18d..c5cd852fe86b 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -1,18 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) 2007 Alan Stern * Copyright (C) IBM Corporation, 2009 * Copyright (C) 2009, Frederic Weisbecker -- cgit From 720e596a16cc170798a60dc7afa27146ec5fb14e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Jan 2019 12:11:01 +0100 Subject: perf/uprobes: Convert to SPDX license identifier Replace the license boiler plate with a SPDX license identifier. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul McKenney Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Greg Kroah-Hartman Cc: Jiri Olsa Cc: Kate Stewart Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/20190116111308.211981422@linutronix.de Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8aef47ee7bfa..affa830a198c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju -- cgit From 8c94abbbe1ba24961278055434504b7dc3595415 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 28 Jan 2019 14:27:26 +0200 Subject: perf: Convert perf_event_context.refcount to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable perf_event_context.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. ** Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. Please check Documentation/core-api/refcount-vs-atomic.rst for more information. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. 
Please double check that you don't have some undocumented memory guarantees for this variable usage. For the perf_event_context.refcount it might make a difference in following places: - get_ctx(), perf_event_ctx_lock_nested(), perf_lock_task_context() and __perf_event_ctx_lock_double(): increment in refcount_inc_not_zero() only guarantees control dependency on success vs. fully ordered atomic counterpart - put_ctx(): decrement in refcount_dec_and_test() provides RELEASE ordering and ACQUIRE ordering + control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook Signed-off-by: Elena Reshetova Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: namhyung@kernel.org Link: https://lkml.kernel.org/r/1548678448-24458-2-git-send-email-elena.reshetova@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b89de7918d0..677164d54547 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1172,7 +1172,7 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx) static void get_ctx(struct perf_event_context *ctx) { - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); + refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) @@ -1186,7 +1186,7 @@ static void free_ctx(struct rcu_head *head) static void put_ctx(struct perf_event_context *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) { + if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) @@ -1268,7 +1268,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } @@ -1401,7 +1401,7 @@ retry: } if (ctx->task == TASK_TOMBSTONE || - !atomic_inc_not_zero(&ctx->refcount)) { + !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { @@ -4057,7 +4057,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->event_list); INIT_LIST_HEAD(&ctx->pinned_active); INIT_LIST_HEAD(&ctx->flexible_active); - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); } static struct perf_event_context * @@ -10613,7 +10613,7 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader, again: rcu_read_lock(); gctx = READ_ONCE(group_leader->ctx); - if (!atomic_inc_not_zero(&gctx->refcount)) { + if (!refcount_inc_not_zero(&gctx->refcount)) { rcu_read_unlock(); goto again; } -- cgit From fecb8ed2ce7010db373f8517ee815380d8e3c0c4 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 28 Jan 2019 14:27:27 +0200 Subject: perf/ring_buffer: Convert ring_buffer.refcount to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) 
Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable ring_buffer.refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. ** Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. Please check Documentation/core-api/refcount-vs-atomic.rst for more information. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the ring_buffer.refcount it might make a difference in following places: - ring_buffer_get(): increment in refcount_inc_not_zero() only guarantees control dependency on success vs. fully ordered atomic counterpart - ring_buffer_put(): decrement in refcount_dec_and_test() only provides RELEASE ordering and ACQUIRE ordering + control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook Signed-off-by: Elena Reshetova Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: namhyung@kernel.org Link: https://lkml.kernel.org/r/1548678448-24458-3-git-send-email-elena.reshetova@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- kernel/events/internal.h | 3 ++- kernel/events/ring_buffer.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 677164d54547..284232edf9be 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5393,7 +5393,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { - if (!atomic_inc_not_zero(&rb->refcount)) + if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); @@ -5403,7 +5403,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) void ring_buffer_put(struct ring_buffer *rb) { - if (!atomic_dec_and_test(&rb->refcount)) + if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 6dc725a7e7bc..4718de2a04e6 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -4,13 +4,14 @@ #include #include +#include /* Buffer handling */ #define RING_BUFFER_WRITABLE 0x01 struct ring_buffer { - atomic_t refcount; + refcount_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ed6409300ef5..0a71d16ca41b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -284,7 +284,7 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) else rb->overwrite = 1; - atomic_set(&rb->refcount, 1); + refcount_set(&rb->refcount, 1); INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); -- cgit From ca3bb3d027f69ac3ab1dafb32bde2f5a3a44439c Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 28 Jan 2019 14:27:28 
+0200 Subject: perf/ring_buffer: Convert ring_buffer.aux_refcount to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable ring_buffer.aux_refcount is used as pure reference counter. Convert it to refcount_t and fix up the operations. ** Important note for maintainers: Some functions from refcount_t API defined in lib/refcount.c have different memory ordering guarantees than their atomic counterparts. Please check Documentation/core-api/refcount-vs-atomic.rst for more information. Normally the differences should not matter since refcount_t provides enough guarantees to satisfy the refcounting use cases, but in some rare cases it might matter. Please double check that you don't have some undocumented memory guarantees for this variable usage. For the ring_buffer.aux_refcount it might make a difference in following places: - perf_aux_output_begin(): increment in refcount_inc_not_zero() only guarantees control dependency on success vs. fully ordered atomic counterpart - rb_free_aux(): decrement in refcount_dec_and_test() only provides RELEASE ordering and ACQUIRE ordering + control dependency on success vs. fully ordered atomic counterpart Suggested-by: Kees Cook Signed-off-by: Elena Reshetova Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: namhyung@kernel.org Link: https://lkml.kernel.org/r/1548678448-24458-4-git-send-email-elena.reshetova@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- kernel/events/internal.h | 2 +- kernel/events/ring_buffer.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 284232edf9be..5aeb4c74fb99 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5468,7 +5468,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) /* this has to be the last one */ rb_free_aux(rb); - WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&event->mmap_mutex); } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 4718de2a04e6..79c47076700a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -49,7 +49,7 @@ struct ring_buffer { atomic_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); - atomic_t aux_refcount; + refcount_t aux_refcount; void **aux_pages; void *aux_priv; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 0a71d16ca41b..805f0423ee0b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -357,7 +357,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!atomic_read(&rb->aux_mmap_count)) goto err; - if (!atomic_inc_not_zero(&rb->aux_refcount)) + if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; /* @@ 
-670,7 +670,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, * we keep a refcount here to make sure either of the two can * reference them safely. */ - atomic_set(&rb->aux_refcount, 1); + refcount_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; rb->aux_watermark = watermark; @@ -689,7 +689,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { - if (atomic_dec_and_test(&rb->aux_refcount)) + if (refcount_dec_and_test(&rb->aux_refcount)) __rb_free_aux(rb); } -- cgit From 840018668ce2d96783356204ff282d6c9b0e5f66 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Thu, 31 Jan 2019 11:47:08 -0700 Subject: perf/aux: Make perf_event accessible to setup_aux() When pmu::setup_aux() is called the coresight PMU needs to know which sink to use for the session by looking up the information in the event's attr::config2 field. As such simply replace the cpu information by the complete perf_event structure and change all affected customers. Signed-off-by: Mathieu Poirier Reviewed-by: Suzuki Poulouse Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Heiko Carstens Cc: Jiri Olsa Cc: Mark Rutland Cc: Martin Schwidefsky Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/20190131184714.20388-2-mathieu.poirier@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 805f0423ee0b..70ae2422cbaf 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -657,7 +657,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, goto out; } - rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) goto out; -- cgit From c13324a505c7790fe91a9df35be2e0462abccdb0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:12:15 +0900 Subject: x86/kprobes: Prohibit probing on functions before kprobe_int3_handler() Prohibit probing on the functions called before kprobe_int3_handler() in do_int3(). More specifically, ftrace_int3_handler(), poke_int3_handler(), and ist_enter(). And since rcu_nmi_enter() is called by ist_enter(), it also should be marked as NOKPROBE_SYMBOL. Since those are handled before kprobe_int3_handler(), probing those functions can cause a breakpoint recursion and crash the kernel. 
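The annotation pattern that this and the following kprobes patches apply is worth spelling out once; the function names in this sketch are placeholders, not kernel symbols:

#include <linux/kprobes.h>

/* Small static helpers on the breakpoint path are marked nokprobe_inline,
 * which forces inlining so that no probe-able out-of-line symbol remains. */
static nokprobe_inline void my_tiny_helper(void)
{
	/* ... */
}

/* Any function reachable from the int3 path before kprobe_int3_handler()
 * must not be probed itself; NOKPROBE_SYMBOL() records it in the kprobe
 * blacklist. */
void my_int3_path_helper(void)
{
	my_tiny_helper();
}
NOKPROBE_SYMBOL(my_int3_path_helper);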
Signed-off-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154998793571.31052.11301258949601150994.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9180158756d2..74db52a0a466 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "tree.h" #include "rcu.h" @@ -872,6 +873,7 @@ void rcu_nmi_enter(void) { rcu_nmi_enter_common(false); } +NOKPROBE_SYMBOL(rcu_nmi_enter); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle -- cgit From 6143c6fb1e8f9bde9c434038f7548a19d36b55e7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:13:12 +0900 Subject: kprobes: Search non-suffixed symbol in blacklist Newer GCC versions can generate different instances of a function with suffixed symbols when the function is optimized or only a part of it is cloned (e.g. .constprop, .part, etc.). In this case, it is not enough to check the kprobe blacklist entries, because they only record the non-suffixed symbol's address. To fix this issue, search the blacklist for the non-suffixed symbol if the given address is within a symbol that has a suffix. Note that this can cause false positives if a kprobe-safe function is optimized into a suffixed instance and shares its name with a blacklisted symbol. But I would like to choose a fail-safe design for this issue. Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt (VMware) Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154998799234.31052.6136378903570418008.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f4ddfdd2d07e..c83e54727131 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1396,7 +1396,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -bool within_kprobe_blacklist(unsigned long addr) +static bool __within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; @@ -1410,7 +1410,26 @@ bool within_kprobe_blacklist(unsigned long addr) if (addr >= ent->start_addr && addr < ent->end_addr) return true; } + return false; +} +bool within_kprobe_blacklist(unsigned long addr) +{ + char symname[KSYM_NAME_LEN], *p; + + if (__within_kprobe_blacklist(addr)) + return true; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return false; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_kprobe_blacklist(addr); + } return false; } -- cgit From eeeb080bae906a57b6513d37efe3c38f2cb87a1c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:13:40 +0900 Subject: kprobes: Prohibit probing on hardirq tracers Since kprobes breakpoint handling involves the hardirq tracer, probing these functions causes a breakpoint recursion problem. Prohibit probing on those functions.
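Stepping back to the blacklist change above, a worked example of the suffixed-symbol lookup (symbol names and addresses invented):

/* GCC may emit an optimized partial instance of a blacklisted function:
 *
 *   ffffffff81012345 T do_foo              <- in the kprobe blacklist
 *   ffffffff81098760 t do_foo.constprop.0  <- suffixed clone, not listed
 *
 * A probe aimed inside do_foo.constprop.0 now resolves the symbol name,
 * truncates it at the first '.', looks up "do_foo" again via
 * kprobe_lookup_name() and re-runs __within_kprobe_blacklist() on that
 * address, so the clone is rejected along with the original. */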
Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154998802073.31052.17255044712514564153.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 9 +++++++-- kernel/trace/trace_preemptirq.c | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d3294721f119..d42a473b8240 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "trace.h" @@ -365,7 +366,7 @@ out: __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } -static inline void +static nokprobe_inline void start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -401,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) atomic_dec(&data->disabled); } -static inline void +static nokprobe_inline void stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -443,6 +444,7 @@ void start_critical_timings(void) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(start_critical_timings); +NOKPROBE_SYMBOL(start_critical_timings); void stop_critical_timings(void) { @@ -452,6 +454,7 @@ void stop_critical_timings(void) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(stop_critical_timings); +NOKPROBE_SYMBOL(stop_critical_timings); #ifdef CONFIG_FUNCTION_TRACER static bool function_enabled; @@ -611,6 +614,7 @@ void tracer_hardirqs_on(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) stop_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_on); void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { @@ -619,6 +623,7 @@ void tracer_hardirqs_off(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) start_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_off); static int irqsoff_tracer_init(struct trace_array *tr) { diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 71f553cceb3c..4d8e99fdbbbe 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace.h" #define CREATE_TRACE_POINTS @@ -30,6 +31,7 @@ void trace_hardirqs_on(void) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); +NOKPROBE_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { @@ -43,6 +45,7 @@ void trace_hardirqs_off(void) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); +NOKPROBE_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { @@ -56,6 +59,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); +NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { @@ -69,6 +73,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); +NOKPROBE_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */ #ifdef CONFIG_TRACE_PREEMPT_TOGGLE -- cgit From a39f15b9644fac3f950f522c39e667c3af25c588 Mon Sep 
17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:14:37 +0900 Subject: kprobes: Prohibit probing on RCU debug routine Since kprobe itself depends on RCU, probing on RCU debug routine can cause recursive breakpoint bugs. Prohibit probing on RCU debug routines. int3 ->do_int3() ->ist_enter() ->RCU_LOCKDEP_WARN() ->debug_lockdep_rcu_enabled() -> int3 Signed-off-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154998807741.31052.11229157537816341591.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/rcu/update.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1971869c4072..f4ca36d92138 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -52,6 +52,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS @@ -249,6 +250,7 @@ int notrace debug_lockdep_rcu_enabled(void) current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); +NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? -- cgit From 2f43c6022d84b2f562623a7023f49f1431e50747 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:15:05 +0900 Subject: kprobes: Prohibit probing on lockdep functions Some lockdep functions can be involved in breakpoint handling and probing on those functions can cause a breakpoint recursion. Prohibit probing on those functions by blacklist. Signed-off-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154998810578.31052.1680977921449292812.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 95932333a48b..bc35a54ae3d4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -2814,6 +2815,7 @@ void lockdep_hardirqs_on(unsigned long ip) __trace_hardirqs_on_caller(ip); current->lockdep_recursion = 0; } +NOKPROBE_SYMBOL(lockdep_hardirqs_on); /* * Hardirqs were disabled: @@ -2843,6 +2845,7 @@ void lockdep_hardirqs_off(unsigned long ip) } else debug_atomic_inc(redundant_hardirqs_off); } +NOKPROBE_SYMBOL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -3650,7 +3653,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) return 0; } -static int __lock_is_held(const struct lockdep_map *lock, int read) +static nokprobe_inline +int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3883,6 +3887,7 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) return ret; } EXPORT_SYMBOL_GPL(lock_is_held_type); +NOKPROBE_SYMBOL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { -- cgit From 18736eef12137c59f60cc9f56dc5bea05c92e0eb Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 15 Feb 2019 13:56:54 +0200 Subject: perf: Copy parent's address filter offsets on clone When a child event is allocated in the inherit_event() path, the VMA based filter offsets are not copied from the 
parent, even though the address space mapping of the new task remains the same, which leads to no trace for the new task until exec. Reported-by: Mansour Alharthi Signed-off-by: Alexander Shishkin Tested-by: Mathieu Poirier Acked-by: Peter Zijlstra Cc: Jiri Olsa Fixes: 375637bc5249 ("perf/core: Introduce address range filtering") Link: http://lkml.kernel.org/r/20190215115655.63469-2-alexander.shishkin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5aeb4c74fb99..2d89efc0a3e0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1255,6 +1255,7 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem + * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock @@ -10312,6 +10313,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_per_task; } + /* + * Clone the parent's vma offsets: they are valid until exec() + * even if the mm is not shared with the parent. + */ + if (event->parent) { + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + raw_spin_lock_irq(&ifh->lock); + memcpy(event->addr_filters_offs, + event->parent->addr_filters_offs, + pmu->nr_addr_filters * sizeof(unsigned long)); + raw_spin_unlock_irq(&ifh->lock); + } + /* force hw sync on the address filters */ event->addr_filters_gen = 1; } -- cgit From c60f83b813e5b25ccd5de7e8c8925c31b3aebcc1 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 15 Feb 2019 13:56:55 +0200 Subject: perf, pt, coresight: Fix address filters for vmas with non-zero offset Currently, the address range calculation for file-based filters works as long as the vma that maps the matching part of the object file starts from offset zero into the file (vm_pgoff==0). Otherwise, the resulting filter range would be off by vm_pgoff pages. Another related problem is that in case of a partially matching vma, that is, a vma that matches part of a filter region, the filter range size wouldn't be adjusted. Fix the arithmetics around address filter range calculations, taking into account vma offset, so that the entire calculation is done before the filter configuration is passed to the PMU drivers instead of having those drivers do the final bit of arithmetics. Based on the patch by Adrian Hunter . 
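A worked example of the corrected arithmetic (all numbers invented): say a filter covers file offsets [0x3000, 0x4000) of an object, and a vma maps that object at vm_start 0x400000 and vm_end 0x403000 starting from file offset 0x2000 (vm_pgoff << PAGE_SHIFT == 0x2000). The perf_addr_filter_vma_adjust() helper added below then computes:

/* off = vma->vm_pgoff << PAGE_SHIFT           = 0x2000
 * filter->offset = 0x3000, filter->size       = 0x1000
 *
 * filter->offset >= off, so the second branch applies:
 *   fr->start = vm_start + filter->offset - off
 *             = 0x400000 + 0x3000 - 0x2000    = 0x401000
 *   fr->size  = min(vm_end - fr->start, filter->size)
 *             = min(0x2000, 0x1000)           = 0x1000
 *
 * The old code handed plain vm_start (0x400000) to the driver, which
 * added filter->offset itself and ended up at 0x403000, off by exactly
 * the vm_pgoff bytes, and the size was never trimmed to the vma. */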
Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Tested-by: Mathieu Poirier Acked-by: Peter Zijlstra Cc: Jiri Olsa Fixes: 375637bc5249 ("perf/core: Introduce address range filtering") Link: http://lkml.kernel.org/r/20190215115655.63469-3-alexander.shishkin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 81 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d89efc0a3e0..16609f6737da 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2799,7 +2799,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in - * event::addr_filters_offs array and bump the event::addr_filters_gen; + * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. @@ -4446,7 +4446,7 @@ static void _free_event(struct perf_event *event) perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); @@ -6687,7 +6687,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { - event->addr_filters_offs[count] = 0; + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; restart++; } @@ -7367,28 +7368,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, return true; } +static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, + struct vm_area_struct *vma, + struct perf_addr_filter_range *fr) +{ + unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct file *file = vma->vm_file; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + return false; + + if (filter->offset < off) { + fr->start = vma->vm_start; + fr->size = min(vma_size, filter->size - (off - filter->offset)); + } else { + fr->start = vma->vm_start + filter->offset - off; + fr->size = min(vma->vm_end - fr->start, filter->size); + } + + return true; +} + static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; - struct file *file = vma->vm_file; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; + unsigned long flags; if (!has_addr_filter(event)) return; - if (!file) + if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - if (perf_addr_filter_match(filter, file, off, - vma->vm_end - vma->vm_start)) { - event->addr_filters_offs[count] = vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, + &event->addr_filter_ranges[count])) restart++; - } count++; } @@ -8978,26 +8998,19 @@ static void perf_addr_filters_splice(struct perf_event *event, * @filter; if so, adjust filter's address range. * Called with mm::mmap_sem down for reading. 
*/ -static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, - struct mm_struct *mm) +static void perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm, + struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; for (vma = mm->mmap; vma; vma = vma->vm_next) { - struct file *file = vma->vm_file; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - unsigned long vma_size = vma->vm_end - vma->vm_start; - - if (!file) + if (!vma->vm_file) continue; - if (!perf_addr_filter_match(filter, file, off, vma_size)) - continue; - - return vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, fr)) + return; } - - return 0; } /* @@ -9031,15 +9044,15 @@ static void perf_event_addr_filters_apply(struct perf_event *event) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - event->addr_filters_offs[count] = 0; + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; /* * Adjust base offset if the filter is associated to a binary * that needs to be mapped: */ if (filter->path.dentry) - event->addr_filters_offs[count] = - perf_addr_filter_apply(filter, mm); + perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); count++; } @@ -10305,10 +10318,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_pmu; if (has_addr_filter(event)) { - event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, - sizeof(unsigned long), - GFP_KERNEL); - if (!event->addr_filters_offs) { + event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, + sizeof(struct perf_addr_filter_range), + GFP_KERNEL); + if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } @@ -10321,9 +10334,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); raw_spin_lock_irq(&ifh->lock); - memcpy(event->addr_filters_offs, - event->parent->addr_filters_offs, - pmu->nr_addr_filters * sizeof(unsigned long)); + memcpy(event->addr_filter_ranges, + event->parent->addr_filter_ranges, + pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); raw_spin_unlock_irq(&ifh->lock); } @@ -10345,7 +10358,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return event; err_addr_filters: - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); -- cgit
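To close the loop on the last patch: a PMU driver's pmu::addr_filters_sync() callback now receives fully resolved start/size pairs and no longer needs to do any vma arithmetic of its own. A schematic consumer, with a made-up driver name and a hypothetical hardware-programming hook, might look like:

#include <linux/perf_event.h>

static void toy_pmu_program_range(int idx, u64 start, u64 size);	/* hypothetical hw hook */

static void toy_pmu_addr_filters_sync(struct perf_event *event)
{
	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
	struct perf_addr_filter *filter;
	int i = 0;

	list_for_each_entry(filter, &ifh->list, entry) {
		u64 start, size;

		if (filter->path.dentry) {
			/* object-file filter: resolved against the vmas */
			start = event->addr_filter_ranges[i].start;
			size  = event->addr_filter_ranges[i].size;
		} else {
			/* kernel filter: offsets are already absolute */
			start = filter->offset;
			size  = filter->size;
		}
		toy_pmu_program_range(i, start, size);	/* [start, start+size) */
		i++;
	}
}

In the perf tool this is the machinery behind address filters such as perf record -e intel_pt// --filter 'filter main @ /usr/bin/foo', which the vm_pgoff fix above makes work for binaries whose text is mapped at a non-zero file offset.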