From 568f196756ad9fe2b49c46bbf6a9de1b190438b4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 28 Jan 2019 17:21:52 -0800
Subject: bpf: check that BPF programs run with preemption disabled

Introduce cant_sleep() macro for annotation of functions that cannot
sleep.

Use it in BPF_PROG_RUN to catch execution of BPF programs in
preemptable context.

Suggested-by: Jann Horn
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Alexei Starovoitov
Acked-by: Song Liu
Signed-off-by: Daniel Borkmann
---
 include/linux/filter.h |  2 +-
 include/linux/kernel.h | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 95e2d7ebdf21..f32b3eca5a04 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -533,7 +533,7 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
+#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8f0e68e250a7..a8868a32098c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -245,8 +245,10 @@ extern int _cond_resched(void);
 #endif
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-  void ___might_sleep(const char *file, int line, int preempt_offset);
-  void __might_sleep(const char *file, int line, int preempt_offset);
+extern void ___might_sleep(const char *file, int line, int preempt_offset);
+extern void __might_sleep(const char *file, int line, int preempt_offset);
+extern void __cant_sleep(const char *file, int line, int preempt_offset);
+
 /**
  * might_sleep - annotation for functions that can sleep
  *
@@ -259,6 +261,13 @@ extern int _cond_resched(void);
  */
 # define might_sleep() \
 	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
+/**
+ * cant_sleep - annotation for functions that cannot sleep
+ *
+ * this macro will print a stack trace if it is executed with preemption enabled
+ */
+# define cant_sleep() \
+	do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
 # define sched_annotate_sleep()	(current->task_state_change = 0)
 #else
   static inline void ___might_sleep(const char *file, int line,
@@ -266,6 +275,7 @@ extern int _cond_resched(void);
   static inline void __might_sleep(const char *file, int line,
 				   int preempt_offset) { }
 # define might_sleep() do { might_resched(); } while (0)
+# define cant_sleep() do { } while (0)
 # define sched_annotate_sleep() do { } while (0)
 #endif
-- 
cgit
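The hunks above only declare __cant_sleep(); its definition lands in
kernel/sched/core.c, which this include/-limited view does not show. As a
rough sketch of what the CONFIG_DEBUG_ATOMIC_SLEEP check amounts to (hedged:
the real function also rate-limits its output and prints richer state):

	/* Sketch only -- not the in-tree definition. The essence is:
	 * complain unless preemption or IRQs are already disabled at
	 * the annotated call site.
	 */
	void __cant_sleep(const char *file, int line, int preempt_offset)
	{
		if (irqs_disabled())
			return;		/* hard atomic context: fine */

		if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
			return;		/* no preempt counting: cannot tell */

		if (preempt_count() > preempt_offset)
			return;		/* preemption disabled: fine */

		printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n",
		       file, line);
		dump_stack();
	}

With CONFIG_DEBUG_ATOMIC_SLEEP=n, cant_sleep() compiles to an empty
do-while, so BPF_PROG_RUN pays nothing for the annotation in production
builds.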
From 492ecee892c2a4ba6a14903d5d586ff750b7e805 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 25 Feb 2019 14:28:39 -0800
Subject: bpf: enable program stats

JITed BPF programs are indistinguishable from kernel functions, but
unlike kernel code, BPF code changes often. The typical "perf record" +
"perf report" approach to profiling and tuning kernel code works just
as well for BPF programs, but kernel code doesn't need to be monitored,
whereas BPF programs do: users load and run a large number of them.
These BPF stats allow tools to monitor BPF usage on the server.
Monitoring tools will turn the sysctl kernel.bpf_stats_enabled on and
off for a few seconds at a time to sample the average cost of the
programs. Data aggregated over hours and days provides insight into the
cost of BPF, and alarms can trigger in case a given program suddenly
gets more expensive.

The two sched_clock() calls per program invocation add ~20 nsec: fast
BPF progs (like selftests/bpf/progs/test_pkt_access.c) slow down from
~10 nsec to ~30 nsec. A static_key minimizes the cost of the stats
collection; there is no measurable difference before/after this patch
with kernel.bpf_stats_enabled=0.

Signed-off-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf.h    |  9 +++++++++
 include/linux/filter.h | 20 +++++++++++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index de18227b3d95..a2132e09dc1c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -16,6 +16,7 @@
 #include <linux/rbtree_latch.h>
 #include <linux/numa.h>
 #include <linux/wait.h>
+#include <linux/u64_stats_sync.h>
 
 struct bpf_verifier_env;
 struct perf_event;
@@ -340,6 +341,12 @@ enum bpf_cgroup_storage_type {
 
 #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
 
+struct bpf_prog_stats {
+	u64 cnt;
+	u64 nsecs;
+	struct u64_stats_sync syncp;
+};
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -389,6 +396,7 @@ struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	struct bpf_prog_stats __percpu *stats;
 	union {
 		struct work_struct work;
 		struct rcu_head rcu;
@@ -559,6 +567,7 @@ void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 
 extern int sysctl_unprivileged_bpf_disabled;
+extern int sysctl_bpf_stats_enabled;
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f32b3eca5a04..7e5e3db11106 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -533,7 +533,24 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
+DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+
+#define BPF_PROG_RUN(prog, ctx)	({				\
+	u32 ret;						\
+	cant_sleep();						\
+	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
+		struct bpf_prog_stats *stats;			\
+		u64 start = sched_clock();			\
+		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
+		stats = this_cpu_ptr(prog->aux->stats);		\
+		u64_stats_update_begin(&stats->syncp);		\
+		stats->cnt++;					\
+		stats->nsecs += sched_clock() - start;		\
+		u64_stats_update_end(&stats->syncp);		\
+	} else {						\
+		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
+	}							\
+	ret; })
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
@@ -764,6 +781,7 @@ void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
 void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
+struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags);
 void __bpf_prog_free(struct bpf_prog *fp);
-- 
cgit
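Note that BPF_PROG_RUN only ever updates the counters of the local CPU
(it runs with preemption disabled, which the previous patch now asserts),
so anything that reports stats must sum the per-CPU copies under the
u64_stats seqcount. A sketch of such a reader -- the function name is
illustrative; the in-tree consumer sits in kernel/bpf/syscall.c, outside
this include/-limited view:

	#include <linux/bpf.h>
	#include <linux/u64_stats_sync.h>

	/* Sum the per-CPU counters that BPF_PROG_RUN maintains. */
	static void bpf_prog_sum_stats(const struct bpf_prog *prog,
				       u64 *run_time_ns, u64 *run_cnt)
	{
		u64 nsecs = 0, cnt = 0;
		int cpu;

		for_each_possible_cpu(cpu) {
			const struct bpf_prog_stats *st =
				per_cpu_ptr(prog->aux->stats, cpu);
			unsigned int start;
			u64 tnsecs, tcnt;

			do {
				/* retry if a writer was mid-update */
				start = u64_stats_fetch_begin(&st->syncp);
				tnsecs = st->nsecs;
				tcnt = st->cnt;
			} while (u64_stats_fetch_retry(&st->syncp, start));
			nsecs += tnsecs;
			cnt += tcnt;
		}
		*run_time_ns = nsecs;
		*run_cnt = cnt;
	}

On 64-bit kernels the fetch_begin/fetch_retry pair compiles away
entirely; it only spins on 32-bit, where a 64-bit counter update is not
atomic.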
From 5f8f8b93aeb8371c54af08bece2bd04bc2d48707 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 25 Feb 2019 14:28:40 -0800
Subject: bpf: expose program stats via bpf_prog_info

Return bpf program run_time_ns and run_cnt via bpf_prog_info.

Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bcdd2474eee7..2e308e90ffea 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2813,6 +2813,8 @@ struct bpf_prog_info {
 	__u32 jited_line_info_rec_size;
 	__u32 nr_prog_tags;
 	__aligned_u64 prog_tags;
+	__u64 run_time_ns;
+	__u64 run_cnt;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
cgit

From f7c917ba11a67632a8452ea99fe132f626a7a2cc Mon Sep 17 00:00:00 2001
From: brakmo
Date: Fri, 1 Mar 2019 12:38:46 -0800
Subject: bpf: add bpf helper bpf_skb_ecn_set_ce

This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce,
"int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to
BPF_PROG_TYPE_CGROUP_SKB typed bpf_progs, which currently can be
attached to the ingress and egress paths. The helper is needed because
this type of bpf_prog cannot modify the skb directly.

The helper is used to set the ECN field of ECN-capable IP packets to ce
(congestion encountered) in the IPv6 or IPv4 header of the skb. It can
be used by a bpf_prog to manage egress or ingress network bandwidth
limits per cgroupv2 by inducing an ECN response in the TCP sender.
This works best when using DCTCP.

Signed-off-by: Lawrence Brakmo
Signed-off-by: Martin KaFai Lau
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e308e90ffea..3c38ac9a92a7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2359,6 +2359,13 @@ union bpf_attr {
  *	Return
  *		A **struct bpf_tcp_sock** pointer on success, or NULL in
  *		case of failure.
+ *
+ * int bpf_skb_ecn_set_ce(struct sk_buff *skb)
+ *	Description
+ *		Sets ECN of IP header to ce (congestion encountered) if
+ *		current value is ect (ECN capable). Works with IPv6 and IPv4.
+ *	Return
+ *		1 if set, 0 if not set.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2457,7 +2464,8 @@ union bpf_attr {
 	FN(spin_lock),			\
 	FN(spin_unlock),		\
 	FN(sk_fullsock),		\
-	FN(tcp_sock),
+	FN(tcp_sock),			\
+	FN(skb_ecn_set_ce),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit
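A minimal cgroup_skb program exercising the new helper might look as
follows. This is a sketch: unconditionally requesting CE is for
illustration only (a real bandwidth manager would gate the call on a rate
estimate), and the helper declaration mirrors the style of the selftests'
bpf_helpers.h rather than any header this patch adds:

	#include <linux/bpf.h>

	#define SEC(name) __attribute__((section(name), used))

	/* BPF_FUNC_skb_ecn_set_ce comes from the helper enum above */
	static int (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) =
		(void *) BPF_FUNC_skb_ecn_set_ce;

	SEC("cgroup_skb/egress")
	int mark_ce(struct __sk_buff *skb)
	{
		/* no-op unless the packet is ECT (ECN-capable) */
		bpf_skb_ecn_set_ce(skb);
		return 1;	/* 1 = allow the packet */
	}

	char _license[] SEC("license") = "GPL";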
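Finally, closing the loop on the two stats patches earlier in this
series: once kernel.bpf_stats_enabled has been switched on for a sampling
window, userspace can read the accumulated run_time_ns and run_cnt
through BPF_OBJ_GET_INFO_BY_FD. A sketch with error handling trimmed;
prog_fd is assumed to come from elsewhere (e.g. a pinned program):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static void print_prog_stats(int prog_fd)
	{
		struct bpf_prog_info info = {};
		union bpf_attr attr = {};

		attr.info.bpf_fd = prog_fd;
		attr.info.info_len = sizeof(info);
		attr.info.info = (__u64)(unsigned long)&info;

		if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr,
			    sizeof(attr)))
			return;

		printf("run_cnt=%llu avg_ns=%llu\n",
		       (unsigned long long)info.run_cnt,
		       info.run_cnt ?
		       (unsigned long long)(info.run_time_ns / info.run_cnt) : 0);
	}

Dividing run_time_ns by run_cnt yields the average cost per invocation
that the "enable program stats" commit message describes sampling.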