From da2577fdd0932ea4eefe73903f1130ee366767d2 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Sat, 8 Jun 2019 12:54:19 -0700 Subject: bpf: lpm_trie: check left child of last leftmost node for NULL If the leftmost parent node of the tree has does not have a child on the left side, then trie_get_next_key (and bpftool map dump) will not look at the child on the right. This leads to the traversal missing elements. Lookup is not affected. Update selftest to handle this case. Reproducer: bpftool map create /sys/fs/bpf/lpm type lpm_trie key 6 \ value 1 entries 256 name test_lpm flags 1 bpftool map update pinned /sys/fs/bpf/lpm key 8 0 0 0 0 0 value 1 bpftool map update pinned /sys/fs/bpf/lpm key 16 0 0 0 0 128 value 2 bpftool map dump pinned /sys/fs/bpf/lpm Returns only 1 element. (2 expected) Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE") Signed-off-by: Jonathan Lemon Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..864e2a496376 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -716,9 +716,14 @@ find_leftmost: * have exact two children, so this function will never return NULL. */ for (node = search_root; node;) { - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + if (node->flags & LPM_TREE_NODE_FLAG_IM) { + node = rcu_dereference(node->child[0]); + } else { next_node = node; - node = rcu_dereference(node->child[0]); + node = rcu_dereference(node->child[0]); + if (!node) + node = rcu_dereference(next_node->child[1]); + } } do_copy: next_key->prefixlen = next_node->prefixlen; -- cgit From d4dd153d551634683fccf8881f606fa9f3dfa1ef Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:13 +0900 Subject: bpf, devmap: Fix premature entry free on destroying map dev_map_free() waits for flush_needed bitmap to be empty in order to ensure all flush operations have completed before freeing its entries. However the corresponding clear_bit() was called before using the entries, so the entries could be used after free. All access to the entries needs to be done before clearing the bit. It seems commit a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") accidentally changed the clear_bit() and memory access order. Note that the problem happens only in __dev_map_flush(), not in dev_map_flush_old(). dev_map_flush_old() is called only after nulling out the corresponding netdev_map entry, so dev_map_free() never frees the entry thus no such race happens there. Fixes: a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..e001fb1a96b1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -291,10 +291,10 @@ void __dev_map_flush(struct bpf_map *map) if (unlikely(!dev)) continue; - __clear_bit(bit, bitmap); - bq = this_cpu_ptr(dev->bulkq); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); + + __clear_bit(bit, bitmap); } } -- cgit From edabf4d9dd905acd60048ea1579943801e3a4876 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:14 +0900 Subject: bpf, devmap: Add missing bulk queue free dev_map_free() forgot to free bulk queue when freeing its entries. Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e001fb1a96b1..a126d95d12de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -186,6 +186,7 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } -- cgit From 86723c8640633bee4b4588d3c7784ee7a0032f65 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:15 +0900 Subject: bpf, devmap: Add missing RCU read lock on flush .ndo_xdp_xmit() assumes it is called under RCU. For example virtio_net uses RCU to detect it has setup the resources for tx. The assumption accidentally broke when introducing bulk queue in devmap. Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Reported-by: David Ahern Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a126d95d12de..1defea4b2755 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -282,6 +282,7 @@ void __dev_map_flush(struct bpf_map *map) unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); u32 bit; + rcu_read_lock(); for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); struct xdp_bulk_queue *bq; @@ -297,6 +298,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); } + rcu_read_unlock(); } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or @@ -389,6 +391,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) int cpu; + rcu_read_lock(); for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); @@ -396,6 +399,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) bq = per_cpu_ptr(dev->bulkq, cpu); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); } + rcu_read_unlock(); } } -- cgit From 9594dc3c7e71b9f52bee1d7852eb3d4e3aea9e99 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Tue, 11 Jun 2019 14:53:04 -0700 Subject: bpf: fix nested bpf tracepoints with per-cpu data BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as they do not increment bpf_prog_active while executing. This enables three levels of nesting, to support - a kprobe or raw tp or perf event, - another one of the above that irq context happens to call, and - another one in nmi context (at most one of which may be a kprobe or perf event). Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") Signed-off-by: Matt Mullins Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 100 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f92d6ad5e080..1c9a4745e596 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); - static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_sample_data *sd) @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return perf_event_output(event, sd, regs); } +/* + * Support executing tracepoints in normal, irq, and nmi context that each call + * bpf_perf_event_output + */ +struct bpf_trace_sample_data { + struct perf_sample_data sds[3]; +}; + +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); +static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); + int nest_level = this_cpu_inc_return(bpf_trace_nest_level); struct perf_raw_record raw = { .frag = { .size = size, .data = data, }, }; + struct perf_sample_data *sd; + int err; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { + err = -EBUSY; + goto out; + } + + sd = &sds->sds[nest_level - 1]; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { + err = -EINVAL; + goto out; + } perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, sd); + +out: + this_cpu_dec(bpf_trace_nest_level); + return err; } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. + * + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage + * in normal, irq, and nmi context. */ -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +struct bpf_raw_tp_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); +static struct pt_regs *get_bpf_raw_tp_regs(void) +{ + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + this_cpu_dec(bpf_raw_tp_nest_level); + return ERR_PTR(-EBUSY); + } + + return &tp_regs->regs[nest_level - 1]; +} + +static void put_bpf_raw_tp_regs(void) +{ + this_cpu_dec(bpf_raw_tp_nest_level); +} + BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return ____bpf_perf_event_output(regs, map, flags, data, size); + ret = ____bpf_perf_event_output(regs, map, flags, data, size); + + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ - return bpf_get_stackid((unsigned long) regs, (unsigned long) map, - flags, 0, 0); + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, void *, buf, u32, size, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return bpf_get_stack((unsigned long) regs, (unsigned long) buf, - (unsigned long) size, flags, 0); + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { -- cgit