author		David S. Miller <davem@davemloft.net>	2020-01-27 14:31:40 +0100
committer	David S. Miller <davem@davemloft.net>	2020-01-27 14:31:40 +0100
commit		9e0703a2650def6fffe91235a880bb6f37d9ad74 (patch)
tree		0fdbc8cd4ce67c569c5ab55b19bab509eaa474ea /kernel
parent		c312840cd79061af37158cb42590931cfa364c1b (diff)
parent		82650dab9a5a2928c8d982cce5e3c687f14f8716 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:

====================
pull-request: bpf-next 2020-01-27

The following pull-request contains BPF updates for your *net-next* tree.

We've added 20 non-merge commits during the last 5 day(s) which contain
a total of 24 files changed, 433 insertions(+), 104 deletions(-).

The main changes are:

1) Make BPF trampolines and dispatcher aware for the stack unwinder, from Jiri Olsa.

2) Improve handling of failed CO-RE relocations in libbpf, from Andrii Nakryiko.

3) Several fixes to BPF sockmap and reuseport selftests, from Lorenz Bauer.

4) Various cleanups in BPF devmap's XDP flush code, from John Fastabend.

5) Fix BPF flow dissector when used with port ranges, from Yoshiki Komachi.

6) Fix bpffs' map_seq_next callback to always inc position index, from Vasily Averin.

7) Allow overriding LLVM tooling for runqslower utility, from Andrey Ignatov.

8) Silence false-positive lockdep splats in devmap hash lookup, from Amol Grover.

9) Fix fentry/fexit selftests to initialize a variable before use, from John Sperbeck.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/bpf/btf.c	16
-rw-r--r--	kernel/bpf/devmap.c	29
-rw-r--r--	kernel/bpf/dispatcher.c	4
-rw-r--r--	kernel/bpf/inode.c	3
-rw-r--r--	kernel/bpf/trampoline.c	80
-rw-r--r--	kernel/extable.c	7
6 files changed, 112 insertions, 27 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 32963b6d5a9c..b7c1660fb594 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3669,6 +3669,19 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
}
}
+static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+{
+ /* t comes in already as a pointer */
+ t = btf_type_by_id(btf, t->type);
+
+ /* allow const */
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
+ t = btf_type_by_id(btf, t->type);
+
+ /* char, signed char, unsigned char */
+ return btf_type_is_int(t) && t->size == 1;
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -3735,6 +3748,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
+ if (is_string_ptr(btf, t))
+ return true;
+
/* this is a pointer to another type */
info->reg_type = PTR_TO_BTF_ID;
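
The new is_string_ptr() helper walks the BTF type chain: dereference the pointer, skip an optional const modifier, and accept any 1-byte integer, which covers char, signed char and unsigned char. A hedged userspace sketch of the same walk using libbpf's BTF accessors; looks_like_string_ptr() is an illustrative name, not part of this patch:

#include <stdbool.h>
#include <bpf/btf.h>

static bool looks_like_string_ptr(const struct btf *btf, const struct btf_type *t)
{
	/* t is a BTF_KIND_PTR entry; resolve the pointed-to type */
	t = btf__type_by_id(btf, t->type);

	/* tolerate a const modifier, as the kernel helper does */
	if (btf_kind(t) == BTF_KIND_CONST)
		t = btf__type_by_id(btf, t->type);

	/* char, signed char and unsigned char are all 1-byte BTF_KIND_INTs */
	return btf_kind(t) == BTF_KIND_INT && t->size == 1;
}

In btf_ctx_access() this only changes the pointer case: an argument that resolves to a string pointer is now allowed as-is instead of being tagged PTR_TO_BTF_ID.
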
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index de630f980282..58bdca5d978a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -190,10 +190,12 @@ static void dev_map_free(struct bpf_map *map)
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further reads against netdev_map. It does __not__ ensure pending
- * flush operations (if any) are complete.
+ * disconnected from events. The following synchronize_rcu() guarantees
+ * both rcu read critical sections complete and waits for
+ * preempt-disable regions (NAPI being the relevant context here) so we
+ * are certain there will be no further reads against the netdev_map and
+ * all flush operations are complete. Flush operations can only be done
+ * from NAPI context for this reason.
*/
spin_lock(&dev_map_lock);
@@ -263,7 +265,8 @@ struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
struct hlist_head *head = dev_map_index_hash(dtab, key);
struct bpf_dtab_netdev *dev;
- hlist_for_each_entry_rcu(dev, head, index_hlist)
+ hlist_for_each_entry_rcu(dev, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock))
if (dev->idx == key)
return dev;
@@ -363,16 +366,17 @@ error:
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
* net device can be torn down. On devmap tear down we ensure the flush list
* is empty before completing to ensure all flush operations have completed.
+ * When drivers update the bpf program they may need to ensure any flush ops
+ * are also complete. Using synchronize_rcu or call_rcu will suffice for this
+ * because both wait for napi context to exit.
*/
void __dev_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
- rcu_read_lock();
list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
bq_xmit_all(bq, XDP_XMIT_FLUSH);
- rcu_read_unlock();
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -502,12 +506,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
/* Use call_rcu() here to ensure any rcu critical sections have
- * completed, but this does not guarantee a flush has happened
- * yet. Because driver side rcu_read_lock/unlock only protects the
- * running XDP program. However, for pending flush operations the
- * dev and ctx are stored in another per cpu map. And additionally,
- * the driver tear down ensures all soft irqs are complete before
- * removing the net device in the case of dev_put equals zero.
+ * completed as well as any flush operations because call_rcu
+ * will wait for preempt-disable region to complete, NAPI in this
+ * context. And additionally, the driver tear down ensures all
+ * soft irqs are complete before removing the net device in the
+ * case of dev_put equals zero.
*/
old_dev = xchg(&dtab->netdev_map[k], NULL);
if (old_dev)
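
The reworked comments all rest on one property of non-preemptible RCU: synchronize_rcu() and call_rcu() do not only wait for rcu_read_lock() sections, they also wait out preempt-disabled regions, and NAPI poll (where XDP flushes run) is such a region. A hedged kernel-style sketch of the update-side pattern; struct my_entry and my_slot_clear() are illustrative names, not devmap code:

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_entry {
	struct rcu_head rcu;
	/* ... payload ... */
};

static void my_entry_free(struct rcu_head *head)
{
	kfree(container_of(head, struct my_entry, rcu));
}

static void my_slot_clear(struct my_entry **slot)
{
	struct my_entry *old;

	/* unpublish the entry; readers may still hold the old pointer */
	old = xchg(slot, NULL);
	if (old)
		/* the callback runs only after rcu_read_lock() readers and
		 * NAPI's preempt-disabled region have finished, so any
		 * in-flight flush that still used the entry is complete
		 */
		call_rcu(&old->rcu, my_entry_free);
}

The same reasoning is why __dev_flush() can drop its rcu_read_lock/unlock pair: it already runs from NAPI context, which the waiters above account for.
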
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 204ee61a3904..b3e5b214fed8 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
noff = 0;
} else {
old = d->image + d->image_off;
- noff = d->image_off ^ (PAGE_SIZE / 2);
+ noff = d->image_off ^ (BPF_IMAGE_SIZE / 2);
}
new = d->num_progs ? d->image + noff : NULL;
@@ -140,7 +140,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_jit_alloc_exec_page();
+ d->image = bpf_image_alloc();
if (!d->image)
goto out;
}
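
The dispatcher keeps two copies of its jitted trampoline in one executable image and flips between the halves on every update; XORing the current offset with half the image size toggles between offset zero and the midpoint. A standalone sketch of that toggle, where IMAGE_SIZE is only a stand-in for the real BPF_IMAGE_SIZE:

#include <assert.h>
#include <stddef.h>

#define IMAGE_SIZE 4096u	/* stand-in value, not the real BPF_IMAGE_SIZE */

static size_t flip_half(size_t off)
{
	/* toggles between the two halves of the image */
	return off ^ (IMAGE_SIZE / 2);
}

int main(void)
{
	assert(flip_half(0) == IMAGE_SIZE / 2);
	assert(flip_half(IMAGE_SIZE / 2) == 0);
	return 0;
}

Switching the constant from PAGE_SIZE to BPF_IMAGE_SIZE matters because the image returned by bpf_image_alloc() no longer spans the whole page; part of the page is reserved for the tracking metadata introduced in trampoline.c below.
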
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e11059b99f18..bd2fd8eab470 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -196,6 +196,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
void *key = map_iter(m)->key;
void *prev_key;
+ (*pos)++;
if (map_iter(m)->done)
return NULL;
@@ -208,8 +209,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
map_iter(m)->done = true;
return NULL;
}
-
- ++(*pos);
return key;
}
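
The bpffs fix follows the seq_file convention that a ->next callback must advance *pos on every call, including the final one that returns NULL; the core uses *pos to resume reads, so leaving it untouched on the terminating call can repeat or lose a record across read() calls. A minimal sketch of the intended shape, where my_next_item() is an illustrative helper rather than a kernel API:

#include <linux/seq_file.h>

/* illustrative helper, not a kernel API */
static void *my_next_item(struct seq_file *m, void *v);

static void *my_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;			/* always bump the position first... */
	return my_next_item(m, v);	/* ...even when iteration is over */
}
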
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index eb64c245052b..6b264a92064b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -4,6 +4,7 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
+#include <linux/rbtree_latch.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -16,11 +17,12 @@ const struct bpf_prog_ops bpf_extension_prog_ops = {
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+static struct latch_tree_root image_tree __cacheline_aligned;
-/* serializes access to trampoline_table */
+/* serializes access to trampoline_table and image_tree */
static DEFINE_MUTEX(trampoline_mutex);
-void *bpf_jit_alloc_exec_page(void)
+static void *bpf_jit_alloc_exec_page(void)
{
void *image;
@@ -36,6 +38,64 @@ void *bpf_jit_alloc_exec_page(void)
return image;
}
+static __always_inline bool image_tree_less(struct latch_tree_node *a,
+ struct latch_tree_node *b)
+{
+ struct bpf_image *ia = container_of(a, struct bpf_image, tnode);
+ struct bpf_image *ib = container_of(b, struct bpf_image, tnode);
+
+ return ia < ib;
+}
+
+static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n)
+{
+ void *image = container_of(n, struct bpf_image, tnode);
+
+ if (addr < image)
+ return -1;
+ if (addr >= image + PAGE_SIZE)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops image_tree_ops = {
+ .less = image_tree_less,
+ .comp = image_tree_comp,
+};
+
+static void *__bpf_image_alloc(bool lock)
+{
+ struct bpf_image *image;
+
+ image = bpf_jit_alloc_exec_page();
+ if (!image)
+ return NULL;
+
+ if (lock)
+ mutex_lock(&trampoline_mutex);
+ latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops);
+ if (lock)
+ mutex_unlock(&trampoline_mutex);
+ return image->data;
+}
+
+void *bpf_image_alloc(void)
+{
+ return __bpf_image_alloc(true);
+}
+
+bool is_bpf_image_address(unsigned long addr)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL;
+ rcu_read_unlock();
+
+ return ret;
+}
+
struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
struct bpf_trampoline *tr;
@@ -56,7 +116,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
goto out;
/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
- image = bpf_jit_alloc_exec_page();
+ image = __bpf_image_alloc(false);
if (!image) {
kfree(tr);
tr = NULL;
@@ -131,14 +191,14 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
- * bytes on x86. Pick a number to fit into PAGE_SIZE / 2
+ * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2
*/
#define BPF_MAX_TRAMP_PROGS 40
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
- void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
- void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+ void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2;
+ void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2;
struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
@@ -174,7 +234,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
*/
synchronize_rcu_tasks();
- err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
+ err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2,
&tr->func.model, flags,
fentry, fentry_cnt,
fexit, fexit_cnt,
@@ -284,6 +344,8 @@ out:
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
+ struct bpf_image *image;
+
if (!tr)
return;
mutex_lock(&trampoline_mutex);
@@ -294,9 +356,11 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
goto out;
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
+ image = container_of(tr->image, struct bpf_image, data);
+ latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops);
/* wait for tasks to get out of trampoline before freeing it */
synchronize_rcu_tasks();
- bpf_jit_free_exec(tr->image);
+ bpf_jit_free_exec(image);
hlist_del(&tr->hlist);
kfree(tr);
out:
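
bpf_trampoline_put() now recovers the owning bpf_image from tr->image with container_of() so it can erase the latch-tree node and free the allocation from its true start. The layout this relies on, a latch_tree_node header followed by a flexible data[] array (with BPF_IMAGE_SIZE the page size minus that header), comes from the companion include/linux/bpf.h change, which sits outside this kernel/-limited diffstat. A standalone sketch of the pointer arithmetic, using stand-in struct names:

#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct tnode_stub { void *priv[4]; };	/* stand-in for latch_tree_node */

struct image_stub {			/* stand-in for struct bpf_image */
	struct tnode_stub tnode;
	unsigned char data[];
};

int main(void)
{
	static struct image_stub img;
	unsigned char *data = img.data;	/* what callers see as tr->image */

	/* the data pointer maps back to the header that owns it */
	assert(container_of(data, struct image_stub, data) == &img);
	return 0;
}

The latch tree populated at allocation time is what lets is_bpf_image_address() answer lookups locklessly from the unwinder path, mirroring what is_bpf_text_address() already does for regular BPF program text.
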
diff --git a/kernel/extable.c b/kernel/extable.c
index f6920a11e28a..a0024f27d3a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -131,8 +131,9 @@ int kernel_text_address(unsigned long addr)
* triggers a stack trace, or a WARN() that happens during
* coming back from idle, or cpu on or offlining.
*
- * is_module_text_address() as well as the kprobe slots
- * and is_bpf_text_address() require RCU to be watching.
+ * is_module_text_address() as well as the kprobe slots,
+ * is_bpf_text_address() and is_bpf_image_address require
+ * RCU to be watching.
*/
no_rcu = !rcu_is_watching();
@@ -148,6 +149,8 @@ int kernel_text_address(unsigned long addr)
goto out;
if (is_bpf_text_address(addr))
goto out;
+ if (is_bpf_image_address(addr))
+ goto out;
ret = 0;
out:
if (no_rcu)