summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2020-01-28 16:02:33 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2020-01-28 16:02:33 -0800
commit: bd2463ac7d7ec51d432f23bf0e893fb371a908cd (patch)
tree: 3da32c23be83adb9d9bda7e51b51fa39f69f2447 /kernel
parent: a78208e2436963d0b2c7d186277d6e1a9755029a (diff)
parent: f76e4c167ea2212e23c15ee7e601a865e822c291 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller: 1) Add WireGuard 2) Add HE and TWT support to ath11k driver, from John Crispin. 3) Add ESP in TCP encapsulation support, from Sabrina Dubroca. 4) Add variable window congestion control to TIPC, from Jon Maloy. 5) Add BCM84881 PHY driver, from Russell King. 6) Start adding netlink support for ethtool operations, from Michal Kubecek. 7) Add XDP drop and TX action support to ena driver, from Sameeh Jubran. 8) Add new ipv4 route notifications so that mlxsw driver does not have to handle identical routes itself. From Ido Schimmel. 9) Add BPF dynamic program extensions, from Alexei Starovoitov. 10) Support RX and TX timestamping in igc, from Vinicius Costa Gomes. 11) Add support for macsec HW offloading, from Antoine Tenart. 12) Add initial support for MPTCP protocol, from Christoph Paasch, Matthieu Baerts, Florian Westphal, Peter Krystad, and many others. 13) Add Octeontx2 PF support, from Sunil Goutham, Geetha sowjanya, Linu Cherian, and others. * git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1469 commits) net: phy: add default ARCH_BCM_IPROC for MDIO_BCM_IPROC udp: segment looped gso packets correctly netem: change mailing list qed: FW 8.42.2.0 debug features qed: rt init valid initialization changed qed: Debug feature: ilt and mdump qed: FW 8.42.2.0 Add fw overlay feature qed: FW 8.42.2.0 HSI changes qed: FW 8.42.2.0 iscsi/fcoe changes qed: Add abstraction for different hsi values per chip qed: FW 8.42.2.0 Additional ll2 type qed: Use dmae to write to widebus registers in fw_funcs qed: FW 8.42.2.0 Parser offsets modified qed: FW 8.42.2.0 Queue Manager changes qed: FW 8.42.2.0 Expose new registers and change windows qed: FW 8.42.2.0 Internal ram offsets modifications MAINTAINERS: Add entry for Marvell OcteonTX2 Physical Function driver Documentation: net: octeontx2: Add RVU HW and drivers overview octeontx2-pf: ethtool RSS config support octeontx2-pf: Add basic ethtool support ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/bpf/bpf_struct_ops.c634
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h9
-rw-r--r--kernel/bpf/btf.c504
-rw-r--r--kernel/bpf/cgroup.c97
-rw-r--r--kernel/bpf/core.c7
-rw-r--r--kernel/bpf/cpumap.c76
-rw-r--r--kernel/bpf/devmap.c190
-rw-r--r--kernel/bpf/dispatcher.c158
-rw-r--r--kernel/bpf/hashtab.c264
-rw-r--r--kernel/bpf/helpers.c12
-rw-r--r--kernel/bpf/inode.c46
-rw-r--r--kernel/bpf/map_in_map.c3
-rw-r--r--kernel/bpf/syscall.c695
-rw-r--r--kernel/bpf/trampoline.c157
-rw-r--r--kernel/bpf/verifier.c504
-rw-r--r--kernel/bpf/xskmap.c18
-rw-r--r--kernel/cgroup/cgroup.c5
-rw-r--r--kernel/extable.c7
-rw-r--r--kernel/trace/bpf_trace.c27
21 files changed, 2798 insertions, 621 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3f671bf617e8..046ce5d98033 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
@@ -26,3 +27,6 @@ endif
ifeq ($(CONFIG_SYSFS),y)
obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index f0d19bbb9211..95d77770353c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -503,6 +503,8 @@ const struct bpf_map_ops array_map_ops = {
.map_mmap = array_map_mmap,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
};
const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
new file mode 100644
index 000000000000..8ad1c9ea26b2
--- /dev/null
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -0,0 +1,634 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <linux/slab.h>
+#include <linux/numa.h>
+#include <linux/seq_file.h>
+#include <linux/refcount.h>
+#include <linux/mutex.h>
+
+/* Lifecycle state of the single value held by a struct_ops map. */
+enum bpf_struct_ops_state {
+ /* Nothing registered: lookup returns a zeroed value, update is allowed */
+ BPF_STRUCT_OPS_STATE_INIT,
+ /* Set by map update once st_ops->reg() has succeeded */
+ BPF_STRUCT_OPS_STATE_INUSE,
+ /* Set by map delete right before st_ops->unreg() is called */
+ BPF_STRUCT_OPS_STATE_TOBEFREE,
+};
+
+#define BPF_STRUCT_OPS_COMMON_VALUE \
+ refcount_t refcnt; \
+ enum bpf_struct_ops_state state
+
+struct bpf_struct_ops_value {
+ BPF_STRUCT_OPS_COMMON_VALUE;
+ char data[0] ____cacheline_aligned_in_smp;
+};
+
+struct bpf_struct_ops_map {
+ struct bpf_map map;
+ const struct bpf_struct_ops *st_ops;
+ /* protect map_update */
+ struct mutex lock;
+ /* progs has all the bpf_prog that is populated
+ * to the func ptr of the kernel's struct
+ * (in kvalue.data).
+ */
+ struct bpf_prog **progs;
+ /* image is a page that has all the trampolines
+ * that stores the func args before calling the bpf_prog.
+ * A PAGE_SIZE "image" is enough to store all trampoline for
+ * "progs[]".
+ */
+ void *image;
+ /* uvalue->data stores the kernel struct
+ * (e.g. tcp_congestion_ops) that is more useful
+ * to userspace than the kvalue. For example,
+ * the bpf_prog's id is stored instead of the kernel
+ * address of a func ptr.
+ */
+ struct bpf_struct_ops_value *uvalue;
+ /* kvalue.data stores the actual kernel's struct
+ * (e.g. tcp_congestion_ops) that will be
+ * registered to the kernel subsystem.
+ */
+ struct bpf_struct_ops_value kvalue;
+};
+
+#define VALUE_PREFIX "bpf_struct_ops_"
+#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
+
+/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
+ * the map's value exposed to the userspace and its btf-type-id is
+ * stored at the map->btf_vmlinux_value_type_id.
+ *
+ */
+#define BPF_STRUCT_OPS_TYPE(_name) \
+extern struct bpf_struct_ops bpf_##_name; \
+ \
+struct bpf_struct_ops_##_name { \
+ BPF_STRUCT_OPS_COMMON_VALUE; \
+ struct _name data ____cacheline_aligned_in_smp; \
+};
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+
+enum {
+#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+ __NR_BPF_STRUCT_OPS_TYPE,
+};
+
+static struct bpf_struct_ops * const bpf_struct_ops[] = {
+#define BPF_STRUCT_OPS_TYPE(_name) \
+ [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+};
+
+const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
+};
+
+const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
+};
+
+static const struct btf_type *module_type;
+
+/* Resolve every struct_ops type listed in bpf_struct_ops_types.h against
+ * @btf: look up the BTF ids of "struct <name>" and of its
+ * "struct bpf_struct_ops_<name>" value wrapper, distill a func model for
+ * each function-pointer member and call the type's ->init().
+ *
+ * A type that fails any step is reported with pr_warn() and skipped; its
+ * type_id/type/value_id/value_type are only assigned on full success.
+ */
+void bpf_struct_ops_init(struct btf *btf)
+{
+ s32 type_id, value_id, module_id;
+ const struct btf_member *member;
+ struct bpf_struct_ops *st_ops;
+ struct bpf_verifier_log log = {};
+ const struct btf_type *t;
+ char value_name[128];
+ const char *mname;
+ u32 i, j;
+
+ /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
+#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
+
+ /* Cache the BTF type of "struct module"; map update compares pointer
+ * members against it.
+ */
+ module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
+ if (module_id < 0) {
+ pr_warn("Cannot find struct module in btf_vmlinux\n");
+ return;
+ }
+ module_type = btf_type_by_id(btf, module_id);
+
+ for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+ st_ops = bpf_struct_ops[i];
+
+ /* Guarantees that the sprintf() below fits in value_name[] */
+ if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
+ sizeof(value_name)) {
+ pr_warn("struct_ops name %s is too long\n",
+ st_ops->name);
+ continue;
+ }
+ sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
+
+ value_id = btf_find_by_name_kind(btf, value_name,
+ BTF_KIND_STRUCT);
+ if (value_id < 0) {
+ pr_warn("Cannot find struct %s in btf_vmlinux\n",
+ value_name);
+ continue;
+ }
+
+ type_id = btf_find_by_name_kind(btf, st_ops->name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0) {
+ pr_warn("Cannot find struct %s in btf_vmlinux\n",
+ st_ops->name);
+ continue;
+ }
+ t = btf_type_by_id(btf, type_id);
+ /* func_models[] is indexed by member number below */
+ if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
+ pr_warn("Cannot support #%u members in struct %s\n",
+ btf_type_vlen(t), st_ops->name);
+ continue;
+ }
+
+ /* Any unsupported member breaks out early, leaving
+ * j < btf_type_vlen(t).
+ */
+ for_each_member(j, t, member) {
+ const struct btf_type *func_proto;
+
+ mname = btf_name_by_offset(btf, member->name_off);
+ if (!*mname) {
+ pr_warn("anon member in struct %s is not supported\n",
+ st_ops->name);
+ break;
+ }
+
+ if (btf_member_bitfield_size(t, member)) {
+ pr_warn("bit field member %s in struct %s is not supported\n",
+ mname, st_ops->name);
+ break;
+ }
+
+ /* Only function-pointer members get a func model */
+ func_proto = btf_type_resolve_func_ptr(btf,
+ member->type,
+ NULL);
+ if (func_proto &&
+ btf_distill_func_proto(&log, btf,
+ func_proto, mname,
+ &st_ops->func_models[j])) {
+ pr_warn("Error in parsing func ptr %s in struct %s\n",
+ mname, st_ops->name);
+ break;
+ }
+ }
+
+ /* All members were accepted: publish the ids unless ->init() fails */
+ if (j == btf_type_vlen(t)) {
+ if (st_ops->init(btf)) {
+ pr_warn("Error in init bpf_struct_ops %s\n",
+ st_ops->name);
+ } else {
+ st_ops->type_id = type_id;
+ st_ops->type = t;
+ st_ops->value_id = value_id;
+ st_ops->value_type = btf_type_by_id(btf,
+ value_id);
+ }
+ }
+ }
+}
+
+extern struct btf *btf_vmlinux;
+
+/* Return the struct_ops descriptor whose value wrapper has BTF id
+ * @value_id, or NULL when there is none.
+ */
+static const struct bpf_struct_ops *
+bpf_struct_ops_find_value(u32 value_id)
+{
+	const struct bpf_struct_ops *st_ops;
+	unsigned int idx = 0;
+
+	/* Reject id 0 and the case where vmlinux BTF is not available */
+	if (!btf_vmlinux || !value_id)
+		return NULL;
+
+	while (idx < ARRAY_SIZE(bpf_struct_ops)) {
+		st_ops = bpf_struct_ops[idx++];
+		if (st_ops->value_id == value_id)
+			return st_ops;
+	}
+
+	return NULL;
+}
+
+/* Return the struct_ops descriptor whose kernel struct has BTF id
+ * @type_id, or NULL when there is none.
+ */
+const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+{
+	const struct bpf_struct_ops *st_ops;
+	unsigned int idx = 0;
+
+	/* Reject id 0 and the case where vmlinux BTF is not available */
+	if (!btf_vmlinux || !type_id)
+		return NULL;
+
+	while (idx < ARRAY_SIZE(bpf_struct_ops)) {
+		st_ops = bpf_struct_ops[idx++];
+		if (st_ops->type_id == type_id)
+			return st_ops;
+	}
+
+	return NULL;
+}
+
+/* Key iteration: a NULL or non-zero @key yields key 0; asking for the
+ * key after 0 fails with -ENOENT.
+ */
+static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
+					   void *next_key)
+{
+	const u32 *cur = key;
+	u32 *next = next_key;
+
+	if (cur && !*cur)
+		return -ENOENT;
+
+	*next = 0;
+	return 0;
+}
+
+/* Copy the userspace view of the map's value into @value.
+ *
+ * Returns -ENOENT for any key other than 0. While the map is still in
+ * BPF_STRUCT_OPS_STATE_INIT the output is all zeroes; otherwise it is
+ * st_map->uvalue with the current state and refcnt filled in.
+ */
+int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
+				       void *value)
+{
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+	struct bpf_struct_ops_value *kval = &st_map->kvalue;
+	struct bpf_struct_ops_value *out = value;
+	enum bpf_struct_ops_state cur_state;
+
+	if (unlikely(*(u32 *)key != 0))
+		return -ENOENT;
+
+	/* Pair with smp_store_release() during map_update */
+	cur_state = smp_load_acquire(&kval->state);
+	if (cur_state != BPF_STRUCT_OPS_STATE_INIT) {
+		/* No lock is needed. state and refcnt do not need
+		 * to be updated together under atomic context.
+		 */
+		memcpy(out, st_map->uvalue, map->value_size);
+		out->state = cur_state;
+		refcount_set(&out->refcnt, refcount_read(&kval->refcnt));
+	} else {
+		memset(value, 0, map->value_size);
+	}
+
+	return 0;
+}
+
+/* ->map_lookup_elem hook: unconditionally fails with -EINVAL.
+ * NOTE(review): presumably lookups are meant to go through
+ * bpf_struct_ops_map_sys_lookup_elem() instead - confirm against the
+ * syscall code.
+ */
+static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+/* Drop the reference on every bpf_prog stored in st_map->progs[] and
+ * clear the slot, so calling this twice is harmless.
+ */
+static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
+{
+	const struct btf_type *t = st_map->st_ops->type;
+	u32 idx;
+
+	for (idx = 0; idx < btf_type_vlen(t); idx++) {
+		if (!st_map->progs[idx])
+			continue;
+
+		bpf_prog_put(st_map->progs[idx]);
+		st_map->progs[idx] = NULL;
+	}
+}
+
+/* Verify that every byte of @data not covered by a member of struct @t
+ * (gaps between members and trailing padding) is zero.
+ *
+ * Returns 0 when all holes are zero, -EINVAL when one is not, or the
+ * error from btf_resolve_size() when a member's size cannot be resolved.
+ */
+static int check_zero_holes(const struct btf_type *t, void *data)
+{
+	const struct btf_member *member;
+	const struct btf_type *mtype;
+	u32 idx, start, len, covered = 0;
+
+	for_each_member(idx, t, member) {
+		start = btf_member_bit_offset(t, member) / 8;
+		/* Gap between the previous member's end and this member */
+		if (start > covered &&
+		    memchr_inv(data + covered, 0, start - covered))
+			return -EINVAL;
+
+		mtype = btf_resolve_size(btf_vmlinux,
+					 btf_type_by_id(btf_vmlinux,
+							member->type),
+					 &len, NULL, NULL);
+		if (IS_ERR(mtype))
+			return PTR_ERR(mtype);
+
+		covered = start + len;
+	}
+
+	/* Padding after the last member */
+	if (covered < t->size &&
+	    memchr_inv(data + covered, 0, t->size - covered))
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Populate the map's single element from the userspace @value and register
+ * the resulting kernel struct with its subsystem through st_ops->reg().
+ *
+ * @flags must be 0 and @key must point to 0. The map must still be in
+ * BPF_STRUCT_OPS_STATE_INIT; on success it moves to
+ * BPF_STRUCT_OPS_STATE_INUSE.
+ *
+ * Returns 0 on success, -EINVAL for bad flags/value contents, -E2BIG for
+ * a non-zero key, -EBUSY when the map is not in the INIT state, or the
+ * error reported by ->init_member(), bpf_prog_get(),
+ * arch_prepare_bpf_trampoline() or ->reg(). On any failure after the lock
+ * is taken, the collected progs are released and uvalue/kvalue are zeroed.
+ */
+static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+					  void *value, u64 flags)
+{
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+	const struct bpf_struct_ops *st_ops = st_map->st_ops;
+	struct bpf_struct_ops_value *uvalue, *kvalue;
+	const struct btf_member *member;
+	const struct btf_type *t = st_ops->type;
+	void *udata, *kdata;
+	int prog_fd, err = 0;
+	void *image;
+	u32 i;
+
+	if (flags)
+		return -EINVAL;
+
+	if (*(u32 *)key != 0)
+		return -E2BIG;
+
+	/* Holes in both the value wrapper and the wrapped struct must be 0 */
+	err = check_zero_holes(st_ops->value_type, value);
+	if (err)
+		return err;
+
+	uvalue = (struct bpf_struct_ops_value *)value;
+	err = check_zero_holes(t, uvalue->data);
+	if (err)
+		return err;
+
+	/* state and refcnt are kernel-owned: userspace must pass them as 0 */
+	if (uvalue->state || refcount_read(&uvalue->refcnt))
+		return -EINVAL;
+
+	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
+	kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
+
+	mutex_lock(&st_map->lock);
+
+	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
+		err = -EBUSY;
+		goto unlock;
+	}
+
+	memcpy(uvalue, value, map->value_size);
+
+	udata = &uvalue->data;
+	kdata = &kvalue->data;
+	image = st_map->image;
+
+	for_each_member(i, t, member) {
+		const struct btf_type *mtype, *ptype;
+		struct bpf_prog *prog;
+		u32 moff;
+
+		moff = btf_member_bit_offset(t, member) / 8;
+		ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
+		if (ptype == module_type) {
+			/* Userspace must leave a "struct module *" member
+			 * NULL. err has to be set explicitly here: it is
+			 * otherwise still 0, or a positive ->init_member()
+			 * result from an earlier member, and the rejected
+			 * update would be reported as a success.
+			 */
+			if (*(void **)(udata + moff)) {
+				err = -EINVAL;
+				goto reset_unlock;
+			}
+			*(void **)(kdata + moff) = BPF_MODULE_OWNER;
+			continue;
+		}
+
+		err = st_ops->init_member(t, member, kdata, udata);
+		if (err < 0)
+			goto reset_unlock;
+
+		/* The ->init_member() has handled this member */
+		if (err > 0)
+			continue;
+
+		/* If st_ops->init_member does not handle it,
+		 * we will only handle func ptrs and zero-ed members
+		 * here. Reject everything else.
+		 */
+
+		/* All non func ptr member must be 0 */
+		if (!ptype || !btf_type_is_func_proto(ptype)) {
+			u32 msize;
+
+			mtype = btf_type_by_id(btf_vmlinux, member->type);
+			mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
+						 NULL, NULL);
+			if (IS_ERR(mtype)) {
+				err = PTR_ERR(mtype);
+				goto reset_unlock;
+			}
+
+			if (memchr_inv(udata + moff, 0, msize)) {
+				err = -EINVAL;
+				goto reset_unlock;
+			}
+
+			continue;
+		}
+
+		/* For a func ptr member, userspace stores a prog fd */
+		prog_fd = (int)(*(unsigned long *)(udata + moff));
+		/* Similar check as the attr->attach_prog_fd */
+		if (!prog_fd)
+			continue;
+
+		prog = bpf_prog_get(prog_fd);
+		if (IS_ERR(prog)) {
+			err = PTR_ERR(prog);
+			goto reset_unlock;
+		}
+		/* Stored before validation so that reset_unlock puts it */
+		st_map->progs[i] = prog;
+
+		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
+		    prog->aux->attach_btf_id != st_ops->type_id ||
+		    prog->expected_attach_type != i) {
+			err = -EINVAL;
+			goto reset_unlock;
+		}
+
+		err = arch_prepare_bpf_trampoline(image,
+						  st_map->image + PAGE_SIZE,
+						  &st_ops->func_models[i], 0,
+						  &prog, 1, NULL, 0, NULL);
+		if (err < 0)
+			goto reset_unlock;
+
+		/* A non-negative return is used as the trampoline's size */
+		*(void **)(kdata + moff) = image;
+		image += err;
+
+		/* put prog_id to udata */
+		*(unsigned long *)(udata + moff) = prog->aux->id;
+	}
+
+	refcount_set(&kvalue->refcnt, 1);
+	bpf_map_inc(map);
+
+	set_memory_ro((long)st_map->image, 1);
+	set_memory_x((long)st_map->image, 1);
+	err = st_ops->reg(kdata);
+	if (likely(!err)) {
+		/* Pair with smp_load_acquire() during lookup_elem().
+		 * It ensures the above udata updates (e.g. prog->aux->id)
+		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
+		 */
+		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
+		goto unlock;
+	}
+
+	/* Error during st_ops->reg(). It is very unlikely since
+	 * the above init_member() should have caught it earlier
+	 * before reg(). The only possibility is if there was a race
+	 * in registering the struct_ops (under the same name) to
+	 * a sub-system through different struct_ops's maps.
+	 */
+	set_memory_nx((long)st_map->image, 1);
+	set_memory_rw((long)st_map->image, 1);
+	bpf_map_put(map);
+
+reset_unlock:
+	bpf_struct_ops_map_put_progs(st_map);
+	memset(uvalue, 0, map->value_size);
+	memset(kvalue, 0, map->value_size);
+unlock:
+	mutex_unlock(&st_map->lock);
+	return err;
+}
+
+/* Unregister the struct_ops if it is currently in use.
+ *
+ * The INUSE -> TOBEFREE transition is done with cmpxchg(), so only the
+ * caller that wins it calls ->unreg() and drops the refcnt. Always
+ * returns 0, including when the map was not in use.
+ */
+static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+	enum bpf_struct_ops_state old_state;
+
+	old_state = cmpxchg(&st_map->kvalue.state,
+			    BPF_STRUCT_OPS_STATE_INUSE,
+			    BPF_STRUCT_OPS_STATE_TOBEFREE);
+	if (old_state != BPF_STRUCT_OPS_STATE_INUSE)
+		return 0;
+
+	st_map->st_ops->unreg(&st_map->kvalue.data);
+	if (refcount_dec_and_test(&st_map->kvalue.refcnt))
+		bpf_map_put(map);
+
+	return 0;
+}
+
+/* Print the element at @key to @m using the map's vmlinux BTF value type.
+ * Prints nothing if the temporary buffer cannot be allocated or the
+ * lookup fails.
+ */
+static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
+					     struct seq_file *m)
+{
+	void *buf;
+
+	buf = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return;
+
+	if (!bpf_struct_ops_map_sys_lookup_elem(map, key, buf)) {
+		btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
+				  buf, m);
+		seq_puts(m, "\n");
+	}
+
+	kfree(buf);
+}
+
+/* Release everything owned by the map: prog references, the progs[]
+ * array, the trampoline image, the uvalue buffer and the map itself.
+ * The progs[] check lets this run on a partially allocated map.
+ */
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map;
+
+	st_map = (struct bpf_struct_ops_map *)map;
+
+	if (st_map->progs)
+		bpf_struct_ops_map_put_progs(st_map);
+
+	bpf_map_area_free(st_map->progs);
+	bpf_jit_free_exec(st_map->image);
+	bpf_map_area_free(st_map->uvalue);
+	bpf_map_area_free(st_map);
+}
+
+/* Validate creation attributes: u32-sized key, exactly one entry, no
+ * map flags, and a vmlinux BTF value type id. Returns 0 or -EINVAL.
+ */
+static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
+{
+	if (attr->key_size != sizeof(unsigned int))
+		return -EINVAL;
+
+	if (attr->max_entries != 1)
+		return -EINVAL;
+
+	if (attr->map_flags)
+		return -EINVAL;
+
+	if (!attr->btf_vmlinux_value_type_id)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Allocate a struct_ops map for the type identified by
+ * attr->btf_vmlinux_value_type_id.
+ *
+ * Besides the map itself (which embeds kvalue), this allocates the uvalue
+ * copy, the progs[] array (one slot per member of the kernel struct) and
+ * a PAGE_SIZE executable image for the trampolines.
+ *
+ * Returns the new map, or ERR_PTR(-EPERM) without CAP_SYS_ADMIN,
+ * ERR_PTR(-ENOTSUPP) for an unknown value type id, ERR_PTR(-EINVAL) when
+ * attr->value_size does not match the value type, the error from
+ * bpf_map_charge_init(), or ERR_PTR(-ENOMEM).
+ */
+static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
+{
+	const struct bpf_struct_ops *st_ops;
+	size_t map_total_size, st_map_size;
+	struct bpf_struct_ops_map *st_map;
+	const struct btf_type *t, *vt;
+	struct bpf_map_memory mem;
+	struct bpf_map *map;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
+	if (!st_ops)
+		return ERR_PTR(-ENOTSUPP);
+
+	vt = st_ops->value_type;
+	if (attr->value_size != vt->size)
+		return ERR_PTR(-EINVAL);
+
+	t = st_ops->type;
+
+	st_map_size = sizeof(*st_map) +
+		/* kvalue stores the
+		 * struct bpf_struct_ops_tcp_congestions_ops
+		 */
+		(vt->size - sizeof(struct bpf_struct_ops_value));
+	map_total_size = st_map_size +
+		/* uvalue: charge the whole buffer allocated below, i.e.
+		 * vt->size bytes, not sizeof() of the size field itself
+		 */
+		vt->size +
+		/* struct bpf_progs **progs */
+		btf_type_vlen(t) * sizeof(struct bpf_prog *);
+	err = bpf_map_charge_init(&mem, map_total_size);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
+	if (!st_map) {
+		bpf_map_charge_finish(&mem);
+		return ERR_PTR(-ENOMEM);
+	}
+	/* st_ops must be set before any call to bpf_struct_ops_map_free() */
+	st_map->st_ops = st_ops;
+	map = &st_map->map;
+
+	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
+	st_map->progs =
+		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
+				   NUMA_NO_NODE);
+	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
+	if (!st_map->uvalue || !st_map->progs || !st_map->image) {
+		/* Frees whichever of the three allocations succeeded */
+		bpf_struct_ops_map_free(map);
+		bpf_map_charge_finish(&mem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mutex_init(&st_map->lock);
+	set_vm_flush_reset_perms(st_map->image);
+	bpf_map_init_from_attr(map, attr);
+	bpf_map_charge_move(&map->memory, &mem);
+
+	return map;
+}
+
+const struct bpf_map_ops bpf_struct_ops_map_ops = {
+ .map_alloc_check = bpf_struct_ops_map_alloc_check,
+ .map_alloc = bpf_struct_ops_map_alloc,
+ .map_free = bpf_struct_ops_map_free,
+ .map_get_next_key = bpf_struct_ops_map_get_next_key,
+ .map_lookup_elem = bpf_struct_ops_map_lookup_elem,
+ .map_delete_elem = bpf_struct_ops_map_delete_elem,
+ .map_update_elem = bpf_struct_ops_map_update_elem,
+ .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+};
+
+/* Take a reference on the struct_ops value that embeds @kdata, unless its
+ * refcnt has already dropped to zero. Returns true when a reference was
+ * taken.
+ *
+ * The parameter is "const void *" because some subsystem is passing a
+ * const (e.g. const struct tcp_congestion_ops *).
+ */
+bool bpf_struct_ops_get(const void *kdata)
+{
+	struct bpf_struct_ops_value *kvalue =
+		container_of(kdata, struct bpf_struct_ops_value, data);
+
+	return refcount_inc_not_zero(&kvalue->refcnt);
+}
+
+/* Drop a reference on the struct_ops value that embeds @kdata. When the
+ * last reference goes away, also put the owning map.
+ */
+void bpf_struct_ops_put(const void *kdata)
+{
+	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
+
+	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+	if (!refcount_dec_and_test(&kvalue->refcnt))
+		return;
+
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+	bpf_map_put(&st_map->map);
+}
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
new file mode 100644
index 000000000000..066d83ea1c99
--- /dev/null
+++ b/kernel/bpf/bpf_struct_ops_types.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* internal file - do not include directly */
+
+#ifdef CONFIG_BPF_JIT
+#ifdef CONFIG_INET
+#include <net/tcp.h>
+BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
+#endif
+#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ed2075884724..b7c1660fb594 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -180,11 +180,6 @@
*/
#define BTF_MAX_SIZE (16 * 1024 * 1024)
-#define for_each_member(i, struct_type, member) \
- for (i = 0, member = btf_type_member(struct_type); \
- i < btf_type_vlen(struct_type); \
- i++, member++)
-
#define for_each_member_from(i, from, struct_type, member) \
for (i = from, member = btf_type_member(struct_type) + from; \
i < btf_type_vlen(struct_type); \
@@ -281,6 +276,11 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = "DATASEC",
};
+/* Name of @t's BTF kind, taken from btf_kind_str[]. */
+static const char *btf_type_str(const struct btf_type *t)
+{
+	u32 kind = BTF_INFO_KIND(t->info);
+
+	return btf_kind_str[kind];
+}
+
struct btf_kind_operations {
s32 (*check_meta)(struct btf_verifier_env *env,
const struct btf_type *t,
@@ -382,6 +382,65 @@ static bool btf_type_is_datasec(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
+/* Linear search of @btf for a type of the given @kind named @name.
+ * Returns the id of the first match, or -ENOENT.
+ */
+s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
+{
+	u32 id;
+
+	/* The scan covers ids 1..nr_types inclusive */
+	for (id = 1; id <= btf->nr_types; id++) {
+		const struct btf_type *t = btf->types[id];
+
+		if (BTF_INFO_KIND(t->info) == kind &&
+		    !strcmp(btf_name_by_offset(btf, t->name_off), name))
+			return id;
+	}
+
+	return -ENOENT;
+}
+
+/* Follow ->type links from @id for as long as the current type is a
+ * modifier. Returns the first non-modifier type; if @res_id is not NULL
+ * it receives that type's id.
+ */
+const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
+					       u32 id, u32 *res_id)
+{
+	const struct btf_type *t;
+
+	for (t = btf_type_by_id(btf, id); btf_type_is_modifier(t);
+	     t = btf_type_by_id(btf, id))
+		id = t->type;
+
+	if (res_id)
+		*res_id = id;
+
+	return t;
+}
+
+/* If @id, after skipping modifiers, is a pointer, return the pointed-to
+ * type (again with modifiers skipped) and store its id in @res_id when
+ * that is not NULL. Returns NULL when @id is not a pointer.
+ */
+const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
+					    u32 id, u32 *res_id)
+{
+	const struct btf_type *ptr = btf_type_skip_modifiers(btf, id, NULL);
+
+	if (btf_type_is_ptr(ptr))
+		return btf_type_skip_modifiers(btf, ptr->type, res_id);
+
+	return NULL;
+}
+
+/* Like btf_type_resolve_ptr(), but only succeeds when the pointed-to type
+ * is a function prototype; returns NULL otherwise.
+ */
+const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
+						 u32 id, u32 *res_id)
+{
+	const struct btf_type *target = btf_type_resolve_ptr(btf, id, res_id);
+
+	if (!target || !btf_type_is_func_proto(target))
+		return NULL;
+
+	return target;
+}
+
/* Types that act only as a source, not sink or intermediate
* type when resolving.
*/
@@ -446,30 +505,6 @@ static const char *btf_int_encoding_str(u8 encoding)
return "UNKN";
}
-static u16 btf_type_vlen(const struct btf_type *t)
-{
- return BTF_INFO_VLEN(t->info);
-}
-
-static bool btf_type_kflag(const struct btf_type *t)
-{
- return BTF_INFO_KFLAG(t->info);
-}
-
-static u32 btf_member_bit_offset(const struct btf_type *struct_type,
- const struct btf_member *member)
-{
- return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
- : member->offset;
-}
-
-static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
- const struct btf_member *member)
-{
- return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
- : 0;
-}
-
static u32 btf_type_int(const struct btf_type *t)
{
return *(u32 *)(t + 1);
@@ -480,11 +515,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t)
return (const struct btf_array *)(t + 1);
}
-static const struct btf_member *btf_type_member(const struct btf_type *t)
-{
- return (const struct btf_member *)(t + 1);
-}
-
static const struct btf_enum *btf_type_enum(const struct btf_type *t)
{
return (const struct btf_enum *)(t + 1);
@@ -1057,7 +1087,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
* *elem_type: same as return type ("struct X")
* *total_nelems: 1
*/
-static const struct btf_type *
+const struct btf_type *
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
u32 *type_size, const struct btf_type **elem_type,
u32 *total_nelems)
@@ -1111,8 +1141,10 @@ resolved:
return ERR_PTR(-EINVAL);
*type_size = nelems * size;
- *total_nelems = nelems;
- *elem_type = type;
+ if (total_nelems)
+ *total_nelems = nelems;
+ if (elem_type)
+ *elem_type = type;
return array_type ? : type;
}
@@ -1826,7 +1858,10 @@ static void btf_modifier_seq_show(const struct btf *btf,
u32 type_id, void *data,
u8 bits_offset, struct seq_file *m)
{
- t = btf_type_id_resolve(btf, &type_id);
+ if (btf->resolved_ids)
+ t = btf_type_id_resolve(btf, &type_id);
+ else
+ t = btf_type_skip_modifiers(btf, type_id, NULL);
btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
}
@@ -2621,8 +2656,8 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_vlen(t)) {
- btf_verifier_log_type(env, t, "vlen != 0");
+ if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) {
+ btf_verifier_log_type(env, t, "Invalid func linkage");
return -EINVAL;
}
@@ -3476,7 +3511,8 @@ static u8 bpf_ctx_convert_map[] = {
static const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
- const struct btf_type *t, enum bpf_prog_type prog_type)
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
{
const struct btf_type *conv_struct;
const struct btf_type *ctx_struct;
@@ -3497,12 +3533,13 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- bpf_log(log, "BPF program ctx type is not a struct\n");
+ if (log->level & BPF_LOG_LEVEL)
+ bpf_log(log, "arg#%d type is not a struct\n", arg);
return NULL;
}
tname = btf_name_by_offset(btf, t->name_off);
if (!tname) {
- bpf_log(log, "BPF program ctx struct doesn't have a name\n");
+ bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
return NULL;
}
/* prog_type is valid bpf program type. No need for bounds check. */
@@ -3535,11 +3572,12 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
- enum bpf_prog_type prog_type)
+ enum bpf_prog_type prog_type,
+ int arg)
{
const struct btf_member *prog_ctx_type, *kern_ctx_type;
- prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type);
+ prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
if (!prog_ctx_type)
return -ENOENT;
kern_ctx_type = prog_ctx_type + 1;
@@ -3605,6 +3643,8 @@ struct btf *btf_parse_vmlinux(void)
goto errout;
}
+ bpf_struct_ops_init(btf);
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
@@ -3629,6 +3669,19 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
}
}
+/* @t is already known to be a pointer. Returns true when it points to a
+ * one-byte integer (char, signed char, unsigned char), optionally behind
+ * a single const qualifier.
+ */
+static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+{
+	const struct btf_type *pointee = btf_type_by_id(btf, t->type);
+
+	/* allow const */
+	if (BTF_INFO_KIND(pointee->info) == BTF_KIND_CONST)
+		pointee = btf_type_by_id(btf, pointee->type);
+
+	return btf_type_is_int(pointee) && pointee->size == 1;
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -3677,7 +3730,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t))
+ if (btf_type_is_int(t) || btf_type_is_enum(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -3695,12 +3748,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
+ if (is_string_ptr(btf, t))
+ return true;
+
/* this is a pointer to another type */
info->reg_type = PTR_TO_BTF_ID;
- info->btf_id = t->type;
if (tgt_prog) {
- ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type);
+ ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
if (ret > 0) {
info->btf_id = ret;
return true;
@@ -3708,10 +3763,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return false;
}
}
+
+ info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
/* skip modifiers */
- while (btf_type_is_modifier(t))
+ while (btf_type_is_modifier(t)) {
+ info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
+ }
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
@@ -3737,23 +3796,57 @@ int btf_struct_access(struct bpf_verifier_log *log,
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
if (!btf_type_is_struct(t)) {
- bpf_log(log, "Type '%s' is not a struct", tname);
+ bpf_log(log, "Type '%s' is not a struct\n", tname);
return -EINVAL;
}
- for_each_member(i, t, member) {
- if (btf_member_bitfield_size(t, member))
- /* bitfields are not supported yet */
- continue;
+ if (off + size > t->size) {
+ bpf_log(log, "access beyond struct %s at off %u size %u\n",
+ tname, off, size);
+ return -EACCES;
+ }
+ for_each_member(i, t, member) {
/* offset of the field in bytes */
moff = btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
+
+ if (btf_member_bitfield_size(t, member)) {
+ u32 end_bit = btf_member_bit_offset(t, member) +
+ btf_member_bitfield_size(t, member);
+
+ /* off <= moff instead of off == moff because clang
+ * does not generate a BTF member for anonymous
+ * bitfield like the ":16" here:
+ * struct {
+ * int :16;
+ * int x:8;
+ * };
+ */
+ if (off <= moff &&
+ BITS_ROUNDUP_BYTES(end_bit) <= off + size)
+ return SCALAR_VALUE;
+
+ /* off may be accessing a following member
+ *
+ * or
+ *
+ * Doing partial access at either end of this
+ * bitfield. Continue on this case also to
+ * treat it as not accessing this bitfield
+ * and eventually error out as field not
+ * found to keep it simple.
+ * It could be relaxed if there was a legit
+ * partial access case later.
+ */
+ continue;
+ }
+
/* In case of "off" is pointing to holes of a struct */
if (off < moff)
- continue;
+ break;
/* type of the field */
mtype = btf_type_by_id(btf_vmlinux, member->type);
@@ -4043,11 +4136,158 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return 0;
}
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
+/* Compare BTFs of two functions assuming only scalars and pointers to context.
+ * @log: verifier log that receives the description of a mismatch
+ * @btf1: BTF that t1 belongs to
+ * @t1: points to BTF_KIND_FUNC in btf1
+ * @btf2: BTF that t2 belongs to
+ * @t2: points to BTF_KIND_FUNC in btf2
+ *
+ * Checks, in order: both functions have BTF_FUNC_GLOBAL linkage, both
+ * reference a FUNC_PROTO, the argument counts are equal, the return types
+ * have the same 'info' word once modifiers are skipped, and every argument
+ * pair matches. An argument pair matches when the 'info' words (and the
+ * sizes, for sized types) are equal and the type is an int, an enum, or a
+ * pointer to a struct that carries the same name on both sides.
+ *
+ * Returns:
+ * -EINVAL - function prototype mismatch
+ * -EFAULT - verifier bug
+ * 0 - 99% match. The last 1% is validated by the verifier.
+ */
+int btf_check_func_type_match(struct bpf_verifier_log *log,
+ struct btf *btf1, const struct btf_type *t1,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ const struct btf_param *args1, *args2;
+ const char *fn1, *fn2, *s1, *s2;
+ u32 nargs1, nargs2, i;
+
+ /* function names are only used in the log messages below */
+ fn1 = btf_name_by_offset(btf1, t1->name_off);
+ fn2 = btf_name_by_offset(btf2, t2->name_off);
+
+ if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn1);
+ return -EINVAL;
+ }
+ if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn2);
+ return -EINVAL;
+ }
+
+ /* step from each FUNC to the type it references; it must be a FUNC_PROTO */
+ t1 = btf_type_by_id(btf1, t1->type);
+ if (!t1 || !btf_type_is_func_proto(t1))
+ return -EFAULT;
+ t2 = btf_type_by_id(btf2, t2->type);
+ if (!t2 || !btf_type_is_func_proto(t2))
+ return -EFAULT;
+
+ /* the parameter array is read from the memory right after the proto */
+ args1 = (const struct btf_param *)(t1 + 1);
+ nargs1 = btf_type_vlen(t1);
+ args2 = (const struct btf_param *)(t2 + 1);
+ nargs2 = btf_type_vlen(t2);
+
+ if (nargs1 != nargs2) {
+ bpf_log(log, "%s() has %d args while %s() has %d args\n",
+ fn1, nargs1, fn2, nargs2);
+ return -EINVAL;
+ }
+
+ /* compare return types; from here on t1/t2 no longer point to the
+ * protos, args1/args2 and nargs1 were captured above
+ */
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (t1->info != t2->info) {
+ bpf_log(log,
+ "Return type %s of %s() doesn't match type %s of %s()\n",
+ btf_type_str(t1), fn1,
+ btf_type_str(t2), fn2);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nargs1; i++) {
+ t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL);
+
+ if (t1->info != t2->info) {
+ bpf_log(log, "arg%d in %s() is %s while %s() has %s\n",
+ i, fn1, btf_type_str(t1),
+ fn2, btf_type_str(t2));
+ return -EINVAL;
+ }
+ if (btf_type_has_size(t1) && t1->size != t2->size) {
+ bpf_log(log,
+ "arg%d in %s() has size %d while %s() has %d\n",
+ i, fn1, t1->size,
+ fn2, t2->size);
+ return -EINVAL;
+ }
+
+ /* global functions are validated with scalars and pointers
+ * to context only. And only global functions can be replaced.
+ * Hence type check only those types.
+ */
+ if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+ continue;
+ if (!btf_type_is_ptr(t1)) {
+ bpf_log(log,
+ "arg%d in %s() has unrecognized type\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ /* follow both pointers to the pointed-to types; only t1 was
+ * kind-checked above, the 'info' words were already found equal
+ */
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (!btf_type_is_struct(t1)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ if (!btf_type_is_struct(t2)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn2);
+ return -EINVAL;
+ }
+ /* This is an optional check to make program writing easier.
+ * Compare names of structs and report an error to the user.
+ * btf_prepare_func_args() already checked that t2 struct
+ * is a context type. btf_prepare_func_args() will check
+ * later that t1 struct is a context type as well.
+ */
+ s1 = btf_name_by_offset(btf1, t1->name_off);
+ s2 = btf_name_by_offset(btf2, t2->name_off);
+ if (strcmp(s1, s2)) {
+ bpf_log(log,
+ "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n",
+ i, fn1, s1, fn2, s2);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/* Compare BTFs of given program with BTF of target program
+ * @env: verifier environment; its log receives the error messages
+ * @prog: program whose first func_info entry (index 0) is compared
+ * @btf2: BTF that t2 belongs to
+ * @t2: type in btf2, handed unchanged to btf_check_func_type_match()
+ *
+ * Returns:
+ * -EINVAL - prog has no func_info
+ * -EFAULT - func_info[0] has no type id, or the id is not a BTF_KIND_FUNC
+ * otherwise the result of btf_check_func_type_match()
+ */
+int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ struct btf *btf1 = prog->aux->btf;
+ const struct btf_type *t1;
+ u32 btf_id = 0;
+
+ if (!prog->aux->func_info) {
+ bpf_log(&env->log, "Program extension requires BTF\n");
+ return -EINVAL;
+ }
+
+ /* only entry 0 of func_info is looked at */
+ btf_id = prog->aux->func_info[0].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ t1 = btf_type_by_id(btf1, btf_id);
+ if (!t1 || !btf_type_is_func(t1))
+ return -EFAULT;
+
+ return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2);
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *st = env->cur_state;
- struct bpf_func_state *func = st->frame[st->curframe];
- struct bpf_reg_state *reg = func->regs;
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
struct btf *btf = prog->aux->btf;
@@ -4057,27 +4297,30 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
const char *tname;
if (!prog->aux->func_info)
- return 0;
+ return -EINVAL;
btf_id = prog->aux->func_info[subprog].type_id;
if (!btf_id)
- return 0;
+ return -EFAULT;
if (prog->aux->func_info_aux[subprog].unreliable)
- return 0;
+ return -EINVAL;
t = btf_type_by_id(btf, btf_id);
if (!t || !btf_type_is_func(t)) {
- bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n",
+ /* These checks were already done by the verifier while loading
+ * struct bpf_func_info
+ */
+ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
subprog);
- return -EINVAL;
+ return -EFAULT;
}
tname = btf_name_by_offset(btf, t->name_off);
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid type of func %s\n", tname);
- return -EINVAL;
+ bpf_log(log, "Invalid BTF of func %s\n", tname);
+ return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
@@ -4103,25 +4346,130 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
bpf_log(log, "R%d is not a pointer\n", i + 1);
goto out;
}
- /* If program is passing PTR_TO_CTX into subprogram
- * check that BTF type matches.
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
*/
- if (reg[i + 1].type == PTR_TO_CTX &&
- !btf_get_prog_ctx_type(log, btf, t, prog->type))
- goto out;
- /* All other pointers are ok */
- continue;
+ if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
+ if (reg[i + 1].type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ goto out;
+ }
+ if (check_ctx_reg(env, &reg[i + 1], i + 1))
+ goto out;
+ continue;
+ }
}
- bpf_log(log, "Unrecognized argument type %s\n",
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ bpf_log(log, "Unrecognized arg#%d type %s\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)]);
goto out;
}
return 0;
out:
- /* LLVM optimizations can remove arguments from static functions. */
- bpf_log(log,
- "Type info disagrees with actual arguments due to compiler optimizations\n");
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
prog->aux->func_info_aux[subprog].unreliable = true;
+ return -EINVAL;
+}
+
+/* Convert BTF of a function into bpf_reg_state if possible
+ * @env: verifier environment; supplies the program and the log
+ * @subprog: index into prog->aux->func_info and prog->aux->func_info_aux
+ * @reg: register states to fill in; argument i sets reg[i + 1].type
+ *
+ * The function must have BTF_FUNC_GLOBAL linkage, at most 5 arguments and
+ * an int or enum return type. Only the 'type' member of the register
+ * states is written here.
+ *
+ * Returns:
+ * -EFAULT - there is a verifier bug. Abort verification.
+ * -EINVAL - cannot convert BTF.
+ * 0 - Successfully converted BTF into bpf_reg_state
+ * (either PTR_TO_CTX or SCALAR_VALUE).
+ */
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct bpf_prog *prog = env->prog;
+ enum bpf_prog_type prog_type = prog->type;
+ struct btf *btf = prog->aux->btf;
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 i, nargs, btf_id;
+ const char *tname;
+
+ if (!prog->aux->func_info ||
+ prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "Verifier bug\n");
+ return -EFAULT;
+ }
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id) {
+ bpf_log(log, "Global functions need valid BTF\n");
+ return -EFAULT;
+ }
+
+ t = btf_type_by_id(btf, btf_id);
+ if (!t || !btf_type_is_func(t)) {
+ /* These checks were already done by the verifier while loading
+ * struct bpf_func_info
+ */
+ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
+ subprog);
+ return -EFAULT;
+ }
+ tname = btf_name_by_offset(btf, t->name_off);
+
+ if (log->level & BPF_LOG_LEVEL)
+ bpf_log(log, "Validating %s() func#%d...\n",
+ tname, subprog);
+
+ if (prog->aux->func_info_aux[subprog].unreliable) {
+ bpf_log(log, "Verifier bug in function %s()\n", tname);
+ return -EFAULT;
+ }
+ /* for an extension program the context type lookup below uses the
+ * type of the program it is linked to
+ */
+ if (prog_type == BPF_PROG_TYPE_EXT)
+ prog_type = prog->aux->linked_prog->type;
+
+ /* step from the FUNC to the type it references; must be a FUNC_PROTO */
+ t = btf_type_by_id(btf, t->type);
+ if (!t || !btf_type_is_func_proto(t)) {
+ bpf_log(log, "Invalid type of function %s()\n", tname);
+ return -EFAULT;
+ }
+ args = (const struct btf_param *)(t + 1);
+ nargs = btf_type_vlen(t);
+ if (nargs > 5) {
+ bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
+ tname, nargs);
+ return -EINVAL;
+ }
+ /* check that function returns int */
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+ bpf_log(log,
+ "Global function %s() doesn't return scalar. Only those are supported.\n",
+ tname);
+ return -EINVAL;
+ }
+ /* Convert BTF function arguments into verifier types.
+ * Only PTR_TO_CTX and SCALAR are supported at the moment.
+ * Argument i is recorded in reg[i + 1]; reg[0] is not touched.
+ */
+ for (i = 0; i < nargs; i++) {
+ t = btf_type_by_id(btf, args[i].type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ reg[i + 1].type = SCALAR_VALUE;
+ continue;
+ }
+ if (btf_type_is_ptr(t) &&
+ btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ reg[i + 1].type = PTR_TO_CTX;
+ continue;
+ }
+ bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
+ i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ return -EINVAL;
+ }
return 0;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9e43b72eb619..9a500fadbef5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -106,8 +106,7 @@ static u32 prog_list_length(struct list_head *head)
* if parent has overridable or multi-prog, allow attaching
*/
static bool hierarchy_allows_attach(struct cgroup *cgrp,
- enum bpf_attach_type type,
- u32 new_flags)
+ enum bpf_attach_type type)
{
struct cgroup *p;
@@ -290,31 +289,34 @@ cleanup:
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to attach
+ * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
* @type: Type of attach operation
* @flags: Option flags
*
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_prog *replace_prog,
enum bpf_attach_type type, u32 flags)
{
+ u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+ struct bpf_prog_list *pl, *replace_pl = NULL;
enum bpf_cgroup_storage_type stype;
- struct bpf_prog_list *pl;
- bool pl_was_allocated;
int err;
- if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
+ ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
/* invalid combination */
return -EINVAL;
- if (!hierarchy_allows_attach(cgrp, type, flags))
+ if (!hierarchy_allows_attach(cgrp, type))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -324,6 +326,21 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+ if (pl->prog == replace_prog)
+ replace_pl = pl;
+ }
+ if ((flags & BPF_F_REPLACE) && !replace_pl)
+ /* prog to replace not found for cgroup */
+ return -ENOENT;
+ } else if (!list_empty(progs)) {
+ replace_pl = list_first_entry(progs, typeof(*pl), node);
+ }
+
for_each_cgroup_storage_type(stype) {
storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
if (IS_ERR(storage[stype])) {
@@ -334,53 +351,28 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
}
}
- if (flags & BPF_F_ALLOW_MULTI) {
- list_for_each_entry(pl, progs, node) {
- if (pl->prog == prog) {
- /* disallow attaching the same prog twice */
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
- return -EINVAL;
- }
+ if (replace_pl) {
+ pl = replace_pl;
+ old_prog = pl->prog;
+ for_each_cgroup_storage_type(stype) {
+ old_storage[stype] = pl->storage[stype];
+ bpf_cgroup_storage_unlink(old_storage[stype]);
}
-
+ } else {
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
for_each_cgroup_storage_type(stype)
bpf_cgroup_storage_free(storage[stype]);
return -ENOMEM;
}
-
- pl_was_allocated = true;
- pl->prog = prog;
- for_each_cgroup_storage_type(stype)
- pl->storage[stype] = storage[stype];
list_add_tail(&pl->node, progs);
- } else {
- if (list_empty(progs)) {
- pl = kmalloc(sizeof(*pl), GFP_KERNEL);
- if (!pl) {
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
- return -ENOMEM;
- }
- pl_was_allocated = true;
- list_add_tail(&pl->node, progs);
- } else {
- pl = list_first_entry(progs, typeof(*pl), node);
- old_prog = pl->prog;
- for_each_cgroup_storage_type(stype) {
- old_storage[stype] = pl->storage[stype];
- bpf_cgroup_storage_unlink(old_storage[stype]);
- }
- pl_was_allocated = false;
- }
- pl->prog = prog;
- for_each_cgroup_storage_type(stype)
- pl->storage[stype] = storage[stype];
}
- cgrp->bpf.flags[type] = flags;
+ pl->prog = prog;
+ for_each_cgroup_storage_type(stype)
+ pl->storage[stype] = storage[stype];
+
+ cgrp->bpf.flags[type] = saved_flags;
err = update_effective_progs(cgrp, type);
if (err)
@@ -408,7 +400,7 @@ cleanup:
pl->storage[stype] = old_storage[stype];
bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
}
- if (pl_was_allocated) {
+ if (!replace_pl) {
list_del(&pl->node);
kfree(pl);
}
@@ -546,6 +538,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
+ struct bpf_prog *replace_prog = NULL;
struct cgroup *cgrp;
int ret;
@@ -553,8 +546,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
- ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
+ (attr->attach_flags & BPF_F_REPLACE)) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
+ if (IS_ERR(replace_prog)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(replace_prog);
+ }
+ }
+
+ ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
attr->attach_flags);
+
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
cgroup_put(cgrp);
return ret;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index af6b738cf435..973a20d49749 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
u32 pages, delta;
int ret;
- BUG_ON(fp_old == NULL);
-
size = round_up(size, PAGE_SIZE);
pages = size / PAGE_SIZE;
if (pages <= fp_old->pages)
@@ -520,9 +518,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
#ifdef CONFIG_BPF_JIT
/* All BPF JIT sysctl knobs here. */
-int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
+int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
-int bpf_jit_kallsyms __read_mostly;
long bpf_jit_limit __read_mostly;
static __always_inline void
@@ -2139,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
+const struct bpf_func_proto bpf_jiffies64_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef49e17ae47c..70f71b154fa5 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -72,17 +72,18 @@ struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
struct bpf_cpu_map_entry **cpu_map;
- struct list_head __percpu *flush_list;
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
+static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
+
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
- int ret, cpu;
u64 cost;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
- cost += sizeof(struct list_head) * num_possible_cpus();
/* Notice returns -EPERM on if map size is larger than memlock limit */
ret = bpf_map_charge_init(&cmap->map.memory, cost);
@@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
goto free_cmap;
}
- cmap->flush_list = alloc_percpu(struct list_head);
- if (!cmap->flush_list)
- goto free_charge;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
if (!cmap->cpu_map)
- goto free_percpu;
+ goto free_charge;
return &cmap->map;
-free_percpu:
- free_percpu(cmap->flush_list);
free_charge:
bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
@@ -399,22 +390,14 @@ free_rcu:
static void __cpu_map_entry_free(struct rcu_head *rcu)
{
struct bpf_cpu_map_entry *rcpu;
- int cpu;
/* This cpu_map_entry have been disconnected from map and one
- * RCU graze-period have elapsed. Thus, XDP cannot queue any
+ * RCU grace-period have elapsed. Thus, XDP cannot queue any
* new packets and cannot change/set flush_needed that can
* find this entry.
*/
rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
- /* Flush remaining packets in percpu bulkq */
- for_each_online_cpu(cpu) {
- struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
-
- /* No concurrent bq_enqueue can run at this point */
- bq_flush_to_queue(bq, false);
- }
free_percpu(rcpu->bulkq);
/* Cannot kthread_stop() here, last put free rcpu resources */
put_cpu_map_entry(rcpu);
@@ -436,7 +419,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
* percpu bulkq to queue. Due to caller map_delete_elem() disable
* preemption, cannot call kthread_stop() to make sure queue is empty.
* Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * cpu_map_kthread_stop, which waits for an RCU grace period before
* stopping kthread, emptying the queue.
*/
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
@@ -507,7 +490,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
static void cpu_map_free(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- int cpu;
u32 i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
@@ -522,18 +504,6 @@ static void cpu_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_rcu();
- /* To ensure all pending flush operations have completed wait for flush
- * list be empty on _all_ cpus. Because the above synchronize_rcu()
- * ensures the map is disconnected from the program we can assume no new
- * items will be added to the list.
- */
- for_each_online_cpu(cpu) {
- struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
-
- while (!list_empty(flush_list))
- cond_resched();
- }
-
/* For cpu_map the remote CPUs can still be using the entries
* (struct bpf_cpu_map_entry).
*/
@@ -544,10 +514,9 @@ static void cpu_map_free(struct bpf_map *map)
if (!rcpu)
continue;
- /* bq flush and cleanup happens after RCU graze-period */
+ /* bq flush and cleanup happens after RCU grace-period */
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
- free_percpu(cmap->flush_list);
bpf_map_area_free(cmap->cpu_map);
kfree(cmap);
}
@@ -599,7 +568,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
@@ -620,10 +589,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
err = __ptr_ring_produce(q, xdpf);
if (err) {
drops++;
- if (likely(in_napi_ctx))
- xdp_return_frame_rx_napi(xdpf);
- else
- xdp_return_frame(xdpf);
+ xdp_return_frame_rx_napi(xdpf);
}
processed++;
}
@@ -642,11 +608,11 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
*/
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
- bq_flush_to_queue(bq, true);
+ bq_flush_to_queue(bq);
/* Notice, xdp_buff/page MUST be queued here, long enough for
* driver to code invoking us to finished, due to driver
@@ -681,16 +647,26 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
-void __cpu_map_flush(struct bpf_map *map)
+void __cpu_map_flush(void)
{
- struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
- bq_flush_to_queue(bq, true);
+ bq_flush_to_queue(bq);
/* If already running, costs spin_lock_irqsave + smb_mb */
wake_up_process(bq->obj->kthread);
}
}
+
+/* Initialize the per-CPU cpu_map_flush_list list head of every possible CPU.
+ * Registered through the subsys_initcall() that follows this function.
+ * Always returns 0.
+ */
+static int __init cpu_map_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
+ return 0;
+}
+
+subsys_initcall(cpu_map_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 3d3d61b5985b..58bdca5d978a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -53,13 +53,11 @@
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
#define DEV_MAP_BULK_SIZE 16
-struct bpf_dtab_netdev;
-
-struct xdp_bulk_queue {
+struct xdp_dev_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
struct list_head flush_node;
+ struct net_device *dev;
struct net_device *dev_rx;
- struct bpf_dtab_netdev *obj;
unsigned int count;
};
@@ -67,15 +65,13 @@ struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
struct bpf_dtab *dtab;
- struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
- unsigned int idx; /* keep track of map index for tracepoint */
+ unsigned int idx;
};
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
- struct list_head __percpu *flush_list;
struct list_head list;
/* these are only used for DEVMAP_HASH type maps */
@@ -85,6 +81,7 @@ struct bpf_dtab {
u32 n_buckets;
};
+static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -109,8 +106,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
- int err, cpu;
- u64 cost;
+ u64 cost = 0;
+ int err;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -125,9 +122,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
bpf_map_init_from_attr(&dtab->map, attr);
- /* make sure page count doesn't overflow */
- cost = (u64) sizeof(struct list_head) * num_possible_cpus();
-
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
@@ -143,17 +137,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
if (err)
return -EINVAL;
- dtab->flush_list = alloc_percpu(struct list_head);
- if (!dtab->flush_list)
- goto free_charge;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
-
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
if (!dtab->dev_index_head)
- goto free_percpu;
+ goto free_charge;
spin_lock_init(&dtab->index_lock);
} else {
@@ -161,13 +148,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
- goto free_percpu;
+ goto free_charge;
}
return 0;
-free_percpu:
- free_percpu(dtab->flush_list);
free_charge:
bpf_map_charge_finish(&dtab->map.memory);
return -ENOMEM;
@@ -201,14 +186,16 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i, cpu;
+ int i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further reads against netdev_map. It does __not__ ensure pending
- * flush operations (if any) are complete.
+ * disconnected from events. The following synchronize_rcu() guarantees
+ * both rcu read critical sections complete and waits for
+ * preempt-disable regions (NAPI being the relevant context here) so we
+ * are certain there will be no further reads against the netdev_map and
+ * all flush operations are complete. Flush operations can only be done
+ * from NAPI context for this reason.
*/
spin_lock(&dev_map_lock);
@@ -221,18 +208,6 @@ static void dev_map_free(struct bpf_map *map)
/* Make sure prior __dev_map_entry_free() have completed. */
rcu_barrier();
- /* To ensure all pending flush operations have completed wait for flush
- * list to empty on _all_ cpus.
- * Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new items will be added.
- */
- for_each_online_cpu(cpu) {
- struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
-
- while (!list_empty(flush_list))
- cond_resched();
- }
-
if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
for (i = 0; i < dtab->n_buckets; i++) {
struct bpf_dtab_netdev *dev;
@@ -243,7 +218,6 @@ static void dev_map_free(struct bpf_map *map)
hlist_for_each_entry_safe(dev, next, head, index_hlist) {
hlist_del_rcu(&dev->index_hlist);
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -258,7 +232,6 @@ static void dev_map_free(struct bpf_map *map)
if (!dev)
continue;
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -266,7 +239,6 @@ static void dev_map_free(struct bpf_map *map)
bpf_map_area_free(dtab->netdev_map);
}
- free_percpu(dtab->flush_list);
kfree(dtab);
}
@@ -293,7 +265,8 @@ struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
struct hlist_head *head = dev_map_index_hash(dtab, key);
struct bpf_dtab_netdev *dev;
- hlist_for_each_entry_rcu(dev, head, index_hlist)
+ hlist_for_each_entry_rcu(dev, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock))
if (dev->idx == key)
return dev;
@@ -345,11 +318,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
-static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
- bool in_napi_ctx)
+static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
- struct bpf_dtab_netdev *obj = bq->obj;
- struct net_device *dev = obj->dev;
+ struct net_device *dev = bq->dev;
int sent = 0, drops = 0, err = 0;
int i;
@@ -372,8 +343,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
out:
bq->count = 0;
- trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx,
- sent, drops, bq->dev_rx, dev, err);
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
bq->dev_rx = NULL;
__list_del_clearprev(&bq->flush_node);
return 0;
@@ -384,33 +354,29 @@ error:
for (i = 0; i < bq->count; i++) {
struct xdp_frame *xdpf = bq->q[i];
- /* RX path under NAPI protection, can return frames faster */
- if (likely(in_napi_ctx))
- xdp_return_frame_rx_napi(xdpf);
- else
- xdp_return_frame(xdpf);
+ xdp_return_frame_rx_napi(xdpf);
drops++;
}
goto out;
}
-/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
* net device can be torn down. On devmap tear down we ensure the flush list
* is empty before completing to ensure all flush operations have completed.
+ * When drivers update the bpf program they may need to ensure any flush ops
+ * are also complete. Using synchronize_rcu or call_rcu will suffice for this
+ * because both wait for napi context to exit.
*/
-void __dev_map_flush(struct bpf_map *map)
+void __dev_flush(void)
{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
- struct xdp_bulk_queue *bq, *tmp;
+ struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct xdp_dev_bulk_queue *bq, *tmp;
- rcu_read_lock();
list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
- bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
- rcu_read_unlock();
+ bq_xmit_all(bq, XDP_XMIT_FLUSH);
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -432,15 +398,14 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
-
{
- struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
- struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+ struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
- bq_xmit_all(bq, 0, true);
+ bq_xmit_all(bq, 0);
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
@@ -457,10 +422,9 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
return 0;
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
{
- struct net_device *dev = dst->dev;
struct xdp_frame *xdpf;
int err;
@@ -475,7 +439,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
if (unlikely(!xdpf))
return -EOVERFLOW;
- return bq_enqueue(dst, xdpf, dev_rx);
+ return bq_enqueue(dev, xdpf, dev_rx);
+}
+
+/* Enqueue @xdp for transmission on @dev, where the caller supplies the
+ * target net_device directly. Thin wrapper: forwards all arguments to
+ * __xdp_enqueue() and returns its result unchanged.
+ */
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ return __xdp_enqueue(dev, xdp, dev_rx);
+}
+
+/* Enqueue @xdp for transmission on the net_device held by the devmap
+ * entry @dst. Returns whatever __xdp_enqueue() returns.
+ */
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
+{
+	return __xdp_enqueue(dst->dev, xdp, dev_rx);
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -509,28 +487,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
return dev ? &dev->ifindex : NULL;
}
-static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
-{
- if (dev->dev->netdev_ops->ndo_xdp_xmit) {
- struct xdp_bulk_queue *bq;
- int cpu;
-
- rcu_read_lock();
- for_each_online_cpu(cpu) {
- bq = per_cpu_ptr(dev->bulkq, cpu);
- bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
- }
- rcu_read_unlock();
- }
-}
-
static void __dev_map_entry_free(struct rcu_head *rcu)
{
struct bpf_dtab_netdev *dev;
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
- dev_map_flush_old(dev);
- free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -545,12 +506,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
/* Use call_rcu() here to ensure any rcu critical sections have
- * completed, but this does not guarantee a flush has happened
- * yet. Because driver side rcu_read_lock/unlock only protects the
- * running XDP program. However, for pending flush operations the
- * dev and ctx are stored in another per cpu map. And additionally,
- * the driver tear down ensures all soft irqs are complete before
- * removing the net device in the case of dev_put equals zero.
+ * completed as well as any flush operations because call_rcu
+ * will wait for preempt-disable region to complete, NAPI in this
+ * context. And additionally, the driver tear down ensures all
+ * soft irqs are complete before removing the net device in the
+ * case of dev_put equals zero.
*/
old_dev = xchg(&dtab->netdev_map[k], NULL);
if (old_dev)
@@ -585,30 +545,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
u32 ifindex,
unsigned int idx)
{
- gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev;
- struct xdp_bulk_queue *bq;
- int cpu;
- dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+ dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+ dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
- dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
- sizeof(void *), gfp);
- if (!dev->bulkq) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
- for_each_possible_cpu(cpu) {
- bq = per_cpu_ptr(dev->bulkq, cpu);
- bq->obj = dev;
- }
-
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
- free_percpu(dev->bulkq);
kfree(dev);
return ERR_PTR(-EINVAL);
}
@@ -768,9 +713,23 @@ static int dev_map_notification(struct notifier_block *notifier,
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
struct bpf_dtab *dtab;
- int i;
+ int i, cpu;
switch (event) {
+ case NETDEV_REGISTER:
+ if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
+ break;
+
+ /* will be freed in free_netdev() */
+ netdev->xdp_bulkq =
+ __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
+ sizeof(void *), GFP_ATOMIC);
+ if (!netdev->xdp_bulkq)
+ return NOTIFY_BAD;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+ break;
case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because
* dev_map_list is an RCU list AND to ensure a delete
@@ -810,10 +769,15 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
+ int cpu;
+
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
new file mode 100644
index 000000000000..b3e5b214fed8
--- /dev/null
+++ b/kernel/bpf/dispatcher.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2019 Intel Corporation. */
+
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/* The BPF dispatcher is a multiway branch code generator. The
+ * dispatcher is a mechanism to avoid the performance penalty of an
+ * indirect call, which is expensive when retpolines are enabled. A
+ * dispatch client registers a BPF program into the dispatcher, and if
+ * there is available room in the dispatcher a direct call to the BPF
+ * program will be generated. All calls to the BPF programs called via
+ * the dispatcher will then be a direct call, instead of an
+ * indirect. The dispatcher hijacks a trampoline function via the
+ * __fentry__ of the trampoline. The trampoline function has the
+ * following signature:
+ *
+ * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi,
+ * unsigned int (*bpf_func)(const void *,
+ * const struct bpf_insn *));
+ */
+
+/* Scan the dispatcher's slot table for the slot whose ->prog equals @prog.
+ * Because the comparison is a plain pointer match, passing NULL locates the
+ * first slot with no program installed.
+ *
+ * Returns the matching slot, or NULL when none of the BPF_DISPATCHER_MAX
+ * slots matches.
+ */
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog(
+	struct bpf_dispatcher *d, struct bpf_prog *prog)
+{
+	struct bpf_dispatcher_prog *slot = &d->progs[0];
+	struct bpf_dispatcher_prog *end = slot + BPF_DISPATCHER_MAX;
+
+	while (slot != end) {
+		if (slot->prog == prog)
+			return slot;
+		slot++;
+	}
+	return NULL;
+}
+
+/* Return the first slot whose ->prog is NULL (implemented as a lookup for
+ * a NULL program), or NULL if bpf_dispatcher_find_prog() finds no such slot.
+ */
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_free(
+ struct bpf_dispatcher *d)
+{
+ return bpf_dispatcher_find_prog(d, NULL);
+}
+
+/* Register @prog with dispatcher @d.
+ *
+ * - NULL @prog: nothing to do.
+ * - @prog already has a slot: only its user count is bumped.
+ * - otherwise: claim a free slot, take a reference on @prog and count it
+ *   in d->num_progs.
+ *
+ * Returns true only when a new slot was populated, i.e. when the set of
+ * dispatched programs changed; false in every other case, including when
+ * no free slot is left.
+ */
+static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d,
+				    struct bpf_prog *prog)
+{
+	struct bpf_dispatcher_prog *slot;
+
+	if (!prog)
+		return false;
+
+	slot = bpf_dispatcher_find_prog(d, prog);
+	if (slot) {
+		/* Already dispatched: just account for the extra user. */
+		refcount_inc(&slot->users);
+		return false;
+	}
+
+	slot = bpf_dispatcher_find_free(d);
+	if (slot) {
+		bpf_prog_inc(prog);
+		slot->prog = prog;
+		refcount_set(&slot->users, 1);
+		d->num_progs++;
+	}
+	return slot != NULL;
+}
+
+/* Drop one user of @prog from dispatcher @d.
+ *
+ * When the last user goes away the slot is cleared, the program reference
+ * taken at add time is released and d->num_progs is decremented.
+ *
+ * Returns true only when the slot was actually released; false when @prog
+ * is NULL, is not registered, or still has other users.
+ */
+static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
+				       struct bpf_prog *prog)
+{
+	struct bpf_dispatcher_prog *slot;
+
+	slot = prog ? bpf_dispatcher_find_prog(d, prog) : NULL;
+	if (!slot)
+		return false;
+
+	if (!refcount_dec_and_test(&slot->users))
+		return false;
+
+	slot->prog = NULL;
+	bpf_prog_put(prog);
+	d->num_progs--;
+	return true;
+}
+
+/* Default (weak) implementation used when no other definition of this
+ * symbol is linked in: reports that dispatcher image generation is not
+ * supported by returning -ENOTSUPP. All arguments are ignored here.
+ */
+int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+{
+ return -ENOTSUPP;
+}
+
+/* Collect the bpf_func address of every populated slot into a dense,
+ * zero-initialised array and hand it to arch_prepare_bpf_dispatcher()
+ * together with d->num_progs so the dispatcher code can be generated into
+ * @image. Returns the arch hook's result.
+ */
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+{
+	s64 ips[BPF_DISPATCHER_MAX] = {};
+	int i, n = 0;
+
+	for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+		struct bpf_prog *prog = d->progs[i].prog;
+
+		if (!prog)
+			continue;
+		ips[n++] = (s64)(uintptr_t)prog->bpf_func;
+	}
+	return arch_prepare_bpf_dispatcher(image, ips, d->num_progs);
+}
+
+/* Regenerate the dispatcher code for the current set of programs and patch
+ * d->func to jump to it.
+ *
+ * @prev_num_progs is the program count before the change that triggered
+ * this update; zero means no image was installed yet, so there is no old
+ * jump target to replace.
+ */
+static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
+{
+ void *old, *new;
+ u32 noff;
+ int err;
+
+ if (!prev_num_progs) {
+ old = NULL;
+ noff = 0;
+ } else {
+ old = d->image + d->image_off;
+ /* XOR toggles the BPF_IMAGE_SIZE / 2 bit, selecting the other half
+ * of the image (assuming image_off is only ever 0 or
+ * BPF_IMAGE_SIZE / 2 -- TODO confirm against the initialiser).
+ */
+ noff = d->image_off ^ (BPF_IMAGE_SIZE / 2);
+ }
+
+ /* No programs left: patch the jump away (new == NULL). */
+ new = d->num_progs ? d->image + noff : NULL;
+ if (new) {
+ /* Leave the currently installed image untouched on failure. */
+ if (bpf_dispatcher_prepare(d, new))
+ return;
+ }
+
+ err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
+ if (err || !new)
+ return;
+
+ /* Only record the new offset once the jump really points at it. */
+ d->image_off = noff;
+}
+
+/* Replace program @from with program @to in dispatcher @d.
+ *
+ * Either pointer may be NULL: the remove/add helpers return false for a
+ * NULL program, so this also serves as a pure add or a pure remove.
+ * Nothing happens when @from == @to. The whole operation runs under
+ * d->mutex, and the dispatcher code is only regenerated when the set of
+ * populated slots actually changed.
+ */
+void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
+ struct bpf_prog *to)
+{
+ bool changed = false;
+ int prev_num_progs;
+
+ if (from == to)
+ return;
+
+ mutex_lock(&d->mutex);
+ /* The image is allocated lazily on first use. */
+ if (!d->image) {
+ d->image = bpf_image_alloc();
+ if (!d->image)
+ goto out;
+ }
+
+ /* Snapshot the count first: bpf_dispatcher_update() uses it to tell
+ * whether an old image is currently installed.
+ */
+ prev_num_progs = d->num_progs;
+ changed |= bpf_dispatcher_remove_prog(d, from);
+ changed |= bpf_dispatcher_add_prog(d, to);
+
+ if (!changed)
+ goto out;
+
+ bpf_dispatcher_update(d, prev_num_progs);
+out:
+ mutex_unlock(&d->mutex);
+}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..2d182c4ee9d9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,6 +17,16 @@
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
+/* Designated-initialiser fragment for struct bpf_map_ops that wires up the
+ * four batch callbacks. Lookup and lookup-and-delete are taken from the
+ * <_name>_map_lookup_batch / <_name>_map_lookup_and_delete_batch functions
+ * (token-pasted from @_name); update and delete use the generic_map_*
+ * implementations. No trailing comma: the user writes "BATCH_OPS(x),".
+ */
+#define BATCH_OPS(_name) \
+ .map_lookup_batch = \
+ _name##_map_lookup_batch, \
+ .map_lookup_and_delete_batch = \
+ _name##_map_lookup_and_delete_batch, \
+ .map_update_batch = \
+ generic_map_update_batch, \
+ .map_delete_batch = \
+ generic_map_delete_batch
+
struct bucket {
struct hlist_nulls_head head;
raw_spinlock_t lock;
@@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+/* Common implementation behind the .map_lookup_batch and
+ * .map_lookup_and_delete_batch callbacks of the hash map flavours.
+ *
+ * Starting at the bucket index read from attr->batch.in_batch (bucket 0
+ * when that pointer is NULL), buckets are processed one at a time: the
+ * bucket's elements are copied into kernel staging buffers while the
+ * bucket lock is held, and only after the lock and the RCU read section
+ * are dropped are they copied out to attr->batch.keys / attr->batch.values.
+ * A bucket is always returned completely or not at all.
+ *
+ * @do_delete:  also unlink and free every element that was copied.
+ * @is_lru_map: deleted elements go back to the LRU free list instead of
+ *              being freed with free_htab_elem().
+ * @is_percpu:  values are per-CPU; each element yields one 8-byte-rounded
+ *              value per possible CPU.
+ *
+ * On exit (except for -EFAULT and -EINVAL) the index of the next bucket to
+ * visit is written to attr->batch.out_batch and the number of elements
+ * copied to uattr->batch.count.
+ *
+ * Returns 0 when the user buffer filled up after at least one element was
+ * returned (or when attr->batch.count is 0), -ENOENT when the walk moved
+ * past the last bucket, -ENOSPC when not even the first visited bucket
+ * fits, -EINVAL on bad flags, -ENOMEM when the staging buffers cannot be
+ * allocated, and -EFAULT on a failed user copy.
+ */
+static int
+__htab_map_lookup_and_delete_batch(struct bpf_map *map,
+				   const union bpf_attr *attr,
+				   union bpf_attr __user *uattr,
+				   bool do_delete, bool is_lru_map,
+				   bool is_percpu)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
+	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
+	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+	u32 batch, max_count, size, bucket_size;
+	u64 elem_map_flags, map_flags;
+	struct hlist_nulls_head *head;
+	struct hlist_nulls_node *n;
+	unsigned long flags;
+	struct htab_elem *l;
+	struct bucket *b;
+	int ret = 0;
+
+	/* BPF_F_LOCK is the only element flag accepted, and only for maps
+	 * whose value actually embeds a spin lock.
+	 */
+	elem_map_flags = attr->batch.elem_flags;
+	if ((elem_map_flags & ~BPF_F_LOCK) ||
+	    ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+		return -EINVAL;
+
+	map_flags = attr->batch.flags;
+	if (map_flags)
+		return -EINVAL;
+
+	max_count = attr->batch.count;
+	if (!max_count)
+		return 0;
+
+	/* Make sure user space never sees a stale count on early errors. */
+	if (put_user(0, &uattr->batch.count))
+		return -EFAULT;
+
+	batch = 0;
+	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
+		return -EFAULT;
+
+	if (batch >= htab->n_buckets)
+		return -ENOENT;
+
+	key_size = htab->map.key_size;
+	roundup_key_size = round_up(htab->map.key_size, 8);
+	value_size = htab->map.value_size;
+	size = round_up(value_size, 8);
+	if (is_percpu)
+		value_size = size * num_possible_cpus();
+	total = 0;
+	/* while experimenting with hash tables with sizes ranging from 10 to
+	 * 1000, it was observed that a bucket can have up to 5 entries.
+	 */
+	bucket_size = 5;
+
+alloc:
+	/* We cannot do copy_from_user or copy_to_user inside
+	 * the rcu_read_lock. Allocate enough space here.
+	 *
+	 * kvmalloc_array() is used so that an overflowing
+	 * element-size * bucket_size product fails the allocation instead
+	 * of silently wrapping in 32 bits and yielding a buffer that the
+	 * copy loop below would overrun.
+	 */
+	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
+	values = kvmalloc_array(value_size, bucket_size,
+				GFP_USER | __GFP_NOWARN);
+	if (!keys || !values) {
+		ret = -ENOMEM;
+		goto after_loop;
+	}
+
+again:
+	preempt_disable();
+	this_cpu_inc(bpf_prog_active);
+	rcu_read_lock();
+again_nocopy:
+	dst_key = keys;
+	dst_val = values;
+	b = &htab->buckets[batch];
+	head = &b->head;
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	/* First pass: count, so we know whether the bucket fits. */
+	bucket_cnt = 0;
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+		bucket_cnt++;
+
+	/* Not enough room left in the user buffers for this whole bucket:
+	 * stop here. Only an error if nothing at all has been returned.
+	 */
+	if (bucket_cnt > (max_count - total)) {
+		if (total == 0)
+			ret = -ENOSPC;
+		raw_spin_unlock_irqrestore(&b->lock, flags);
+		rcu_read_unlock();
+		this_cpu_dec(bpf_prog_active);
+		preempt_enable();
+		goto after_loop;
+	}
+
+	/* Staging buffers too small for this bucket: drop every lock, grow
+	 * them to the observed size and retry the same bucket.
+	 */
+	if (bucket_cnt > bucket_size) {
+		bucket_size = bucket_cnt;
+		raw_spin_unlock_irqrestore(&b->lock, flags);
+		rcu_read_unlock();
+		this_cpu_dec(bpf_prog_active);
+		preempt_enable();
+		kvfree(keys);
+		kvfree(values);
+		goto alloc;
+	}
+
+	/* Second pass: copy (and optionally delete) under the bucket lock. */
+	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+		memcpy(dst_key, l->key, key_size);
+
+		if (is_percpu) {
+			int off = 0, cpu;
+			void __percpu *pptr;
+
+			pptr = htab_elem_get_ptr(l, map->key_size);
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(dst_val + off,
+						per_cpu_ptr(pptr, cpu), size);
+				off += size;
+			}
+		} else {
+			value = l->key + roundup_key_size;
+			if (elem_map_flags & BPF_F_LOCK)
+				copy_map_value_locked(map, dst_val, value,
+						      true);
+			else
+				copy_map_value(map, dst_val, value);
+			check_and_init_map_lock(map, dst_val);
+		}
+		if (do_delete) {
+			hlist_nulls_del_rcu(&l->hash_node);
+			if (is_lru_map)
+				bpf_lru_push_free(&htab->lru, &l->lru_node);
+			else
+				free_htab_elem(htab, l);
+		}
+		dst_key += key_size;
+		dst_val += value_size;
+	}
+
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	/* If we are not copying data, we can go to next bucket and avoid
+	 * unlocking the rcu.
+	 */
+	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
+		batch++;
+		goto again_nocopy;
+	}
+
+	rcu_read_unlock();
+	this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
+	    key_size * bucket_cnt) ||
+	    copy_to_user(uvalues + total * value_size, values,
+	    value_size * bucket_cnt))) {
+		ret = -EFAULT;
+		goto after_loop;
+	}
+
+	total += bucket_cnt;
+	batch++;
+	if (batch >= htab->n_buckets) {
+		ret = -ENOENT;
+		goto after_loop;
+	}
+	goto again;
+
+after_loop:
+	if (ret == -EFAULT)
+		goto out;
+
+	/* copy # of entries and next batch */
+	ubatch = u64_to_user_ptr(attr->batch.out_batch);
+	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
+	    put_user(total, &uattr->batch.count))
+		ret = -EFAULT;
+
+out:
+	kvfree(keys);
+	kvfree(values);
+	return ret;
+}
+
+/* Batch lookup for per-CPU hash maps: do_delete=false, is_lru_map=false,
+ * is_percpu=true.
+ */
+static int
+htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, true);
+}
+
+/* Batch lookup-and-delete for per-CPU hash maps: do_delete=true,
+ * is_lru_map=false, is_percpu=true.
+ */
+static int
+htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, true);
+}
+
+/* Batch lookup for plain hash maps: do_delete=false, is_lru_map=false,
+ * is_percpu=false.
+ */
+static int
+htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, false);
+}
+
+/* Batch lookup-and-delete for plain hash maps: do_delete=true,
+ * is_lru_map=false, is_percpu=false.
+ */
+static int
+htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, false);
+}
+
+/* Batch lookup for LRU per-CPU hash maps: do_delete=false, is_lru_map=true,
+ * is_percpu=true.
+ */
+static int
+htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, true);
+}
+
+/* Batch lookup-and-delete for LRU per-CPU hash maps: do_delete=true,
+ * is_lru_map=true, is_percpu=true.
+ */
+static int
+htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, true);
+}
+
+/* Batch lookup for LRU hash maps: do_delete=false, is_lru_map=true,
+ * is_percpu=false.
+ */
+static int
+htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, false);
+}
+
+/* Batch lookup-and-delete for LRU hash maps: do_delete=true,
+ * is_lru_map=true, is_percpu=false.
+ */
+static int
+htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, false);
+}
+
const struct bpf_map_ops htab_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab),
};
const struct bpf_map_ops htab_lru_map_ops = {
@@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ BATCH_OPS(htab_lru),
};
/* Called from eBPF program */
@@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_percpu),
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
@@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ BATCH_OPS(htab_lru_percpu),
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index cada974c9f4e..d8b7b110a1c5 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -11,6 +11,7 @@
#include <linux/uidgid.h>
#include <linux/filter.h>
#include <linux/ctype.h>
+#include <linux/jiffies.h>
#include "../../lib/kstrtox.h"
@@ -312,6 +313,17 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
preempt_enable();
}
+/* Zero-argument BPF helper: returns the value of get_jiffies_64(). */
+BPF_CALL_0(bpf_jiffies64)
+{
+ return get_jiffies_64();
+}
+
+/* Descriptor for the bpf_jiffies64 helper: not restricted to GPL programs,
+ * declares no argument types, and returns an integer.
+ */
+const struct bpf_func_proto bpf_jiffies64_proto = {
+ .func = bpf_jiffies64,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index ecf42bec38c0..bd2fd8eab470 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -196,6 +196,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
void *key = map_iter(m)->key;
void *prev_key;
+ (*pos)++;
if (map_iter(m)->done)
return NULL;
@@ -208,8 +209,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
map_iter(m)->done = true;
return NULL;
}
-
- ++(*pos);
return key;
}
@@ -380,7 +379,7 @@ static const struct inode_operations bpf_dir_iops = {
.unlink = simple_unlink,
};
-static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+static int bpf_obj_do_pin(const char __user *pathname, void *raw,
enum bpf_type type)
{
struct dentry *dentry;
@@ -389,7 +388,7 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
umode_t mode;
int ret;
- dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+ dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -422,30 +421,22 @@ out:
int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
{
- struct filename *pname;
enum bpf_type type;
void *raw;
int ret;
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
-
raw = bpf_fd_probe_obj(ufd, &type);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
- ret = bpf_obj_do_pin(pname, raw, type);
+ ret = bpf_obj_do_pin(pathname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
-out:
- putname(pname);
+
return ret;
}
-static void *bpf_obj_do_get(const struct filename *pathname,
+static void *bpf_obj_do_get(const char __user *pathname,
enum bpf_type *type, int flags)
{
struct inode *inode;
@@ -453,7 +444,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
void *raw;
int ret;
- ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+ ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
if (ret)
return ERR_PTR(ret);
@@ -480,36 +471,27 @@ out:
int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
- struct filename *pname;
- int ret = -ENOENT;
int f_flags;
void *raw;
+ int ret;
f_flags = bpf_get_file_flag(flags);
if (f_flags < 0)
return f_flags;
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
-
- raw = bpf_obj_do_get(pname, &type, f_flags);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
+ raw = bpf_obj_do_get(pathname, &type, f_flags);
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
ret = bpf_map_new_fd(raw, f_flags);
else
- goto out;
+ return -ENOENT;
if (ret < 0)
bpf_any_put(raw, type);
-out:
- putname(pname);
return ret;
}
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 5e9366b33f0f..b3c48d1533cb 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
*/
if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ||
+ inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
fdput(f);
return ERR_PTR(-ENOTSUPP);
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e3461ec59570..a91ad518c050 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,7 @@
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
+#include <linux/audit.h>
#include <uapi/linux/btf.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -128,6 +129,152 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
+/* Size in bytes of the buffer user space exchanges for one value of @map:
+ *  - per-CPU map types: the 8-byte-rounded value size times the number of
+ *    possible CPUs;
+ *  - fd-based maps (IS_FD_MAP): a u32;
+ *  - everything else: map->value_size.
+ */
+static u32 bpf_map_value_size(struct bpf_map *map)
+{
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_PERCPU_HASH:
+	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+	case BPF_MAP_TYPE_PERCPU_ARRAY:
+	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
+		return round_up(map->value_size, 8) * num_possible_cpus();
+	default:
+		break;
+	}
+
+	return IS_FD_MAP(map) ? sizeof(u32) : map->value_size;
+}
+
+/* For map-in-map types (hash-of-maps, array-of-maps) block until an RCU
+ * grace period has elapsed, so that by the time we return to user space
+ * every BPF program that could still be running observes the new map
+ * value. No-op for all other map types.
+ */
+static void maybe_wait_bpf_programs(struct bpf_map *map)
+{
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_HASH_OF_MAPS:
+	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+		synchronize_rcu();
+		break;
+	default:
+		break;
+	}
+}
+
+/* Store @value under @key in @map, dispatching on the map type.
+ *
+ * @f is only consulted (f.file) by the fd-array / fd-htab update paths.
+ * @flags is passed through to the per-type update routine.
+ *
+ * Device-bound maps, CPUMAP, SOCKHASH, SOCKMAP, STRUCT_OPS and prog-array
+ * maps are updated first and return directly, without disabling
+ * preemption. All remaining types are updated with preemption disabled
+ * and bpf_prog_active raised, followed by maybe_wait_bpf_programs().
+ *
+ * Returns the result of the selected update routine.
+ */
+static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
+ void *value, __u64 flags)
+{
+ int err;
+
+ /* Need to create a kthread, thus must support schedule */
+ if (bpf_map_is_dev_bound(map)) {
+ return bpf_map_offload_update_elem(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+ map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ return map->ops->map_update_elem(map, key, value, flags);
+ } else if (IS_FD_PROG_ARRAY(map)) {
+ return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ flags);
+ }
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+ * inside bpf map update or delete otherwise deadlocks are possible
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_update(map, key, value,
+ flags);
+ } else if (IS_FD_ARRAY(map)) {
+ rcu_read_lock();
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ flags);
+ rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ rcu_read_lock();
+ err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ flags);
+ rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ /* rcu_read_lock() is not needed */
+ err = bpf_fd_reuseport_array_update_elem(map, key, value,
+ flags);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ /* Queue/stack maps have no key: the value is pushed. */
+ err = map->ops->map_push_elem(map, value, flags);
+ } else {
+ /* Generic path: the map's own update callback, under RCU. */
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, flags);
+ rcu_read_unlock();
+ }
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+
+ return err;
+}
+
+/* Copy the value stored under @key in @map into the kernel buffer @value,
+ * dispatching on the map type.
+ *
+ * @flags is only examined on the generic path, where BPF_F_LOCK selects a
+ * copy taken under the value's spin lock.
+ *
+ * Device-bound maps are handled first and return directly. Every other
+ * type is read with preemption disabled and bpf_prog_active raised.
+ *
+ * Returns 0 on success or the error from the selected lookup routine; the
+ * generic path returns -ENOENT when the lookup callback yields NULL and
+ * PTR_ERR() when it yields an error pointer.
+ *
+ * NOTE(review): maybe_wait_bpf_programs() is also called on this read
+ * path, so map-in-map lookups incur a synchronize_rcu(); confirm that is
+ * intended.
+ */
+static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
+ __u64 flags)
+{
+ void *ptr;
+ int err;
+
+ if (bpf_map_is_dev_bound(map))
+ return bpf_map_offload_lookup_elem(map, key, value);
+
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ err = bpf_stackmap_copy(map, key, value);
+ } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
+ } else if (IS_FD_HASH(map)) {
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ /* Queue/stack maps have no key: peek at the next element. */
+ err = map->ops->map_peek_elem(map, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* struct_ops map requires directly updating "value" */
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
+ } else {
+ rcu_read_lock();
+ /* Prefer the syscall-only lookup callback when the map has one. */
+ if (map->ops->map_lookup_elem_sys_only)
+ ptr = map->ops->map_lookup_elem_sys_only(map, key);
+ else
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ } else if (!ptr) {
+ err = -ENOENT;
+ } else {
+ err = 0;
+ if (flags & BPF_F_LOCK)
+ /* lock 'ptr' and copy everything but lock */
+ copy_map_value_locked(map, value, ptr, true);
+ else
+ copy_map_value(map, value, ptr);
+ /* mask lock, since value wasn't zero inited */
+ check_and_init_map_lock(map, value);
+ }
+ rcu_read_unlock();
+ }
+
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ maybe_wait_bpf_programs(map);
+
+ return err;
+}
+
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
/* We really just want to fail instead of triggering OOM killer
@@ -627,7 +774,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
+#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -641,6 +788,14 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ if (attr->btf_vmlinux_value_type_id) {
+ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
+ attr->btf_key_type_id || attr->btf_value_type_id)
+ return -EINVAL;
+ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
+ return -EINVAL;
+ }
+
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
@@ -663,32 +818,35 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
- if (attr->btf_key_type_id || attr->btf_value_type_id) {
+ map->spin_lock_off = -EINVAL;
+ if (attr->btf_key_type_id || attr->btf_value_type_id ||
+ /* Even if the map's value is a kernel struct,
+ * the bpf_prog.o must have BTF to begin with
+ * to figure out the corresponding kernel
+ * counterpart. Thus, attr->btf_fd has
+ * to be valid also.
+ */
+ */
+ attr->btf_vmlinux_value_type_id) {
struct btf *btf;
- if (!attr->btf_value_type_id) {
- err = -EINVAL;
- goto free_map;
- }
-
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
goto free_map;
}
+ map->btf = btf;
- err = map_check_btf(map, btf, attr->btf_key_type_id,
- attr->btf_value_type_id);
- if (err) {
- btf_put(btf);
- goto free_map;
+ if (attr->btf_value_type_id) {
+ err = map_check_btf(map, btf, attr->btf_key_type_id,
+ attr->btf_value_type_id);
+ if (err)
+ goto free_map;
}
- map->btf = btf;
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
- } else {
- map->spin_lock_off = -EINVAL;
+ map->btf_vmlinux_value_type_id =
+ attr->btf_vmlinux_value_type_id;
}
err = security_bpf_map_alloc(map);
@@ -815,7 +973,7 @@ static int map_lookup_elem(union bpf_attr *attr)
void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
- void *key, *value, *ptr;
+ void *key, *value;
u32 value_size;
struct fd f;
int err;
@@ -847,72 +1005,14 @@ static int map_lookup_elem(union bpf_attr *attr)
goto err_put;
}
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
- else if (IS_FD_MAP(map))
- value_size = sizeof(u32);
- else
- value_size = map->value_size;
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
- err = bpf_map_offload_lookup_elem(map, key, value);
- goto done;
- }
-
- preempt_disable();
- this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
- err = bpf_stackmap_copy(map, key, value);
- } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
- err = bpf_fd_array_map_lookup_elem(map, key, value);
- } else if (IS_FD_HASH(map)) {
- err = bpf_fd_htab_map_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
- err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
- err = map->ops->map_peek_elem(map, value);
- } else {
- rcu_read_lock();
- if (map->ops->map_lookup_elem_sys_only)
- ptr = map->ops->map_lookup_elem_sys_only(map, key);
- else
- ptr = map->ops->map_lookup_elem(map, key);
- if (IS_ERR(ptr)) {
- err = PTR_ERR(ptr);
- } else if (!ptr) {
- err = -ENOENT;
- } else {
- err = 0;
- if (attr->flags & BPF_F_LOCK)
- /* lock 'ptr' and copy everything but lock */
- copy_map_value_locked(map, value, ptr, true);
- else
- copy_map_value(map, value, ptr);
- /* mask lock, since value wasn't zero inited */
- check_and_init_map_lock(map, value);
- }
- rcu_read_unlock();
- }
- this_cpu_dec(bpf_prog_active);
- preempt_enable();
-
-done:
+ err = bpf_map_copy_value(map, key, value, attr->flags);
if (err)
goto free_value;
@@ -931,16 +1031,6 @@ err_put:
return err;
}
-static void maybe_wait_bpf_programs(struct bpf_map *map)
-{
- /* Wait for any running BPF programs to complete so that
- * userspace, when we return to it, knows that all programs
- * that could be running use the new map value.
- */
- if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
- map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
- synchronize_rcu();
-}
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
@@ -996,60 +1086,8 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
- /* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
- err = bpf_map_offload_update_elem(map, key, value, attr->flags);
- goto out;
- } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
- map->map_type == BPF_MAP_TYPE_SOCKHASH ||
- map->map_type == BPF_MAP_TYPE_SOCKMAP) {
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- goto out;
- } else if (IS_FD_PROG_ARRAY(map)) {
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
- attr->flags);
- goto out;
- }
+ err = bpf_map_update_value(map, f, key, value, attr->flags);
- /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
- * inside bpf map update or delete otherwise deadlocks are possible
- */
- preempt_disable();
- __this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_update(map, key, value, attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_update(map, key, value, attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_update(map, key, value,
- attr->flags);
- } else if (IS_FD_ARRAY(map)) {
- rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
- attr->flags);
- rcu_read_unlock();
- } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
- attr->flags);
- rcu_read_unlock();
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
- /* rcu_read_lock() is not needed */
- err = bpf_fd_reuseport_array_update_elem(map, key, value,
- attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
- err = map->ops->map_push_elem(map, value, attr->flags);
- } else {
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- rcu_read_unlock();
- }
- __this_cpu_dec(bpf_prog_active);
- preempt_enable();
- maybe_wait_bpf_programs(map);
-out:
free_value:
kfree(value);
free_key:
@@ -1091,7 +1129,9 @@ static int map_delete_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
- } else if (IS_FD_PROG_ARRAY(map)) {
+ } else if (IS_FD_PROG_ARRAY(map) ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* These maps require sleepable context */
err = map->ops->map_delete_elem(map, key);
goto out;
}
@@ -1178,6 +1218,220 @@ err_put:
return err;
}
+/*
+ * generic_map_delete_batch - delete an array of user-supplied keys from @map.
+ * @map:   map to delete from
+ * @attr:  kernel copy of the syscall attributes; attr->batch.keys points to a
+ *         user array of attr->batch.count keys, each map->key_size bytes
+ * @uattr: user copy of the attributes; uattr->batch.count is overwritten with
+ *         the number of keys that were deleted before the loop stopped
+ *
+ * Return: 0 when every key was deleted (or the requested count was 0),
+ * -EINVAL for unsupported elem_flags, -ENOMEM if the key buffer cannot be
+ * allocated, -EFAULT if user memory cannot be read or written, otherwise the
+ * error returned by the map's delete operation for the first failing key.
+ */
+int generic_map_delete_batch(struct bpf_map *map,
+			     const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	void __user *keys = u64_to_user_ptr(attr->batch.keys);
+	u32 cp, max_count;
+	int err = 0;
+	void *key;
+
+	/* BPF_F_LOCK is the only per-element flag accepted here ... */
+	if (attr->batch.elem_flags & ~BPF_F_LOCK)
+		return -EINVAL;
+
+	/* ... and only for maps whose values embed a spin lock. */
+	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		return -EINVAL;
+	}
+
+	max_count = attr->batch.count;
+	if (!max_count)
+		return 0;
+
+	key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+	if (!key)
+		return -ENOMEM;
+
+	for (cp = 0; cp < max_count; cp++) {
+		err = -EFAULT;
+		/* cp and key_size are both u32: widen before multiplying so
+		 * the offset into the user array cannot wrap at 4 GiB.
+		 */
+		if (copy_from_user(key, keys + (size_t)cp * map->key_size,
+				   map->key_size))
+			break;
+
+		/* NOTE(review): the offload path leaves the loop after a
+		 * single key without counting it - confirm this is intended.
+		 */
+		if (bpf_map_is_dev_bound(map)) {
+			err = bpf_map_offload_delete_elem(map, key);
+			break;
+		}
+
+		/* bpf_prog_active is raised around the map operation so that
+		 * kprobe+bpf cannot trigger from inside it.
+		 */
+		preempt_disable();
+		__this_cpu_inc(bpf_prog_active);
+		rcu_read_lock();
+		err = map->ops->map_delete_elem(map, key);
+		rcu_read_unlock();
+		__this_cpu_dec(bpf_prog_active);
+		preempt_enable();
+		maybe_wait_bpf_programs(map);
+		if (err)
+			break;
+	}
+	/* Report progress even on failure; a failed report wins over err. */
+	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+		err = -EFAULT;
+
+	kfree(key);
+	return err;
+}
+
+/*
+ * generic_map_update_batch - update @map from parallel user arrays of keys
+ * and values.
+ * @map:   map to update
+ * @attr:  kernel copy of the syscall attributes; attr->batch.keys and
+ *         attr->batch.values point to user arrays of attr->batch.count
+ *         entries, attr->batch.elem_flags is passed to every update
+ * @uattr: user copy of the attributes; uattr->batch.count is overwritten with
+ *         the number of elements that were updated before the loop stopped
+ *
+ * Return: 0 when every element was updated (or the requested count was 0),
+ * -EINVAL for unsupported elem_flags, -ENOMEM if a staging buffer cannot be
+ * allocated, -EFAULT if user memory cannot be read or written, otherwise the
+ * error returned by bpf_map_update_value() for the first failing element.
+ */
+int generic_map_update_batch(struct bpf_map *map,
+			     const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	void __user *values = u64_to_user_ptr(attr->batch.values);
+	void __user *keys = u64_to_user_ptr(attr->batch.keys);
+	u32 value_size, cp, max_count;
+	/* Batch commands carry the map fd in the batch member of the union. */
+	int ufd = attr->batch.map_fd;
+	void *key, *value;
+	struct fd f;
+	int err = 0;
+
+	/* BPF_F_LOCK is the only per-element flag accepted here ... */
+	if (attr->batch.elem_flags & ~BPF_F_LOCK)
+		return -EINVAL;
+
+	/* ... and only for maps whose values embed a spin lock. */
+	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		return -EINVAL;
+	}
+
+	value_size = bpf_map_value_size(map);
+
+	max_count = attr->batch.count;
+	if (!max_count)
+		return 0;
+
+	key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+	if (!key)
+		return -ENOMEM;
+
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+	if (!value) {
+		kfree(key);
+		return -ENOMEM;
+	}
+
+	/* Take the fd reference only once no early return is left, and drop
+	 * it on the single exit path below so it can never leak.
+	 */
+	f = fdget(ufd);
+
+	for (cp = 0; cp < max_count; cp++) {
+		err = -EFAULT;
+		/* cp, key_size and value_size are all u32: widen before
+		 * multiplying so the user array offsets cannot wrap at 4 GiB.
+		 */
+		if (copy_from_user(key, keys + (size_t)cp * map->key_size,
+				   map->key_size) ||
+		    copy_from_user(value, values + (size_t)cp * value_size,
+				   value_size))
+			break;
+
+		err = bpf_map_update_value(map, f, key, value,
+					   attr->batch.elem_flags);
+
+		if (err)
+			break;
+	}
+
+	/* Report progress even on failure; a failed report wins over err. */
+	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+		err = -EFAULT;
+
+	kfree(value);
+	kfree(key);
+	fdput(f);
+	return err;
+}
+
+/* Extra attempts allowed when an element disappears between fetching its
+ * key and copying its value.
+ */
+#define MAP_LOOKUP_RETRIES 3
+
+/*
+ * generic_map_lookup_batch - copy up to attr->batch.count key/value pairs of
+ * @map to user space by walking it with map_get_next_key().
+ * @map:   map to read
+ * @attr:  kernel copy of the syscall attributes:
+ *         batch.in_batch  - optional user pointer to the key to resume after;
+ *                           NULL starts the walk from the beginning
+ *         batch.out_batch - user buffer receiving the last key copied out
+ *         batch.keys      - user array receiving the keys
+ *         batch.values    - user array receiving the values
+ * @uattr: user copy of the attributes; uattr->batch.count is zeroed up front
+ *         and later set to the number of pairs copied out
+ *
+ * Return: 0 when the requested number of pairs was copied (or the requested
+ * count was 0), -EINVAL for unsupported elem_flags, -ENOMEM if a staging
+ * buffer cannot be allocated, -EFAULT if user memory cannot be accessed,
+ * -EINTR when the retry budget for vanished elements is exhausted, otherwise
+ * the error from map_get_next_key() or bpf_map_copy_value().
+ */
+int generic_map_lookup_batch(struct bpf_map *map,
+			     const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
+	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+	void __user *values = u64_to_user_ptr(attr->batch.values);
+	void __user *keys = u64_to_user_ptr(attr->batch.keys);
+	void *buf, *buf_prevkey, *prev_key, *key, *value;
+	int err, retry = MAP_LOOKUP_RETRIES;
+	u32 value_size, cp, max_count;
+
+	if (attr->batch.elem_flags & ~BPF_F_LOCK)
+		return -EINVAL;
+
+	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map))
+		return -EINVAL;
+
+	value_size = bpf_map_value_size(map);
+
+	max_count = attr->batch.count;
+	if (!max_count)
+		return 0;
+
+	if (put_user(0, &uattr->batch.count))
+		return -EFAULT;
+
+	buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+	if (!buf_prevkey)
+		return -ENOMEM;
+
+	/* One allocation holds the current key followed by its value. */
+	buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+	if (!buf) {
+		/* kmalloc()ed above, so release with kfree() like free_buf. */
+		kfree(buf_prevkey);
+		return -ENOMEM;
+	}
+
+	err = -EFAULT;
+	prev_key = NULL;
+	if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
+		goto free_buf;
+	key = buf;
+	value = key + map->key_size;
+	if (ubatch)
+		prev_key = buf_prevkey;
+
+	for (cp = 0; cp < max_count;) {
+		rcu_read_lock();
+		err = map->ops->map_get_next_key(map, prev_key, key);
+		rcu_read_unlock();
+		if (err)
+			break;
+		err = bpf_map_copy_value(map, key, value,
+					 attr->batch.elem_flags);
+
+		/* The key was returned but its element is already gone: ask
+		 * for the next key again, a bounded number of times.
+		 */
+		if (err == -ENOENT) {
+			if (retry) {
+				retry--;
+				continue;
+			}
+			err = -EINTR;
+			break;
+		}
+
+		if (err)
+			goto free_buf;
+
+		/* cp, key_size and value_size are all u32: widen before
+		 * multiplying so the user array offsets cannot wrap at 4 GiB.
+		 */
+		if (copy_to_user(keys + (size_t)cp * map->key_size, key,
+				 map->key_size)) {
+			err = -EFAULT;
+			goto free_buf;
+		}
+		if (copy_to_user(values + (size_t)cp * value_size, value,
+				 value_size)) {
+			err = -EFAULT;
+			goto free_buf;
+		}
+
+		if (!prev_key)
+			prev_key = buf_prevkey;
+
+		/* The key just copied out becomes the cursor for the next
+		 * map_get_next_key() call; the old cursor buffer is reused
+		 * for the next key.
+		 */
+		swap(prev_key, key);
+		retry = MAP_LOOKUP_RETRIES;
+		cp++;
+	}
+
+	if (err == -EFAULT)
+		goto free_buf;
+
+	/* Publish how many pairs were copied and, if any, the last key. */
+	if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
+		    (cp && copy_to_user(uobatch, prev_key, map->key_size))))
+		err = -EFAULT;
+
+free_buf:
+	kfree(buf_prevkey);
+	kfree(buf);
+	return err;
+}
+
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
static int map_lookup_and_delete_elem(union bpf_attr *attr)
@@ -1306,6 +1560,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
return 0;
}
+/* Program life-cycle events reported to the audit subsystem. */
+enum bpf_audit {
+	BPF_AUDIT_LOAD,
+	BPF_AUDIT_UNLOAD,
+	BPF_AUDIT_MAX,
+};
+
+/* Text for the "op=" field of the audit record, indexed by enum bpf_audit. */
+static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
+	[BPF_AUDIT_LOAD]   = "LOAD",
+	[BPF_AUDIT_UNLOAD] = "UNLOAD",
+};
+
+/*
+ * Emit an AUDIT_BPF record carrying the id of @prog and the name of @op.
+ * Nothing is logged when @op is out of range (which also warns once), when
+ * auditing is off, or when no audit buffer can be obtained.
+ */
+static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
+{
+	struct audit_buffer *ab;
+
+	if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX) || audit_enabled == AUDIT_OFF)
+		return;
+
+	/* Only a load is logged against the current audit context; any other
+	 * op is logged without one.
+	 */
+	ab = audit_log_start(op == BPF_AUDIT_LOAD ? audit_context() : NULL,
+			     GFP_ATOMIC, AUDIT_BPF);
+	if (unlikely(!ab))
+		return;
+
+	audit_log_format(ab, "prog-id=%u op=%s",
+			 prog->aux->id, bpf_audit_str[op]);
+	audit_log_end(ab);
+}
+
int __bpf_prog_charge(struct user_struct *user, u32 pages)
{
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -1421,6 +1705,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic64_dec_and_test(&prog->aux->refcnt)) {
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
__bpf_prog_put_noref(prog, true);
@@ -1640,17 +1925,24 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
enum bpf_attach_type expected_attach_type,
u32 btf_id, u32 prog_fd)
{
- switch (prog_type) {
- case BPF_PROG_TYPE_TRACING:
+ if (btf_id) {
if (btf_id > BTF_MAX_TYPE)
return -EINVAL;
- break;
- default:
- if (btf_id || prog_fd)
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ case BPF_PROG_TYPE_EXT:
+ break;
+ default:
return -EINVAL;
- break;
+ }
}
+ if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
+ prog_type != BPF_PROG_TYPE_EXT)
+ return -EINVAL;
+
switch (prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
switch (expected_attach_type) {
@@ -1691,6 +1983,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_EXT:
+ if (expected_attach_type)
+ return -EINVAL;
+ /* fallthrough */
default:
return 0;
}
@@ -1830,6 +2126,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
*/
bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_LOAD);
err = bpf_prog_new_fd(prog);
if (err < 0)
@@ -1892,7 +2189,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
int tr_fd, err;
if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
- prog->expected_attach_type != BPF_TRACE_FEXIT) {
+ prog->expected_attach_type != BPF_TRACE_FEXIT &&
+ prog->type != BPF_PROG_TYPE_EXT) {
err = -EINVAL;
goto out_put_prog;
}
@@ -1959,12 +2257,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_EXT &&
prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
err = -EINVAL;
goto out_put_prog;
}
- if (prog->type == BPF_PROG_TYPE_TRACING) {
+ if (prog->type == BPF_PROG_TYPE_TRACING ||
+ prog->type == BPF_PROG_TYPE_EXT) {
if (attr->raw_tracepoint.name) {
/* The attach point for this category of programs
* should be specified via btf_id during program load.
@@ -2040,10 +2340,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
}
}
-#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
#define BPF_F_ATTACH_MASK \
- (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
static int bpf_prog_attach(const union bpf_attr *attr)
{
@@ -2305,17 +2605,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
-static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+struct bpf_prog *bpf_prog_by_id(u32 id)
{
struct bpf_prog *prog;
- u32 id = attr->prog_id;
- int fd;
- if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!id)
+ return ERR_PTR(-ENOENT);
spin_lock_bh(&prog_idr_lock);
prog = idr_find(&prog_idr, id);
@@ -2324,7 +2619,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
else
prog = ERR_PTR(-ENOENT);
spin_unlock_bh(&prog_idr_lock);
+ return prog;
+}
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ u32 id = attr->prog_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ prog = bpf_prog_by_id(id);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -2774,6 +3084,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}
+ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_info_fill(&info, map);
@@ -2986,6 +3297,61 @@ out:
return err;
}
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+/* Call the batch callback @fn, or fail with -ENOTSUPP when the map type does
+ * not provide it. Relies on 'map', 'attr', 'uattr', 'err' and the 'err_put'
+ * label of the enclosing function.
+ */
+#define BPF_DO_BATCH(fn)			\
+	do {					\
+		if (!(fn)) {			\
+			err = -ENOTSUPP;	\
+			goto err_put;		\
+		}				\
+		err = (fn)(map, attr, uattr);	\
+	} while (0)
+
+/*
+ * Common entry point of the four batch map commands: resolve the map from
+ * attr->batch.map_fd, check the fd permissions required by @cmd and hand
+ * over to the matching batch callback of the map.
+ *
+ * Returns -EINVAL for malformed attributes, the error from __bpf_map_get()
+ * for a bad fd, -EPERM when the fd lacks the needed access, -ENOTSUPP when
+ * the map has no such callback, otherwise the callback's return value.
+ */
+static int bpf_map_do_batch(const union bpf_attr *attr,
+			    union bpf_attr __user *uattr,
+			    int cmd)
+{
+	/* Both lookup flavours read; everything except plain lookup writes. */
+	bool reads = cmd == BPF_MAP_LOOKUP_BATCH ||
+		     cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
+	bool writes = cmd != BPF_MAP_LOOKUP_BATCH;
+	struct bpf_map *map;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_BATCH))
+		return -EINVAL;
+
+	f = fdget(attr->batch.map_fd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (reads && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	if (writes && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	switch (cmd) {
+	case BPF_MAP_LOOKUP_BATCH:
+		BPF_DO_BATCH(map->ops->map_lookup_batch);
+		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+		BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+		break;
+	case BPF_MAP_UPDATE_BATCH:
+		BPF_DO_BATCH(map->ops->map_update_batch);
+		break;
+	default:
+		/* any remaining command is handled as a batch delete */
+		BPF_DO_BATCH(map->ops->map_delete_batch);
+		break;
+	}
+
+err_put:
+	fdput(f);
+	return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -3083,6 +3449,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
+ case BPF_MAP_LOOKUP_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+ break;
+ case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+ break;
+ case BPF_MAP_UPDATE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+ break;
+ case BPF_MAP_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 23b0d5cfd47e..6b264a92064b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -4,16 +4,98 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
+#include <linux/rbtree_latch.h>
+
+/* dummy _ops. The verifier will operate on target program's ops. */
+const struct bpf_verifier_ops bpf_extension_verifier_ops = {
+};
+const struct bpf_prog_ops bpf_extension_prog_ops = {
+};
/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+static struct latch_tree_root image_tree __cacheline_aligned;
-/* serializes access to trampoline_table */
+/* serializes access to trampoline_table and image_tree */
static DEFINE_MUTEX(trampoline_mutex);
+/* Allocate one executable page for a BPF image; returns NULL on failure. */
+static void *bpf_jit_alloc_exec_page(void)
+{
+	void *page = bpf_jit_alloc_exec(PAGE_SIZE);
+
+	if (page) {
+		set_vm_flush_reset_perms(page);
+		/* Keep image as writeable. The alternative is to keep flipping
+		 * ro/rw every time a new program is attached or detached.
+		 */
+		set_memory_x((long)page, 1);
+	}
+	return page;
+}
+
+/* Latch tree ordering: images are sorted by the address of their bpf_image. */
+static __always_inline bool image_tree_less(struct latch_tree_node *a,
+					    struct latch_tree_node *b)
+{
+	return container_of(a, struct bpf_image, tnode) <
+	       container_of(b, struct bpf_image, tnode);
+}
+
+/* Latch tree lookup: compare @addr against the page-sized range that starts
+ * at the bpf_image owning @n. Returns -1 below, 1 at or above the end, and 0
+ * when @addr lies inside the range.
+ */
+static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n)
+{
+	void *start = container_of(n, struct bpf_image, tnode);
+	void *end = start + PAGE_SIZE;
+
+	if (addr < start)
+		return -1;
+
+	return addr >= end ? 1 : 0;
+}
+
+static const struct latch_tree_ops image_tree_ops = {
+	.less	= image_tree_less,
+	.comp	= image_tree_comp,
+};
+
+/* Allocate an executable page, insert it into image_tree and return a
+ * pointer to its data area, or NULL if the page cannot be allocated.
+ * trampoline_mutex is taken around the insertion only when @lock is true.
+ */
+static void *__bpf_image_alloc(bool lock)
+{
+	struct bpf_image *img = bpf_jit_alloc_exec_page();
+
+	if (!img)
+		return NULL;
+
+	if (lock) {
+		mutex_lock(&trampoline_mutex);
+		latch_tree_insert(&img->tnode, &image_tree, &image_tree_ops);
+		mutex_unlock(&trampoline_mutex);
+	} else {
+		latch_tree_insert(&img->tnode, &image_tree, &image_tree_ops);
+	}
+
+	return img->data;
+}
+
+/* Locking variant of __bpf_image_alloc(). */
+void *bpf_image_alloc(void)
+{
+	return __bpf_image_alloc(true);
+}
+
+/* Return true when @addr falls inside an image registered in image_tree. */
+bool is_bpf_image_address(unsigned long addr)
+{
+	bool found;
+
+	rcu_read_lock();
+	found = !!latch_tree_find((void *)addr, &image_tree, &image_tree_ops);
+	rcu_read_unlock();
+
+	return found;
+}
+
struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
struct bpf_trampoline *tr;
@@ -34,7 +116,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
goto out;
/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
- image = bpf_jit_alloc_exec(PAGE_SIZE);
+ image = __bpf_image_alloc(false);
if (!image) {
kfree(tr);
tr = NULL;
@@ -48,12 +130,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
INIT_HLIST_HEAD(&tr->progs_hlist[i]);
-
- set_vm_flush_reset_perms(image);
- /* Keep image as writeable. The alternative is to keep flipping ro/rw
- * everytime new program is attached or detached.
- */
- set_memory_x((long)image, 1);
tr->image = image;
out:
mutex_unlock(&trampoline_mutex);
@@ -115,14 +191,14 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
- * bytes on x86. Pick a number to fit into PAGE_SIZE / 2
+ * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2
*/
#define BPF_MAX_TRAMP_PROGS 40
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
- void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
- void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+ void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2;
+ void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2;
struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
@@ -150,11 +226,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
if (fexit_cnt)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
- err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags,
+ /* Though the second half of trampoline page is unused a task could be
+ * preempted in the middle of the first half of trampoline and two
+ * updates to trampoline would change the code from underneath the
+ * preempted task. Hence wait for tasks to voluntarily schedule or go
+ * to userspace.
+ */
+ synchronize_rcu_tasks();
+
+ err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2,
+ &tr->func.model, flags,
fentry, fentry_cnt,
fexit, fexit_cnt,
tr->func.addr);
- if (err)
+ if (err < 0)
goto out;
if (tr->selector)
@@ -175,8 +260,10 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
switch (t) {
case BPF_TRACE_FENTRY:
return BPF_TRAMP_FENTRY;
- default:
+ case BPF_TRACE_FEXIT:
return BPF_TRAMP_FEXIT;
+ default:
+ return BPF_TRAMP_REPLACE;
}
}
@@ -185,12 +272,31 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog)
enum bpf_tramp_prog_type kind;
struct bpf_trampoline *tr;
int err = 0;
+ int cnt;
tr = prog->aux->trampoline;
kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
mutex_lock(&tr->mutex);
- if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]
- >= BPF_MAX_TRAMP_PROGS) {
+ if (tr->extension_prog) {
+ /* cannot attach fentry/fexit if extension prog is attached.
+ * cannot overwrite extension prog either.
+ */
+ err = -EBUSY;
+ goto out;
+ }
+ cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
+ if (kind == BPF_TRAMP_REPLACE) {
+ /* Cannot attach extension if fentry/fexit are in use. */
+ if (cnt) {
+ err = -EBUSY;
+ goto out;
+ }
+ tr->extension_prog = prog;
+ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ prog->bpf_func);
+ goto out;
+ }
+ if (cnt >= BPF_MAX_TRAMP_PROGS) {
err = -E2BIG;
goto out;
}
@@ -221,15 +327,25 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
tr = prog->aux->trampoline;
kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
mutex_lock(&tr->mutex);
+ if (kind == BPF_TRAMP_REPLACE) {
+ WARN_ON_ONCE(!tr->extension_prog);
+ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+ tr->extension_prog->bpf_func, NULL);
+ tr->extension_prog = NULL;
+ goto out;
+ }
hlist_del(&prog->aux->tramp_hlist);
tr->progs_cnt[kind]--;
err = bpf_trampoline_update(prog->aux->trampoline);
+out:
mutex_unlock(&tr->mutex);
return err;
}
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
+ struct bpf_image *image;
+
if (!tr)
return;
mutex_lock(&trampoline_mutex);
@@ -240,7 +356,11 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
goto out;
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
- bpf_jit_free_exec(tr->image);
+ image = container_of(tr->image, struct bpf_image, data);
+ latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops);
+ /* wait for tasks to get out of trampoline before freeing it */
+ synchronize_rcu_tasks();
+ bpf_jit_free_exec(image);
hlist_del(&tr->hlist);
kfree(tr);
out:
@@ -286,7 +406,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
}
int __weak
-arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
+arch_prepare_bpf_trampoline(void *image, void *image_end,
+ const struct btf_func_model *m, u32 flags,
struct bpf_prog **fentry_progs, int fentry_cnt,
struct bpf_prog **fexit_progs, int fexit_cnt,
void *orig_call)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7d530ce8719d..1cc945daa9c8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1122,10 +1122,6 @@ static void init_reg_state(struct bpf_verifier_env *env,
regs[BPF_REG_FP].type = PTR_TO_STACK;
mark_reg_known_zero(env, regs, BPF_REG_FP);
regs[BPF_REG_FP].frameno = state->frameno;
-
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
- mark_reg_known_zero(env, regs, BPF_REG_1);
}
#define BPF_MAIN_FUNC (-1)
@@ -1916,6 +1912,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
+ case PTR_TO_BTF_ID:
return true;
default:
return false;
@@ -2738,8 +2735,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
}
#endif
-static int check_ctx_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
+int check_ctx_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
{
/* Access to ctx or passing it to a helper is only allowed in
* its original, unmodified form.
@@ -2858,11 +2855,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
u32 btf_id;
int ret;
- if (atype != BPF_READ) {
- verbose(env, "only read is supported\n");
- return -EACCES;
- }
-
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
@@ -2879,17 +2871,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
+ if (env->ops->btf_struct_access) {
+ ret = env->ops->btf_struct_access(&env->log, t, off, size,
+ atype, &btf_id);
+ } else {
+ if (atype != BPF_READ) {
+ verbose(env, "only read is supported\n");
+ return -EACCES;
+ }
+
+ ret = btf_struct_access(&env->log, t, off, size, atype,
+ &btf_id);
+ }
+
if (ret < 0)
return ret;
- if (ret == SCALAR_VALUE) {
- mark_reg_unknown(env, regs, value_regno);
- return 0;
+ if (atype == BPF_READ) {
+ if (ret == SCALAR_VALUE) {
+ mark_reg_unknown(env, regs, value_regno);
+ return 0;
+ }
+ mark_reg_known_zero(env, regs, value_regno);
+ regs[value_regno].type = PTR_TO_BTF_ID;
+ regs[value_regno].btf_id = btf_id;
}
- mark_reg_known_zero(env, regs, value_regno);
- regs[value_regno].type = PTR_TO_BTF_ID;
- regs[value_regno].btf_id = btf_id;
+
return 0;
}
@@ -3945,12 +3952,26 @@ static int release_reference(struct bpf_verifier_env *env,
return 0;
}
+/* Mark every caller-saved register in @regs as uninitialized, as it is after
+ * a call.
+ */
+static void clear_caller_saved_regs(struct bpf_verifier_env *env,
+				    struct bpf_reg_state *regs)
+{
+	int idx;
+
+	/* after the call registers r0 - r5 were scratched */
+	for (idx = 0; idx < CALLER_SAVED_REGS; idx++) {
+		int regno = caller_saved[idx];
+
+		mark_reg_not_init(env, regs, regno);
+		check_reg_arg(env, regno, DST_OP_NO_MARK);
+	}
+}
+
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
int i, err, subprog, target_insn;
+ bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep\n",
@@ -3973,6 +3994,32 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
+ func_info_aux = env->prog->aux->func_info_aux;
+ if (func_info_aux)
+ is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (is_global) {
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d\n",
+ subprog);
+ return err;
+ } else {
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env,
+ "Func#%d is global and valid. Skipping.\n",
+ subprog);
+ clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return SCALAR_VALUE */
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+
+ /* continue with next insn after call */
+ return 0;
+ }
+ }
+
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@@ -3999,18 +4046,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
callee->regs[i] = caller->regs[i];
- /* after the call registers r0 - r5 were scratched */
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(env, caller->regs, caller_saved[i]);
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
- }
+ clear_caller_saved_regs(env, caller->regs);
/* only increment it after check_reg_arg() finished */
state->curframe++;
- if (btf_check_func_arg_match(env, subprog))
- return -EINVAL;
-
/* and go analyze first insn of the callee */
*insn_idx = target_insn;
@@ -6360,8 +6400,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
static int check_return_code(struct bpf_verifier_env *env)
{
struct tnum enforce_attach_type_range = tnum_unknown;
+ const struct bpf_prog *prog = env->prog;
struct bpf_reg_state *reg;
struct tnum range = tnum_range(0, 1);
+ int err;
+
+ /* The struct_ops func-ptr's return type could be "void" */
+ if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
+ !prog->aux->attach_func_proto->type)
+ return 0;
+
+ /* eBPF calling convention is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose(env, "R0 leaks addr as return value\n");
+ return -EACCES;
+ }
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -6750,12 +6812,13 @@ static int check_btf_func(struct bpf_verifier_env *env,
/* check type_id */
type = btf_type_by_id(btf, krecord[i].type_id);
- if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+ if (!type || !btf_type_is_func(type)) {
verbose(env, "invalid type id %d in func info",
krecord[i].type_id);
ret = -EINVAL;
goto err_free;
}
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
prev_offset = krecord[i].insn_off;
urecord += urec_size;
}
@@ -7735,35 +7798,13 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
static int do_check(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
bool do_print_state = false;
int prev_insn_idx = -1;
- env->prev_linfo = NULL;
-
- state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
- if (!state)
- return -ENOMEM;
- state->curframe = 0;
- state->speculative = false;
- state->branches = 1;
- state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
- if (!state->frame[0]) {
- kfree(state);
- return -ENOMEM;
- }
- env->cur_state = state;
- init_func_state(env, state->frame[0],
- BPF_MAIN_FUNC /* callsite */,
- 0 /* frameno */,
- 0 /* subprogno, zero == main subprog */);
-
- if (btf_check_func_arg_match(env, 0))
- return -EINVAL;
-
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -7841,7 +7882,7 @@ static int do_check(struct bpf_verifier_env *env)
}
regs = cur_regs(env);
- env->insn_aux_data[env->insn_idx].seen = true;
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
prev_insn_idx = env->insn_idx;
if (class == BPF_ALU || class == BPF_ALU64) {
@@ -8027,21 +8068,6 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- /* eBPF calling convetion is such that R0 is used
- * to return the value from eBPF program.
- * Make sure that it's readable at this time
- * of bpf_exit, which means that program wrote
- * something into it earlier
- */
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
- if (err)
- return err;
-
- if (is_pointer_value(env, BPF_REG_0)) {
- verbose(env, "R0 leaks addr as return value\n");
- return -EACCES;
- }
-
err = check_return_code(env);
if (err)
return err;
@@ -8076,7 +8102,7 @@ process_bpf_exit:
return err;
env->insn_idx++;
- env->insn_aux_data[env->insn_idx].seen = true;
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
} else {
verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
@@ -8089,7 +8115,6 @@ process_bpf_exit:
env->insn_idx++;
}
- env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return 0;
}
@@ -8149,6 +8174,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ verbose(env, "bpf_struct_ops map cannot be used in prog\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -8361,7 +8391,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
for (i = off; i < off + cnt - 1; i++) {
- new_data[i].seen = true;
+ new_data[i].seen = env->pass_cnt;
new_data[i].zext_dst = insn_has_def32(env, insn + i);
}
env->insn_aux_data = new_data;
@@ -8840,12 +8870,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
break;
case PTR_TO_BTF_ID:
- if (type == BPF_WRITE) {
+ if (type == BPF_READ) {
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ env->prog->aux->num_exentries++;
+ } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) {
verbose(env, "Writes through BTF pointers are not allowed\n");
return -EINVAL;
}
- insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
- env->prog->aux->num_exentries++;
continue;
default:
continue;
@@ -9425,6 +9457,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_jiffies64) {
+ struct bpf_insn ld_jiffies_addr[2] = {
+ BPF_LD_IMM64(BPF_REG_0,
+ (unsigned long)&jiffies),
+ };
+
+ insn_buf[0] = ld_jiffies_addr[0];
+ insn_buf[1] = ld_jiffies_addr[1];
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
+ BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
@@ -9471,6 +9527,7 @@ static void free_states(struct bpf_verifier_env *env)
kfree(sl);
sl = sln;
}
+ env->free_list = NULL;
if (!env->explored_states)
return;
@@ -9484,11 +9541,164 @@ static void free_states(struct bpf_verifier_env *env)
kfree(sl);
sl = sln;
}
+ env->explored_states[i] = NULL;
}
+}
- kvfree(env->explored_states);
+/* The verifier is using insn_aux_data[] to store temporary data during
+ * verification and to store information for passes that run after the
+ * verification like dead code sanitization. do_check_common() for subprogram N
+ * may analyze many other subprograms. sanitize_insn_aux_data() clears all
+ * temporary data after do_check_common() finds that subprogram N cannot be
+ * verified independently. pass_cnt counts the number of times
+ * do_check_common() was run and insn_aux_data[i].seen records the pass in which
+ * that entry was last touched. The two are compared to clear temporary data
+ * left by a failed pass. For testing and experiments do_check_common() can be
+ * run multiple times even when a prior attempt to verify was unsuccessful.
+ */
+static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ struct bpf_insn_aux_data *aux;
+ int i, class;
+
+ for (i = 0; i < env->prog->len; i++) {
+ class = BPF_CLASS(insn[i].code);
+ if (class != BPF_LDX && class != BPF_STX)
+ continue;
+ aux = &env->insn_aux_data[i];
+ if (aux->seen != env->pass_cnt)
+ continue;
+ memset(aux, 0, offsetof(typeof(*aux), orig_idx));
+ }
}
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_verifier_state *state;
+ struct bpf_reg_state *regs;
+ int ret, i;
+
+ env->prev_linfo = NULL;
+ env->pass_cnt++;
+
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+ state->curframe = 0;
+ state->speculative = false;
+ state->branches = 1;
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
+ if (!state->frame[0]) {
+ kfree(state);
+ return -ENOMEM;
+ }
+ env->cur_state = state;
+ init_func_state(env, state->frame[0],
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno */,
+ subprog);
+
+ regs = state->frame[state->curframe]->regs;
+ if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
+ ret = btf_prepare_func_args(env, subprog, regs);
+ if (ret)
+ goto out;
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (regs[i].type == PTR_TO_CTX)
+ mark_reg_known_zero(env, regs, i);
+ else if (regs[i].type == SCALAR_VALUE)
+ mark_reg_unknown(env, regs, i);
+ }
+ } else {
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(env, regs, BPF_REG_1);
+ ret = btf_check_func_arg_match(env, subprog, regs);
+ if (ret == -EFAULT)
+ /* unlikely verifier bug. abort.
+ * ret == 0 and other ret < 0 are sadly acceptable for
+ * main() function due to backward compatibility.
+ * E.g. a socket filter program may be written as:
+ * int bpf_prog(struct pt_regs *ctx)
+ * and never dereference that ctx in the program.
+ * 'struct pt_regs' is a type mismatch for a socket
+ * filter, which should be using 'struct __sk_buff'.
+ */
+ goto out;
+ }
+
+ ret = do_check(env);
+out:
+ /* check for NULL is necessary, since cur_state can be freed inside
+ * do_check() under memory pressure.
+ */
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
+ while (!pop_stack(env, NULL, NULL));
+ free_states(env);
+ if (ret)
+ /* clean aux data in case subprog was rejected */
+ sanitize_insn_aux_data(env);
+ return ret;
+}
+
+/* Verify all global functions in a BPF program one by one based on their BTF.
+ * All global functions must pass verification. Otherwise the whole program is rejected.
+ * Consider:
+ * int bar(int);
+ * int foo(int f)
+ * {
+ * return bar(f);
+ * }
+ * int bar(int b)
+ * {
+ * ...
+ * }
+ * foo() will be verified first for R1=any_scalar_value. During verification it
+ * will be assumed that bar() has already been verified successfully and the call
+ * to bar() from foo() will be checked for type match only. Later bar() will be
+ * verified independently to check that it's safe for R1=any_scalar_value.
+ */
+static int do_check_subprogs(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int i, ret;
+
+ if (!aux->func_info)
+ return 0;
+
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+ continue;
+ env->insn_idx = env->subprog_info[i].start;
+ WARN_ON_ONCE(env->insn_idx == 0);
+ ret = do_check_common(env, i);
+ if (ret) {
+ return ret;
+ } else if (env->log.level & BPF_LOG_LEVEL) {
+ verbose(env,
+ "Func#%d is safe for any args that match its prototype\n",
+ i);
+ }
+ }
+ return 0;
+}
+
+static int do_check_main(struct bpf_verifier_env *env)
+{
+ int ret;
+
+ env->insn_idx = 0;
+ ret = do_check_common(env, 0);
+ if (!ret)
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+ return ret;
+}
+
+
static void print_verification_stats(struct bpf_verifier_env *env)
{
int i;
@@ -9513,9 +9723,62 @@ static void print_verification_stats(struct bpf_verifier_env *env)
env->peak_states, env->longest_mark_read_walk);
}
+static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
+{
+ const struct btf_type *t, *func_proto;
+ const struct bpf_struct_ops *st_ops;
+ const struct btf_member *member;
+ struct bpf_prog *prog = env->prog;
+ u32 btf_id, member_idx;
+ const char *mname;
+
+ btf_id = prog->aux->attach_btf_id;
+ st_ops = bpf_struct_ops_find(btf_id);
+ if (!st_ops) {
+ verbose(env, "attach_btf_id %u is not a supported struct\n",
+ btf_id);
+ return -ENOTSUPP;
+ }
+
+ t = st_ops->type;
+ member_idx = prog->expected_attach_type;
+ if (member_idx >= btf_type_vlen(t)) {
+ verbose(env, "attach to invalid member idx %u of struct %s\n",
+ member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ member = &btf_type_member(t)[member_idx];
+ mname = btf_name_by_offset(btf_vmlinux, member->name_off);
+ func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
+ NULL);
+ if (!func_proto) {
+ verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
+ mname, member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ if (st_ops->check_member) {
+ int err = st_ops->check_member(t, member);
+
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+ }
+
+ prog->aux->attach_func_proto = func_proto;
+ prog->aux->attach_func_name = mname;
+ env->ops = st_ops->verifier_ops;
+
+ return 0;
+}
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
+ bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
struct bpf_prog *tgt_prog = prog->aux->linked_prog;
u32 btf_id = prog->aux->attach_btf_id;
const char prefix[] = "btf_trace_";
@@ -9528,7 +9791,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
long addr;
u64 key;
- if (prog->type != BPF_PROG_TYPE_TRACING)
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
+ return check_struct_ops_btf_id(env);
+
+ if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension)
return 0;
if (!btf_id) {
@@ -9564,8 +9830,59 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
conservative = aux->func_info_aux[subprog].unreliable;
+ if (prog_extension) {
+ if (conservative) {
+ verbose(env,
+ "Cannot replace static functions\n");
+ return -EINVAL;
+ }
+ if (!prog->jit_requested) {
+ verbose(env,
+ "Extension programs should be JITed\n");
+ return -EINVAL;
+ }
+ env->ops = bpf_verifier_ops[tgt_prog->type];
+ }
+ if (!tgt_prog->jited) {
+ verbose(env, "Can attach to only JITed progs\n");
+ return -EINVAL;
+ }
+ if (tgt_prog->type == prog->type) {
+ /* Cannot fentry/fexit another fentry/fexit program.
+ * Cannot attach program extension to another extension.
+ * It's ok to attach fentry/fexit to extension program.
+ */
+ verbose(env, "Cannot recursively attach\n");
+ return -EINVAL;
+ }
+ if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
+ prog_extension &&
+ (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+ /* Program extensions can extend all program types
+ * except fentry/fexit. The reason is the following.
+ * The fentry/fexit programs are used for performance
+ * analysis and stats, and can be attached to any program
+ * type except themselves. When an extension program is
+ * replacing an XDP function it is necessary to allow
+ * performance analysis of all functions: both the original
+ * XDP program and its program extension. Hence
+ * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
+ * allowed. If extending fentry/fexit were allowed it
+ * would be possible to create a long call chain
+ * fentry->extension->fentry->extension beyond
+ * reasonable stack size. Hence extending fentry/fexit is
+ * not allowed.
+ */
+ verbose(env, "Cannot extend fentry/fexit\n");
+ return -EINVAL;
+ }
key = ((u64)aux->id) << 32 | btf_id;
} else {
+ if (prog_extension) {
+ verbose(env, "Cannot replace kernel functions\n");
+ return -EINVAL;
+ }
key = btf_id;
}
@@ -9603,6 +9920,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_func_proto = t;
prog->aux->attach_btf_trace = true;
return 0;
+ default:
+ if (!prog_extension)
+ return -EINVAL;
+ /* fallthrough */
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
if (!btf_type_is_func(t)) {
@@ -9610,6 +9931,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
btf_id);
return -EINVAL;
}
+ if (prog_extension &&
+ btf_check_type_match(env, prog, btf, t))
+ return -EINVAL;
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
return -EINVAL;
@@ -9633,18 +9957,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
if (ret < 0)
goto out;
if (tgt_prog) {
- if (!tgt_prog->jited) {
- /* for now */
- verbose(env, "Can trace only JITed BPF progs\n");
- ret = -EINVAL;
- goto out;
- }
- if (tgt_prog->type == BPF_PROG_TYPE_TRACING) {
- /* prevent cycles */
- verbose(env, "Cannot recursively attach\n");
- ret = -EINVAL;
- goto out;
- }
if (subprog == 0)
addr = (long) tgt_prog->bpf_func;
else
@@ -9666,8 +9978,6 @@ out:
if (ret)
bpf_trampoline_put(tr);
return ret;
- default:
- return -EINVAL;
}
}
@@ -9737,10 +10047,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
goto skip_full_check;
}
- ret = check_attach_btf_id(env);
- if (ret)
- goto skip_full_check;
-
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
@@ -9777,22 +10083,22 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (ret < 0)
goto skip_full_check;
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
- ret = do_check(env);
- if (env->cur_state) {
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- }
+ ret = do_check_subprogs(env);
+ ret = ret ?: do_check_main(env);
if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
- while (!pop_stack(env, NULL, NULL));
- free_states(env);
+ kvfree(env->explored_states);
if (ret == 0)
ret = check_max_stack_depth(env);
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 90c4fce1c981..2cc5c8f4c800 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs,
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
struct bpf_map_memory mem;
- int cpu, err, numa_node;
+ int err, numa_node;
struct xsk_map *m;
- u64 cost, size;
+ u64 size;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
numa_node = bpf_map_attr_numa_node(attr);
size = struct_size(m, xsk_map, attr->max_entries);
- cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
- err = bpf_map_charge_init(&mem, cost);
+ err = bpf_map_charge_init(&mem, size);
if (err < 0)
return ERR_PTR(err);
@@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
bpf_map_charge_move(&m->map.memory, &mem);
spin_lock_init(&m->lock);
- m->flush_list = alloc_percpu(struct list_head);
- if (!m->flush_list) {
- bpf_map_charge_finish(&m->map.memory);
- bpf_map_area_free(m);
- return ERR_PTR(-ENOMEM);
- }
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
-
return &m->map;
}
@@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_net();
- free_percpu(m->flush_list);
bpf_map_area_free(m);
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1e12e6928bca..b3744872263e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6289,12 +6289,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 flags)
+ struct bpf_prog *replace_prog, enum bpf_attach_type type,
+ u32 flags)
{
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
mutex_unlock(&cgroup_mutex);
return ret;
}
diff --git a/kernel/extable.c b/kernel/extable.c
index f6920a11e28a..a0024f27d3a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -131,8 +131,9 @@ int kernel_text_address(unsigned long addr)
* triggers a stack trace, or a WARN() that happens during
* coming back from idle, or cpu on or offlining.
*
- * is_module_text_address() as well as the kprobe slots
- * and is_bpf_text_address() require RCU to be watching.
+ * is_module_text_address() as well as the kprobe slots,
+ * is_bpf_text_address() and is_bpf_image_address() require
+ * RCU to be watching.
*/
no_rcu = !rcu_is_watching();
@@ -148,6 +149,8 @@ int kernel_text_address(unsigned long addr)
goto out;
if (is_bpf_text_address(addr))
goto out;
+ if (is_bpf_image_address(addr))
+ goto out;
ret = 0;
out:
if (no_rcu)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e5ef4ae9edb5..19e793aa441a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -703,6 +703,7 @@ struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
u32 sig;
+ enum pid_type type;
};
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
@@ -712,10 +713,10 @@ static void do_bpf_send_signal(struct irq_work *entry)
struct send_signal_irq_work *work;
work = container_of(entry, struct send_signal_irq_work, irq_work);
- group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
+ group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
}
-BPF_CALL_1(bpf_send_signal, u32, sig)
+static int bpf_send_signal_common(u32 sig, enum pid_type type)
{
struct send_signal_irq_work *work = NULL;
@@ -748,11 +749,17 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
*/
work->task = current;
work->sig = sig;
+ work->type = type;
irq_work_queue(&work->irq_work);
return 0;
}
- return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
+ return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
+}
+
+BPF_CALL_1(bpf_send_signal, u32, sig)
+{
+ return bpf_send_signal_common(sig, PIDTYPE_TGID);
}
static const struct bpf_func_proto bpf_send_signal_proto = {
@@ -762,6 +769,18 @@ static const struct bpf_func_proto bpf_send_signal_proto = {
.arg1_type = ARG_ANYTHING,
};
+BPF_CALL_1(bpf_send_signal_thread, u32, sig)
+{
+ return bpf_send_signal_common(sig, PIDTYPE_PID);
+}
+
+static const struct bpf_func_proto bpf_send_signal_thread_proto = {
+ .func = bpf_send_signal_thread,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -822,6 +841,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
+ case BPF_FUNC_send_signal_thread:
+ return &bpf_send_signal_thread_proto;
default:
return NULL;
}