Diffstat (limited to 'kernel')
 kernel/bpf/arraymap.c    |  55
 kernel/bpf/cgroup.c      |  37
 kernel/bpf/core.c        |  49
 kernel/bpf/hashtab.c     |  21
 kernel/bpf/map_in_map.c  |   5
 kernel/bpf/map_in_map.h  |   1
 kernel/bpf/syscall.c     | 465
 kernel/bpf/verifier.c    | 189
 kernel/events/core.c     |  47
 kernel/trace/bpf_trace.c |  66
 10 files changed, 786 insertions(+), 149 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 172dc8ee0e3b..d771a3872500 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -335,6 +335,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
}
/* only called from syscall */
+int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **elem, *ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ elem = array_map_lookup_elem(map, key);
+ if (elem && (ptr = READ_ONCE(*elem)))
+ *value = map->ops->map_fd_sys_lookup_elem(ptr);
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
@@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr)
bpf_prog_put(ptr);
}
+static u32 prog_fd_array_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_prog *)ptr)->aux->id;
+}
+
/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
@@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
+ .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -452,38 +478,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- const struct perf_event_attr *attr;
struct bpf_event_entry *ee;
struct perf_event *event;
struct file *perf_file;
+ u64 value;
perf_file = perf_event_get(fd);
if (IS_ERR(perf_file))
return perf_file;
+ ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- ee = ERR_PTR(-EINVAL);
-
- attr = perf_event_attrs(event);
- if (IS_ERR(attr) || attr->inherit)
+ if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
goto err_out;
- switch (attr->type) {
- case PERF_TYPE_SOFTWARE:
- if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
- goto err_out;
- /* fall-through */
- case PERF_TYPE_RAW:
- case PERF_TYPE_HARDWARE:
- ee = bpf_event_entry_gen(perf_file, map_file);
- if (ee)
- return ee;
- ee = ERR_PTR(-ENOMEM);
- /* fall-through */
- default:
- break;
- }
-
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
err_out:
fput(perf_file);
return ee;
@@ -599,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
};
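
The new ->map_fd_sys_lookup_elem hook gives BPF_MAP_LOOKUP_ELEM a syscall-side meaning for fd-based arrays: instead of leaking the internal pointer, userspace reads back the ID of the object stored in a slot. A minimal usage sketch (the raw-syscall wrapper sys_bpf() below is ours, not part of this patch):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

/* Read back the prog ID stored in slot idx of a BPF_MAP_TYPE_PROG_ARRAY;
 * for fd maps the syscall-side lookup value is a u32 ID.
 */
static int prog_array_slot_id(int map_fd, __u32 idx, __u32 *prog_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&idx;
	attr.value = (__u64)(unsigned long)prog_id;
	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}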
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ea6033cba947..546113430049 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+
+/**
+ * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
+ * @sk: socket to get cgroup from
+ * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
+ * sk with connection information (IP addresses, etc.). May not contain
+ * cgroup info if it is a req sock.
+ * @type: The type of program to be executed
+ *
+ * socket passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock_ops
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+ struct bpf_sock_ops_kern *sock_ops,
+ enum bpf_attach_type type)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_prog *prog;
+ int ret = 0;
+
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]);
+ if (prog)
+ ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
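
Since the run-filter maps a program return value of 1 to success and everything else to -EPERM, a sock_ops program signals "continue normally" by returning 1. A minimal sketch of such a program, assuming struct bpf_sock_ops from the matching uapi header (the SEC() macro is the usual loader convention, not part of this patch):

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

SEC("sockops")
int noop_sockops(struct bpf_sock_ops *skops)
{
	/* any value != 1 surfaces as -EPERM from
	 * __cgroup_bpf_run_filter_sock_ops()
	 */
	return 1;
}

char _license[] SEC("license") = "GPL";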
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dedf367f59bb..ad5f55922a13 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
*
* Decode and execute eBPF instructions.
*/
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
+ u64 *stack)
{
- u64 stack[MAX_BPF_STACK / sizeof(u64)];
- u64 regs[MAX_BPF_REG], tmp;
+ u64 tmp;
static const void *jumptable[256] = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
@@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
- [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+ [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })
- FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
- ARG1 = (u64) (unsigned long) ctx;
-
select_insn:
goto *jumptable[insn->code];
@@ -1219,7 +1216,39 @@ load_byte:
WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
return 0;
}
-STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
+STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
+
+#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
+#define DEFINE_BPF_PROG_RUN(stack_size) \
+static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
+{ \
+ u64 stack[stack_size / sizeof(u64)]; \
+ u64 regs[MAX_BPF_REG]; \
+\
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+ ARG1 = (u64) (unsigned long) ctx; \
+ return ___bpf_prog_run(regs, insn, stack); \
+}
+
+#define EVAL1(FN, X) FN(X)
+#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
+#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
+#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
+#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
+#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
+
+EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
+
+#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
+
+static unsigned int (*interpreters[])(const void *ctx,
+ const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
@@ -1268,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
- fp->bpf_func = (void *) __bpf_prog_run;
+ u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+
+ fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
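
bpf_prog_select_runtime() now picks one of the sixteen generated interpreter variants, whose scratch stack is the verified depth rounded up to a multiple of 32 bytes, instead of always reserving MAX_BPF_STACK (512) bytes of kernel stack. A sketch of the index math (interp_idx() is ours, for illustration):

/* Mirror the interpreters[] selection above:
 * depth   1..32 -> index 0  -> __bpf_prog_run32()
 * depth  33..64 -> index 1  -> __bpf_prog_run64()
 * depth     512 -> index 15 -> __bpf_prog_run512()
 */
static unsigned int interp_idx(unsigned int stack_depth)
{
	if (stack_depth < 1)			/* max_t(u32, depth, 1) */
		stack_depth = 1;
	return (stack_depth + 31) / 32 - 1;	/* round_up(depth, 32) / 32 - 1 */
}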
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 004334ea13ba..4fb463172aa8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1244,6 +1244,26 @@ static void fd_htab_map_free(struct bpf_map *map)
}
/* only called from syscall */
+int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ ptr = htab_map_lookup_elem(map, key);
+ if (ptr)
+ *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
@@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 59bcdf821ae4..1da574612bea 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr)
*/
bpf_map_put(ptr);
}
+
+u32 bpf_map_fd_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_map *)ptr)->id;
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 177fadb689dc..6183db9ec08c 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
void bpf_map_fd_put_ptr(void *ptr);
+u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 265a0d854e33..18980472f5b0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -22,8 +22,20 @@
#include <linux/filter.h>
#include <linux/version.h>
#include <linux/kernel.h>
+#include <linux/idr.h>
+
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
DEFINE_PER_CPU(int, bpf_prog_active);
+static DEFINE_IDR(prog_idr);
+static DEFINE_SPINLOCK(prog_idr_lock);
+static DEFINE_IDR(map_idr);
+static DEFINE_SPINLOCK(map_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -114,6 +126,37 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map)
free_uid(user);
}
+static int bpf_map_alloc_id(struct bpf_map *map)
+{
+ int id;
+
+ spin_lock_bh(&map_idr_lock);
+ id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ map->id = id;
+ spin_unlock_bh(&map_idr_lock);
+
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+{
+ if (do_idr_lock)
+ spin_lock_bh(&map_idr_lock);
+ else
+ __acquire(&map_idr_lock);
+
+ idr_remove(&map_idr, map->id);
+
+ if (do_idr_lock)
+ spin_unlock_bh(&map_idr_lock);
+ else
+ __release(&map_idr_lock);
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
@@ -135,14 +178,21 @@ static void bpf_map_put_uref(struct bpf_map *map)
/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
*/
-void bpf_map_put(struct bpf_map *map)
+static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
{
if (atomic_dec_and_test(&map->refcnt)) {
+ /* bpf_map_free_id() must be called first */
+ bpf_map_free_id(map, do_idr_lock);
INIT_WORK(&map->work, bpf_map_free_deferred);
schedule_work(&map->work);
}
}
+void bpf_map_put(struct bpf_map *map)
+{
+ __bpf_map_put(map, true);
+}
+
void bpf_map_put_with_uref(struct bpf_map *map)
{
bpf_map_put_uref(map);
@@ -166,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_map *map = filp->private_data;
const struct bpf_array *array;
u32 owner_prog_type = 0;
+ u32 owner_jited = 0;
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
owner_prog_type = array->owner_prog_type;
+ owner_jited = array->owner_jited;
}
seq_printf(m,
@@ -186,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->map_flags,
map->pages * 1ULL << PAGE_SHIFT);
- if (owner_prog_type)
+ if (owner_prog_type) {
seq_printf(m, "owner_prog_type:\t%u\n",
owner_prog_type);
+ seq_printf(m, "owner_jited:\t%u\n",
+ owner_jited);
+ }
}
#endif
@@ -236,11 +291,22 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_nouncharge;
- err = bpf_map_new_fd(map);
- if (err < 0)
- /* failed to allocate fd */
+ err = bpf_map_alloc_id(map);
+ if (err)
goto free_map;
+ err = bpf_map_new_fd(map);
+ if (err < 0) {
+ /* failed to allocate fd.
+ * bpf_map_put() is needed because the above
+ * bpf_map_alloc_id() has published the map
+ * to userspace and userspace may
+ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
+ */
+ bpf_map_put(map);
+ return err;
+ }
+
trace_bpf_map_create(map, err);
return err;
@@ -295,6 +361,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
+/* map_idr_lock should have been held */
+static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
+ bool uref)
+{
+ int refold;
+
+ refold = __atomic_add_unless(&map->refcnt, 1, 0);
+
+ if (refold >= BPF_MAX_REFCNT) {
+ __bpf_map_put(map, false);
+ return ERR_PTR(-EBUSY);
+ }
+
+ if (!refold)
+ return ERR_PTR(-ENOENT);
+
+ if (uref)
+ atomic_inc(&map->usercnt);
+
+ return map;
+}
+
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -335,6 +423,8 @@ static int map_lookup_elem(union bpf_attr *attr)
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else if (IS_FD_MAP(map))
+ value_size = sizeof(u32);
else
value_size = map->value_size;
@@ -350,9 +440,10 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
- map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- err = -ENOTSUPP;
+ } else if (IS_FD_ARRAY(map)) {
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
+ } else if (IS_FD_HASH(map)) {
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
@@ -650,6 +741,42 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
free_uid(user);
}
+static int bpf_prog_alloc_id(struct bpf_prog *prog)
+{
+ int id;
+
+ spin_lock_bh(&prog_idr_lock);
+ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ prog->aux->id = id;
+ spin_unlock_bh(&prog_idr_lock);
+
+ /* id is in [1, INT_MAX) */
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+{
+ /* cBPF to eBPF migrations are currently not in the idr store. */
+ if (!prog->aux->id)
+ return;
+
+ if (do_idr_lock)
+ spin_lock_bh(&prog_idr_lock);
+ else
+ __acquire(&prog_idr_lock);
+
+ idr_remove(&prog_idr, prog->aux->id);
+
+ if (do_idr_lock)
+ spin_unlock_bh(&prog_idr_lock);
+ else
+ __release(&prog_idr_lock);
+}
+
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -659,14 +786,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
bpf_prog_free(aux->prog);
}
-void bpf_prog_put(struct bpf_prog *prog)
+static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
trace_bpf_prog_put_rcu(prog);
+ /* bpf_prog_free_id() must be called first */
+ bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ __bpf_prog_put(prog, true);
+}
EXPORT_SYMBOL_GPL(bpf_prog_put);
static int bpf_prog_release(struct inode *inode, struct file *filp)
@@ -748,6 +882,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc);
+/* prog_idr_lock should have been held */
+static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+{
+ int refold;
+
+ refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);
+
+ if (refold >= BPF_MAX_REFCNT) {
+ __bpf_prog_put(prog, false);
+ return ERR_PTR(-EBUSY);
+ }
+
+ if (!refold)
+ return ERR_PTR(-ENOENT);
+
+ return prog;
+}
+
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
struct fd f = fdget(ufd);
@@ -815,7 +967,9 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
- if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+ type != BPF_PROG_TYPE_CGROUP_SKB &&
+ !capable(CAP_SYS_ADMIN))
return -EPERM;
/* plain bpf_prog allocation */
@@ -855,11 +1009,22 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- err = bpf_prog_new_fd(prog);
- if (err < 0)
- /* failed to allocate fd */
+ err = bpf_prog_alloc_id(prog);
+ if (err)
goto free_used_maps;
+ err = bpf_prog_new_fd(prog);
+ if (err < 0) {
+ /* failed to allocate fd.
+ * bpf_prog_put() is needed because the above
+ * bpf_prog_alloc_id() has published the prog
+ * to userspace and userspace may
+ * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
+ */
+ bpf_prog_put(prog);
+ return err;
+ }
+
bpf_prog_kallsyms_add(prog);
trace_bpf_prog_load(prog, err);
return err;
@@ -919,6 +1084,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET_SOCK_CREATE:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;
+ case BPF_CGROUP_SOCK_OPS:
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
+ break;
default:
return -EINVAL;
}
@@ -959,6 +1127,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
@@ -973,6 +1142,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return ret;
}
+
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -997,6 +1167,237 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
return ret;
}
+#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
+
+static int bpf_obj_get_next_id(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ struct idr *idr,
+ spinlock_t *lock)
+{
+ u32 next_id = attr->start_id;
+ int err = 0;
+
+ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ next_id++;
+ spin_lock_bh(lock);
+ if (!idr_get_next(idr, &next_id))
+ err = -ENOENT;
+ spin_unlock_bh(lock);
+
+ if (!err)
+ err = put_user(next_id, &uattr->next_id);
+
+ return err;
+}
+
+#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ u32 id = attr->prog_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&prog_idr_lock);
+ prog = idr_find(&prog_idr, id);
+ if (prog)
+ prog = bpf_prog_inc_not_zero(prog);
+ else
+ prog = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&prog_idr_lock);
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fd = bpf_prog_new_fd(prog);
+ if (fd < 0)
+ bpf_prog_put(prog);
+
+ return fd;
+}
+
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+
+static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ u32 id = attr->map_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&map_idr_lock);
+ map = idr_find(&map_idr, id);
+ if (map)
+ map = bpf_map_inc_not_zero(map, true);
+ else
+ map = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&map_idr_lock);
+
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ fd = bpf_map_new_fd(map);
+ if (fd < 0)
+ bpf_map_put(map);
+
+ return fd;
+}
+
+static int check_uarg_tail_zero(void __user *uaddr,
+ size_t expected_size,
+ size_t actual_size)
+{
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+ int err;
+
+ if (actual_size <= expected_size)
+ return 0;
+
+ addr = uaddr + expected_size;
+ end = uaddr + actual_size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_prog_info info = {};
+ u32 info_len = attr->info.info_len;
+ char __user *uinsns;
+ u32 ulen;
+ int err;
+
+ err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
+ info.type = prog->type;
+ info.id = prog->aux->id;
+
+ memcpy(info.tag, prog->tag, sizeof(prog->tag));
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ info.jited_prog_len = 0;
+ info.xlated_prog_len = 0;
+ goto done;
+ }
+
+ ulen = info.jited_prog_len;
+ info.jited_prog_len = prog->jited_len;
+ if (info.jited_prog_len && ulen) {
+ uinsns = u64_to_user_ptr(info.jited_prog_insns);
+ ulen = min_t(u32, info.jited_prog_len, ulen);
+ if (copy_to_user(uinsns, prog->bpf_func, ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.xlated_prog_len;
+ info.xlated_prog_len = bpf_prog_size(prog->len);
+ if (info.xlated_prog_len && ulen) {
+ uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+ ulen = min_t(u32, info.xlated_prog_len, ulen);
+ if (copy_to_user(uinsns, prog->insnsi, ulen))
+ return -EFAULT;
+ }
+
+done:
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int bpf_map_get_info_by_fd(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_map_info info = {};
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ info.type = map->map_type;
+ info.id = map->id;
+ info.key_size = map->key_size;
+ info.value_size = map->value_size;
+ info.max_entries = map->max_entries;
+ info.map_flags = map->map_flags;
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
+
+static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ufd = attr->info.bpf_fd;
+ struct fd f;
+ int err;
+
+ if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ if (!f.file)
+ return -EBADFD;
+
+ if (f.file->f_op == &bpf_prog_fops)
+ err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
+ uattr);
+ else if (f.file->f_op == &bpf_map_fops)
+ err = bpf_map_get_info_by_fd(f.file->private_data, attr,
+ uattr);
+ else
+ err = -EINVAL;
+
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -1016,23 +1417,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
* user-space does not rely on any kernel feature
 * extensions we don't know about yet.
*/
- if (size > sizeof(attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
- size = sizeof(attr);
- }
+ err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+ if (err)
+ return err;
+ size = min_t(u32, size, sizeof(attr));
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
if (copy_from_user(&attr, uattr, size) != 0)
@@ -1074,6 +1462,23 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
break;
+ case BPF_PROG_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &prog_idr, &prog_idr_lock);
+ break;
+ case BPF_MAP_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &map_idr, &map_idr_lock);
+ break;
+ case BPF_PROG_GET_FD_BY_ID:
+ err = bpf_prog_get_fd_by_id(&attr);
+ break;
+ case BPF_MAP_GET_FD_BY_ID:
+ err = bpf_map_get_fd_by_id(&attr);
+ break;
+ case BPF_OBJ_GET_INFO_BY_FD:
+ err = bpf_obj_get_info_by_fd(&attr, uattr);
+ break;
default:
err = -EINVAL;
break;
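
Taken together, the new commands let a tool that holds no fds walk every loaded program by ID and query it. A sketch reusing the includes and sys_bpf() wrapper from the earlier example, plus <stdio.h> (needs CAP_SYS_ADMIN):

static void list_progs(void)
{
	union bpf_attr attr;
	struct bpf_prog_info info;
	__u32 id = 0;
	int fd;

	for (;;) {
		memset(&attr, 0, sizeof(attr));
		attr.start_id = id;
		if (sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
			break;			/* ENOENT: no more IDs */
		id = attr.next_id;

		memset(&attr, 0, sizeof(attr));
		attr.prog_id = id;
		fd = sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
		if (fd < 0)
			continue;		/* prog unloaded meanwhile */

		memset(&info, 0, sizeof(info));
		memset(&attr, 0, sizeof(attr));
		attr.info.bpf_fd = fd;
		attr.info.info_len = sizeof(info);
		attr.info.info = (__u64)(unsigned long)&info;
		if (!sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
			printf("prog id %u type %u\n", info.id, info.type);
		close(fd);
	}
}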
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8a725697bed..6a86723c5b64 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -546,20 +546,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
return 0;
}
-static int bpf_size_to_bytes(int bpf_size)
-{
- if (bpf_size == BPF_W)
- return 4;
- else if (bpf_size == BPF_H)
- return 2;
- else if (bpf_size == BPF_B)
- return 1;
- else if (bpf_size == BPF_DW)
- return 8;
- else
- return -EINVAL;
-}
-
static bool is_spillable_regtype(enum bpf_reg_type type)
{
switch (type) {
@@ -758,15 +744,29 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
}
/* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
+ struct bpf_insn_access_aux info = {
+ .reg_type = *reg_type,
+ };
+
/* for analyzer ctx accesses are already validated and converted */
if (env->analyzer_ops)
return 0;
if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
+ env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+ /* A non-zero info.ctx_field_size indicates that this field is a
+ * candidate for later verifier transformation to load the whole
+ * field and then apply a mask when accessed with a narrower
+ * access than the actual ctx access size. A zero info.ctx_field_size
+ * allows only whole-field access and rejects any other
+ * type of narrower access.
+ */
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+ *reg_type = info.reg_type;
+
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
@@ -868,7 +868,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
int bpf_size, enum bpf_access_type t,
int value_regno)
{
@@ -911,7 +911,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
- err = check_ctx_access(env, off, size, t, &reg_type);
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
mark_reg_unknown_value_and_range(state->regs,
value_regno);
@@ -926,6 +926,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
}
+
+ if (env->prog->aux->stack_depth < -off)
+ env->prog->aux->stack_depth = -off;
+
if (t == BPF_WRITE) {
if (!env->allow_ptr_leaks &&
state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
@@ -968,7 +972,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
return err;
}
-static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = env->cur_state.regs;
int err;
@@ -995,13 +999,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether atomic_add can read the memory */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
if (err)
return err;
/* check whether atomic_add can write into the same memory */
- return check_mem_access(env, insn->dst_reg, insn->off,
+ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
@@ -1037,6 +1041,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
+ if (env->prog->aux->stack_depth < -off)
+ env->prog->aux->stack_depth = -off;
+
if (meta && meta->raw_mode) {
meta->access_size = access_size;
meta->regno = regno;
@@ -1344,8 +1351,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
if (reg->type != PTR_TO_PACKET &&
reg->type != PTR_TO_PACKET_END)
continue;
- reg->type = UNKNOWN_VALUE;
- reg->imm = 0;
+ __mark_reg_unknown_value(state->spilled_regs,
+ i / BPF_REG_SIZE);
}
}
@@ -1414,7 +1421,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
* is inferred from register state.
*/
for (i = 0; i < meta.access_size; i++) {
- err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+ err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
if (err)
return err;
}
@@ -1650,6 +1657,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ u8 opcode = BPF_OP(insn->code);
+ s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
+
+ /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
+ if (src_reg->imm > 0 && dst_reg->imm) {
+ switch (opcode) {
+ case BPF_ADD:
+ /* dreg += sreg
+ * where both have zero upper bits. Adding them
+ * can only result in making one more bit non-zero
+ * in the larger value.
+ * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+ * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ dst_reg->imm--;
+ break;
+ case BPF_AND:
+ /* dreg &= sreg
+ * AND can not extend zero bits, it can only shrink
+ * Ex. 0x00..00ffffff
+ * & 0x0f..ffffffff
+ * ----------------
+ * 0x00..00ffffff
+ */
+ dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_OR:
+ /* dreg |= sreg
+ * OR can only extend zero bits
+ * Ex. 0x00..00ffffff
+ * | 0x0f..ffffffff
+ * ----------------
+ * 0x0f..00ffffff
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_SUB:
+ case BPF_MUL:
+ case BPF_RSH:
+ case BPF_LSH:
+ /* These may be flushed out later */
+ default:
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+ } else {
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+
+ dst_reg->type = UNKNOWN_VALUE;
+ return 0;
+}
+
static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
@@ -1659,6 +1725,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
u64 dst_imm = dst_reg->imm;
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
+ return evaluate_reg_imm_alu_unknown(env, insn);
+
/* dst_reg->type == CONST_IMM here. Simulate execution of insns
* containing ALU ops. Don't care about overflow or negative
* values, just add/sub/... them; registers are in u64.
@@ -1950,6 +2019,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = insn->imm;
+ regs[insn->dst_reg].id = 0;
regs[insn->dst_reg].max_value = insn->imm;
regs[insn->dst_reg].min_value = insn->imm;
regs[insn->dst_reg].min_align = calc_align(insn->imm);
@@ -2407,6 +2477,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = imm;
+ regs[insn->dst_reg].id = 0;
return 0;
}
@@ -2826,6 +2897,8 @@ static bool states_equal(struct bpf_verifier_env *env,
return false;
if (i % BPF_REG_SIZE)
continue;
+ if (old->stack_slot_type[i] != STACK_SPILL)
+ continue;
if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
&cur->spilled_regs[i / BPF_REG_SIZE],
sizeof(old->spilled_regs[0])))
@@ -2987,18 +3060,12 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (src_reg + off) is readable,
* the state of dst_reg will be updated by this func
*/
- err = check_mem_access(env, insn->src_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ,
insn->dst_reg);
if (err)
return err;
- if (BPF_SIZE(insn->code) != BPF_W &&
- BPF_SIZE(insn->code) != BPF_DW) {
- insn_idx++;
- continue;
- }
-
prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
if (*prev_src_type == NOT_INIT) {
@@ -3026,7 +3093,7 @@ static int do_check(struct bpf_verifier_env *env)
enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, insn);
+ err = check_xadd(env, insn_idx, insn);
if (err)
return err;
insn_idx++;
@@ -3045,7 +3112,7 @@ static int do_check(struct bpf_verifier_env *env)
dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
insn->src_reg);
if (err)
@@ -3074,7 +3141,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
-1);
if (err)
@@ -3172,7 +3239,8 @@ process_bpf_exit:
insn_idx++;
}
- verbose("processed %d insns\n", insn_processed);
+ verbose("processed %d insns, stack depth %d\n",
+ insn_processed, env->prog->aux->stack_depth);
return 0;
}
@@ -3372,11 +3440,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
struct bpf_prog *new_prog;
enum bpf_access_type type;
- int i, cnt, delta = 0;
+ bool is_narrower_load;
+ u32 target_size;
if (ops->gen_prologue) {
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3416,12 +3486,52 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
- cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
+ size = BPF_LDST_BYTES(insn);
+
+ /* If the read access is a narrower load of the field,
+ * convert to a 4/8-byte load, to minimize program type specific
+ * convert_ctx_access changes. If conversion is successful,
+ * we will apply proper mask to the result.
+ */
+ is_narrower_load = size < ctx_field_size;
+ if (is_narrower_load) {
+ u32 off = insn->off;
+ u8 size_code;
+
+ if (type == BPF_WRITE) {
+ verbose("bpf verifier narrow ctx access misconfigured\n");
+ return -EINVAL;
+ }
+
+ size_code = BPF_H;
+ if (ctx_field_size == 4)
+ size_code = BPF_W;
+ else if (ctx_field_size == 8)
+ size_code = BPF_DW;
+
+ insn->off = off & ~(ctx_field_size - 1);
+ insn->code = BPF_LDX | BPF_MEM | size_code;
+ }
+
+ target_size = 0;
+ cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
+ &target_size);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+ (ctx_field_size && !target_size)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
}
+ if (is_narrower_load && size < target_size) {
+ if (ctx_field_size <= 4)
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ else
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ }
+
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -3467,6 +3577,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* the program array.
*/
prog->cb_access = 1;
+ env->prog->aux->stack_depth = MAX_BPF_STACK;
/* mark bpf_tail_call as different opcode to avoid
 * conditional branch in the interpreter for every normal
@@ -3474,7 +3585,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* that doesn't support bpf_tail_call yet
*/
insn->imm = 0;
- insn->code |= BPF_X;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
continue;
}
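
A worked example of the narrow-load rewrite (illustration only, pseudo-BPF): a 1-byte read at the start of a 4-byte ctx field is widened to a full-field load and then masked with the (1 << size * 8) - 1 constant built above:

	before:	r2 = *(u8 *)(r1 + 0)	/* size = 1, ctx_field_size = 4 */
	after:	r2 = *(u32 *)(r1 + 0)	/* insn->off &= ~(4 - 1); BPF_W load */
		r2 &= 0xff		/* BPF_ALU32_IMM(BPF_AND, dst, (1 << 8) - 1) */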
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d2c32f98482..1538df9b2b65 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3632,10 +3632,10 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-u64 perf_event_read_local(struct perf_event *event)
+int perf_event_read_local(struct perf_event *event, u64 *value)
{
unsigned long flags;
- u64 val;
+ int ret = 0;
/*
* Disabling interrupts avoids all counter scheduling (context
@@ -3643,25 +3643,37 @@ u64 perf_event_read_local(struct perf_event *event)
*/
local_irq_save(flags);
- /* If this is a per-task event, it must be for current */
- WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
- event->hw.target != current);
-
- /* If this is a per-CPU event, it must be for this CPU */
- WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id());
-
/*
* It must not be an event with inherit set, we cannot read
* all child counters from atomic context.
*/
- WARN_ON_ONCE(event->attr.inherit);
+ if (event->attr.inherit) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/*
* It must not have a pmu::count method, those are not
* NMI safe.
*/
- WARN_ON_ONCE(event->pmu->count);
+ if (event->pmu->count) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* If this is a per-task event, it must be for current */
+ if ((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ if (!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id()) {
+ ret = -EINVAL;
+ goto out;
+ }
/*
* If the event is currently on this CPU, its either a per-task event,
@@ -3671,10 +3683,11 @@ u64 perf_event_read_local(struct perf_event *event)
if (event->oncpu == smp_processor_id())
event->pmu->read(event);
- val = local64_read(&event->count);
+ *value = local64_read(&event->count);
+out:
local_irq_restore(flags);
- return val;
+ return ret;
}
static int perf_event_read(struct perf_event *event, bool group)
@@ -8049,12 +8062,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
- if (event->attr.type == PERF_TYPE_HARDWARE ||
- event->attr.type == PERF_TYPE_SOFTWARE)
- return perf_event_set_bpf_handler(event, prog_fd);
-
if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
+ return perf_event_set_bpf_handler(event, prog_fd);
if (event->tp_event->prog)
return -EEXIST;
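
perf_event_read_local() now reports unusable events through its return value instead of WARNing, so callers must check before trusting the counter; the bpf_perf_event_read() rework below is the in-tree user. The convention, as a minimal sketch:

	u64 value;
	int err;

	err = perf_event_read_local(event, &value);
	if (err)	/* inherit set, pmu::count, or wrong task/CPU */
		return err;
	/* value is valid only when err == 0 */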
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 460a031c77e5..37385193a608 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
}
/*
- * limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
+ * Only limited trace_printk() conversion specifiers allowed:
+ * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
*/
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
@@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
i++;
}
- if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+ if (fmt[i] != 'i' && fmt[i] != 'd' &&
+ fmt[i] != 'u' && fmt[i] != 'x')
return -EINVAL;
fmt_cnt++;
}
@@ -234,7 +235,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
- struct perf_event *event;
+ u64 value = 0;
+ int err;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -247,21 +249,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- event = ee->event;
- if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
- event->attr.type != PERF_TYPE_RAW))
- return -EINVAL;
-
- /* make sure event is local and doesn't have pmu::count */
- if (unlikely(event->oncpu != cpu || event->pmu->count))
- return -EINVAL;
-
+ err = perf_event_read_local(ee->event, &value);
/*
- * we don't know if the function is run successfully by the
- * return value. It can be judged in other places, such as
- * eBPF programs.
+ * this API is ugly since we miss the [-22..-2] range of valid
+ * counter values, but that's uapi
*/
- return perf_event_read_local(event);
+ if (err)
+ return err;
+ return value;
}
static const struct bpf_func_proto bpf_perf_event_read_proto = {
@@ -272,14 +267,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_raw_record *raw)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
- struct perf_sample_data sample_data;
struct bpf_event_entry *ee;
struct perf_event *event;
@@ -300,9 +297,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_sample_data_init(&sample_data, 0, 0);
- sample_data.raw = raw;
- perf_event_output(event, &sample_data, regs);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = raw;
+ perf_event_output(event, sd, regs);
return 0;
}
@@ -483,7 +480,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
@@ -566,7 +563,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
}
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
return false;
@@ -585,40 +582,47 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
};
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
+ const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
+ sample_period);
+
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
- if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
- if (size != sizeof(u64))
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
+ bpf_ctx_record_field_size(info, size_sp);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
return false;
- } else {
+ break;
+ default:
if (size != sizeof(long))
return false;
}
+
return true;
}
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
switch (si->off) {
case offsetof(struct bpf_perf_event_data, sample_period):
- BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
-
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
data), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
- offsetof(struct perf_sample_data, period));
+ bpf_target_off(struct perf_sample_data, period, 8,
+ target_size));
break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,