Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arraymap.c    |  26
-rw-r--r--  kernel/bpf/core.c        |  30
-rw-r--r--  kernel/bpf/hashtab.c     |   4
-rw-r--r--  kernel/bpf/helpers.c     |   7
-rw-r--r--  kernel/bpf/syscall.c     |  83
-rw-r--r--  kernel/bpf/verifier.c    | 111
-rw-r--r--  kernel/events/core.c     |  15
-rw-r--r--  kernel/irq/manage.c      |   9
-rw-r--r--  kernel/ptrace.c          |   5
-rw-r--r--  kernel/seccomp.c         |  78
-rw-r--r--  kernel/sysctl.c          |  13
-rw-r--r--  kernel/trace/bpf_trace.c |  55
12 files changed, 392 insertions(+), 44 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace107f236..3f4c99e06c6b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
+#include <linux/perf_event.h>
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -48,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
-
+ array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
array->elem_size = elem_size;
return &array->map;
@@ -291,14 +292,23 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
attr = perf_event_attrs(event);
if (IS_ERR(attr))
- return (void *)attr;
+ goto err;
- if (attr->type != PERF_TYPE_RAW &&
- attr->type != PERF_TYPE_HARDWARE) {
- perf_event_release_kernel(event);
- return ERR_PTR(-EINVAL);
- }
- return event;
+ if (attr->inherit)
+ goto err;
+
+ if (attr->type == PERF_TYPE_RAW)
+ return event;
+
+ if (attr->type == PERF_TYPE_HARDWARE)
+ return event;
+
+ if (attr->type == PERF_TYPE_SOFTWARE &&
+ attr->config == PERF_COUNT_SW_BPF_OUTPUT)
+ return event;
+err:
+ perf_event_release_kernel(event);
+ return ERR_PTR(-EINVAL);
}
static void perf_event_fd_array_put_ptr(void *ptr)
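[Editorial note] The rewritten perf_event_fd_array_get_ptr() widens what may be stashed in a BPF_MAP_TYPE_PERF_EVENT_ARRAY: raw and hardware counters as before, plus PERF_TYPE_SOFTWARE events with PERF_COUNT_SW_BPF_OUTPUT, while inherited events are now refused. A minimal userspace sketch of opening one such event for a map slot; the helper name below is illustrative and not part of the patch:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch only: open a CPU-local software event suitable for a perf event
 * array slot; .inherit must stay 0 or the map helper above releases the
 * event and returns -EINVAL.
 */
static int open_bpf_output_event(int cpu)
{
	struct perf_event_attr attr = {
		.size        = sizeof(attr),
		.type        = PERF_TYPE_SOFTWARE,
		.config      = PERF_COUNT_SW_BPF_OUTPUT,
		.sample_type = PERF_SAMPLE_RAW,
	};

	return syscall(__NR_perf_event_open, &attr, -1 /* pid */, cpu,
		       -1 /* group_fd */, 0 /* flags */);
}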
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 67c380cfa9ca..80864712d2c4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (fp == NULL)
return NULL;
+ kmemcheck_annotate_bitfield(fp, meta);
+
aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
if (aux == NULL) {
vfree(fp);
@@ -110,6 +112,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
if (fp != NULL) {
+ kmemcheck_annotate_bitfield(fp, meta);
+
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = size / PAGE_SIZE;
@@ -727,6 +731,32 @@ void bpf_prog_free(struct bpf_prog *fp)
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
+/* RNG for unprivileged user space with separated state from prandom_u32(). */
+static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
+
+void bpf_user_rnd_init_once(void)
+{
+ prandom_init_once(&bpf_user_rnd_state);
+}
+
+u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* Should someone ever have the rather unwise idea to use some
+ * of the registers passed into this function, then note that
+ * this function is called from native eBPF and classic-to-eBPF
+ * transformations. Register assignments from both sides are
+ * different, f.e. classic always sets fn(ctx, A, X) here.
+ */
+ struct rnd_state *state;
+ u32 res;
+
+ state = &get_cpu_var(bpf_user_rnd_state);
+ res = prandom_u32_state(state);
+ put_cpu_var(state);
+
+ return res;
+}
+
/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d9b17a..28592d79502b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -88,6 +88,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8) +
htab->map.value_size;
+
+ htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+ htab->elem_size * htab->map.max_entries,
+ PAGE_SIZE) >> PAGE_SHIFT;
return &htab->map;
free_htab:
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec09421e..4504ca66118d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
.arg2_type = ARG_PTR_TO_MAP_KEY,
};
-static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
- return prandom_u32();
-}
-
const struct bpf_func_proto bpf_get_prandom_u32_proto = {
- .func = bpf_get_prandom_u32,
+ .func = bpf_user_rnd_u32,
.gpl_only = false,
.ret_type = RET_INTEGER,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35bac8e8b071..687dd6ca574d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
#include <linux/filter.h>
#include <linux/version.h>
+int sysctl_unprivileged_bpf_disabled __read_mostly;
+
static LIST_HEAD(bpf_map_types);
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -44,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+static int bpf_map_charge_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ atomic_long_add(map->pages, &user->locked_vm);
+
+ if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+ atomic_long_sub(map->pages, &user->locked_vm);
+ free_uid(user);
+ return -EPERM;
+ }
+ map->user = user;
+ return 0;
+}
+
+static void bpf_map_uncharge_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = map->user;
+
+ atomic_long_sub(map->pages, &user->locked_vm);
+ free_uid(user);
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
+ bpf_map_uncharge_memlock(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -108,6 +137,10 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->refcnt, 1);
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map;
+
err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
if (err < 0)
@@ -402,6 +435,10 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
*/
BUG_ON(!prog->aux->ops->get_func_proto);
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
if (insn->imm == BPF_FUNC_tail_call) {
/* mark bpf_tail_call as different opcode
* to avoid conditional branch in
@@ -436,11 +473,37 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+static int bpf_prog_charge_memlock(struct bpf_prog *prog)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ atomic_long_add(prog->pages, &user->locked_vm);
+ if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+ atomic_long_sub(prog->pages, &user->locked_vm);
+ free_uid(user);
+ return -EPERM;
+ }
+ prog->aux->user = user;
+ return 0;
+}
+
+static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
+{
+ struct user_struct *user = prog->aux->user;
+
+ atomic_long_sub(prog->pages, &user->locked_vm);
+ free_uid(user);
+}
+
static void __prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
free_used_maps(aux);
+ bpf_prog_uncharge_memlock(aux->prog);
bpf_prog_free(aux->prog);
}
@@ -457,6 +520,7 @@ void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
free_used_maps(prog->aux);
+ bpf_prog_uncharge_memlock(prog);
bpf_prog_free(prog);
}
}
@@ -540,11 +604,18 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_nouncharge;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -553,10 +624,10 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;
prog->orig_prog = NULL;
- prog->jited = false;
+ prog->jited = 0;
atomic_set(&prog->aux->refcnt, 1);
- prog->gpl_compatible = is_gpl;
+ prog->gpl_compatible = is_gpl ? 1 : 0;
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
@@ -586,6 +657,8 @@ static int bpf_prog_load(union bpf_attr *attr)
free_used_maps:
free_used_maps(prog->aux);
free_prog:
+ bpf_prog_uncharge_memlock(prog);
+free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
@@ -595,11 +668,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
union bpf_attr attr = {};
int err;
- /* the syscall is limited to root temporarily. This restriction will be
- * lifted when security audit is clean. Note that eBPF+tracing must have
- * this restriction, since it may pass kernel data to user space
- */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
return -EPERM;
if (!access_ok(VERIFY_READ, uattr, 1))
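[Editorial note] Map and program pages are now charged against the caller's RLIMIT_MEMLOCK, so a loader that creates non-trivial maps or programs needs memlock headroom before calling bpf(2). A sketch of the usual rlimit bump on the userspace side; the helper name is illustrative:

#include <sys/resource.h>

/* Sketch: raise RLIMIT_MEMLOCK so bpf_map_charge_memlock() /
 * bpf_prog_charge_memlock() above do not fail with -EPERM.
 * Raising the hard limit requires CAP_SYS_RESOURCE.
 */
static int bump_memlock_rlimit(void)
{
	struct rlimit r = { RLIM_INFINITY, RLIM_INFINITY };

	return setrlimit(RLIMIT_MEMLOCK, &r);
}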
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b074b23000d6..b56cf51f8d42 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
struct verifier_state_list **explored_states; /* search pruning optimization */
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
u32 used_map_cnt; /* number of used maps */
+ bool allow_ptr_leaks;
};
/* verbose verifier prints what it's seeing
@@ -244,6 +245,7 @@ static const struct {
} func_limit[] = {
{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+ {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
};
static void print_verifier_state(struct verifier_env *env)
@@ -538,6 +540,21 @@ static int bpf_size_to_bytes(int bpf_size)
return -EINVAL;
}
+static bool is_spillable_regtype(enum bpf_reg_type type)
+{
+ switch (type) {
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MAP_VALUE_OR_NULL:
+ case PTR_TO_STACK:
+ case PTR_TO_CTX:
+ case FRAME_PTR:
+ case CONST_PTR_TO_MAP:
+ return true;
+ default:
+ return false;
+ }
+}
+
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -550,9 +567,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
*/
if (value_regno >= 0 &&
- (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
- state->regs[value_regno].type == PTR_TO_STACK ||
- state->regs[value_regno].type == PTR_TO_CTX)) {
+ is_spillable_regtype(state->regs[value_regno].type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -643,6 +658,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
return -EACCES;
}
+static bool is_pointer_value(struct verifier_env *env, int regno)
+{
+ if (env->allow_ptr_leaks)
+ return false;
+
+ switch (env->cur_state.regs[regno].type) {
+ case UNKNOWN_VALUE:
+ case CONST_IMM:
+ return false;
+ default:
+ return true;
+ }
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -669,11 +698,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
}
if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into map\n", value_regno);
+ return -EACCES;
+ }
err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
} else if (state->regs[regno].type == PTR_TO_CTX) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into ctx\n", value_regno);
+ return -EACCES;
+ }
err = check_ctx_access(env, off, size, t);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
@@ -684,10 +723,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
}
- if (t == BPF_WRITE)
+ if (t == BPF_WRITE) {
+ if (!env->allow_ptr_leaks &&
+ state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
+ size != BPF_REG_SIZE) {
+ verbose("attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
err = check_stack_write(state, off, size, value_regno);
- else
+ } else {
err = check_stack_read(state, off, size, value_regno);
+ }
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[state->regs[regno].type]);
@@ -775,8 +821,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return -EACCES;
}
- if (arg_type == ARG_ANYTHING)
+ if (arg_type == ARG_ANYTHING) {
+ if (is_pointer_value(env, regno)) {
+ verbose("R%d leaks addr into helper function\n", regno);
+ return -EACCES;
+ }
return 0;
+ }
if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -860,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
* don't allow any other map type to be passed into
* the special func;
*/
- if (bool_map != bool_func)
+ if (bool_func && bool_map != bool_func)
return -EINVAL;
}
@@ -950,8 +1001,9 @@ static int check_call(struct verifier_env *env, int func_id)
}
/* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
{
+ struct reg_state *regs = env->cur_state.regs;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -976,6 +1028,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check dest operand */
err = check_reg_arg(regs, insn->dst_reg, DST_OP);
if (err)
@@ -1012,6 +1070,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
*/
regs[insn->dst_reg] = regs[insn->src_reg];
} else {
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d partial copy of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ }
regs[insn->dst_reg].type = UNKNOWN_VALUE;
regs[insn->dst_reg].map_ptr = NULL;
}
@@ -1061,8 +1124,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
regs[insn->dst_reg].type == FRAME_PTR &&
- BPF_SRC(insn->code) == BPF_K)
+ BPF_SRC(insn->code) == BPF_K) {
stack_relative = true;
+ } else if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ } else if (BPF_SRC(insn->code) == BPF_X &&
+ is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->src_reg);
+ return -EACCES;
+ }
/* check dest operand */
err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1101,6 +1174,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
return err;
+
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d pointer comparison prohibited\n",
+ insn->src_reg);
+ return -EACCES;
+ }
} else {
if (insn->src_reg != BPF_REG_0) {
verbose("BPF_JMP uses reserved fields\n");
@@ -1155,6 +1234,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = 0;
}
+ } else if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ return -EACCES;
} else if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE)) {
@@ -1658,7 +1740,7 @@ static int do_check(struct verifier_env *env)
}
if (class == BPF_ALU || class == BPF_ALU64) {
- err = check_alu_op(regs, insn);
+ err = check_alu_op(env, insn);
if (err)
return err;
@@ -1816,6 +1898,11 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose("R0 leaks addr as return value\n");
+ return -EACCES;
+ }
+
process_bpf_exit:
insn_idx = pop_stack(env, &prev_insn_idx);
if (insn_idx < 0) {
@@ -2024,7 +2111,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
cnt = env->prog->aux->ops->
convert_ctx_access(type, insn->dst_reg, insn->src_reg,
- insn->off, insn_buf);
+ insn->off, insn_buf, env->prog);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
@@ -2144,6 +2231,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;
+ env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
ret = do_check(env);
skip_full_check:
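[Editorial note] With allow_ptr_leaks false (no CAP_SYS_ADMIN), the verifier now rejects anything that could expose a kernel address: corrupting spilled pointers, pointer arithmetic, pointer comparisons, and pointer return values. A tiny program that trips the last of these checks, sketched with the kernel's insn macros from linux/filter.h (the samples carry equivalent userspace macros):

#include <linux/filter.h>

static const struct bpf_insn leak_prog[] = {
	BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),	/* r0 = frame pointer */
	BPF_EXIT_INSN(),			/* unprivileged load is rejected:
						 * "R0 leaks addr as return value" */
};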
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b11756f9b6dc..64754bfecd70 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5286,9 +5286,15 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_RAW) {
if (data->raw) {
- perf_output_put(handle, data->raw->size);
- __output_copy(handle, data->raw->data,
- data->raw->size);
+ u32 raw_size = data->raw->size;
+ u32 real_size = round_up(raw_size + sizeof(u32),
+ sizeof(u64)) - sizeof(u32);
+ u64 zero = 0;
+
+ perf_output_put(handle, real_size);
+ __output_copy(handle, data->raw->data, raw_size);
+ if (real_size - raw_size)
+ __output_copy(handle, &zero, real_size - raw_size);
} else {
struct {
u32 size;
@@ -5420,8 +5426,7 @@ void perf_prepare_sample(struct perf_event_header *header,
else
size += sizeof(u32);
- WARN_ON_ONCE(size & (sizeof(u64)-1));
- header->size += size;
+ header->size += round_up(size, sizeof(u64));
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
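[Editorial note] The PERF_SAMPLE_RAW change pads the payload so that the u32 size header plus data always end on a u64 boundary, instead of warning when they do not. A worked example of the arithmetic:

/* raw_size  = 10                                   (bytes of raw data)
 * real_size = round_up(10 + sizeof(u32), sizeof(u64)) - sizeof(u32)
 *           = round_up(14, 8) - 4 = 16 - 4 = 12
 * => 2 zero bytes are appended, and the record stays u64 aligned:
 *    sizeof(u32) + real_size = 4 + 12 = 16
 */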
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f9a59f6cabd2..4c213864ec45 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1761,6 +1761,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
kfree(__free_percpu_irq(irq, dev_id));
chip_bus_sync_unlock(desc);
}
+EXPORT_SYMBOL_GPL(free_percpu_irq);
/**
* setup_percpu_irq - setup a per-cpu interrupt
@@ -1790,9 +1791,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
* @devname: An ascii name for the claiming device
* @dev_id: A percpu cookie passed back to the handler function
*
- * This call allocates interrupt resources, but doesn't
- * automatically enable the interrupt. It has to be done on each
- * CPU using enable_percpu_irq().
+ * This call allocates interrupt resources and enables the
+ * interrupt on the local CPU. If the interrupt is supposed to be
+ * enabled on other CPUs, it has to be done on each CPU using
+ * enable_percpu_irq().
*
* Dev_id must be globally unique. It is a per-cpu variable, and
* the handler gets called with the interrupted CPU's instance of
@@ -1831,6 +1833,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
+EXPORT_SYMBOL_GPL(request_percpu_irq);
/**
* irq_get_irqchip_state - returns the irqchip state of an interrupt.
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 787320de68e0..b760bae64cf1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1016,6 +1016,11 @@ int ptrace_request(struct task_struct *child, long request,
break;
}
#endif
+
+ case PTRACE_SECCOMP_GET_FILTER:
+ ret = seccomp_get_filter(child, addr, datavp);
+ break;
+
default:
break;
}
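[Editorial note] The new request lets a CAP_SYS_ADMIN tracer that is not itself seccomp-confined dump a stopped tracee's classic filters by index: a NULL data pointer queries the instruction count, a second call copies out the sock_filter array. A hypothetical checkpoint-side sketch; dump_filter() and its error handling are illustrative, and the PTRACE_SECCOMP_GET_FILTER constant comes from the series' uapi ptrace.h update:

#include <sys/types.h>
#include <sys/ptrace.h>
#include <linux/filter.h>
#include <stdlib.h>

/* Assumes the tracee is already attached and stopped. */
static struct sock_filter *dump_filter(pid_t pid, unsigned long idx,
				       long *insn_cnt)
{
	struct sock_filter *insns;

	/* first call: NULL data just returns the filter length */
	*insn_cnt = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, (void *)idx, NULL);
	if (*insn_cnt <= 0)
		return NULL;

	insns = calloc(*insn_cnt, sizeof(*insns));
	if (insns &&
	    ptrace(PTRACE_SECCOMP_GET_FILTER, pid, (void *)idx, insns) < 0) {
		free(insns);
		insns = NULL;
	}
	return insns;
}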
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5bd4779282df..580ac2d4024f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -347,6 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
+ const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -370,7 +371,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(-ENOMEM);
ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
- seccomp_check_filter);
+ seccomp_check_filter, save_orig);
if (ret < 0) {
kfree(sfilter);
return ERR_PTR(ret);
@@ -469,7 +470,7 @@ void get_seccomp_filter(struct task_struct *tsk)
static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
if (filter) {
- bpf_prog_free(filter->prog);
+ bpf_prog_destroy(filter->prog);
kfree(filter);
}
}
@@ -867,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
/* prctl interface doesn't have flags, so they are always zero. */
return do_seccomp(op, 0, uargs);
}
+
+#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
+long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
+ void __user *data)
+{
+ struct seccomp_filter *filter;
+ struct sock_fprog_kern *fprog;
+ long ret;
+ unsigned long count = 0;
+
+ if (!capable(CAP_SYS_ADMIN) ||
+ current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+ return -EACCES;
+ }
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ filter = task->seccomp.filter;
+ while (filter) {
+ filter = filter->prev;
+ count++;
+ }
+
+ if (filter_off >= count) {
+ ret = -ENOENT;
+ goto out;
+ }
+ count -= filter_off;
+
+ filter = task->seccomp.filter;
+ while (filter && count > 1) {
+ filter = filter->prev;
+ count--;
+ }
+
+ if (WARN_ON(count != 1 || !filter)) {
+ /* The filter tree shouldn't shrink while we're using it. */
+ ret = -ENOENT;
+ goto out;
+ }
+
+ fprog = filter->prog->orig_prog;
+ if (!fprog) {
+ /* This must be a new non-cBPF filter, since we save every
+ * cBPF filter's orig_prog above when
+ * CONFIG_CHECKPOINT_RESTORE is enabled.
+ */
+ ret = -EMEDIUMTYPE;
+ goto out;
+ }
+
+ ret = fprog->len;
+ if (!data)
+ goto out;
+
+ get_seccomp_filter(task);
+ spin_unlock_irq(&task->sighand->siglock);
+
+ if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
+ ret = -EFAULT;
+
+ put_seccomp_filter(task);
+ return ret;
+
+out:
+ spin_unlock_irq(&task->sighand->siglock);
+ return ret;
+}
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d8094e..96c856b04081 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
#include <linux/kexec.h>
+#include <linux/bpf.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -1139,6 +1140,18 @@ static struct ctl_table kern_table[] = {
.proc_handler = timer_migration_handler,
},
#endif
+#ifdef CONFIG_BPF_SYSCALL
+ {
+ .procname = "unprivileged_bpf_disabled",
+ .data = &sysctl_unprivileged_bpf_disabled,
+ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one,
+ },
+#endif
{ }
};
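[Editorial note] Since extra1 and extra2 both point at one, proc_dointvec_minmax only accepts the value 1: once kernel.unprivileged_bpf_disabled is set, it stays set until reboot. A minimal sketch of flipping it from a privileged process:

#include <fcntl.h>
#include <unistd.h>

/* One-way switch: writing "1" sticks until reboot; writing "0" back
 * is rejected by the min/max handler above.
 */
static int disable_unprivileged_bpf(void)
{
	int fd = open("/proc/sys/kernel/unprivileged_bpf_disabled", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}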
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7c8803..4228fd3682c3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -199,6 +199,11 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
if (!event)
return -ENOENT;
+ /* make sure event is local and doesn't have pmu::count */
+ if (event->oncpu != smp_processor_id() ||
+ event->pmu->count)
+ return -EINVAL;
+
/*
* we don't know if the function is run successfully by the
* return value. It can be judged in other places, such as
@@ -207,14 +212,58 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
return perf_event_read_local(event);
}
-const struct bpf_func_proto bpf_perf_event_read_proto = {
+static const struct bpf_func_proto bpf_perf_event_read_proto = {
.func = bpf_perf_event_read,
- .gpl_only = false,
+ .gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
};
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+ struct pt_regs *regs = (struct pt_regs *) (long) r1;
+ struct bpf_map *map = (struct bpf_map *) (long) r2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *data = (void *) (long) r4;
+ struct perf_sample_data sample_data;
+ struct perf_event *event;
+ struct perf_raw_record raw = {
+ .size = size,
+ .data = data,
+ };
+
+ if (unlikely(index >= array->map.max_entries))
+ return -E2BIG;
+
+ event = (struct perf_event *)array->ptrs[index];
+ if (unlikely(!event))
+ return -ENOENT;
+
+ if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
+ event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
+ return -EINVAL;
+
+ if (unlikely(event->oncpu != smp_processor_id()))
+ return -EOPNOTSUPP;
+
+ perf_sample_data_init(&sample_data, 0, 0);
+ sample_data.raw = &raw;
+ perf_event_output(event, &sample_data, regs);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto = {
+ .func = bpf_perf_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_STACK,
+ .arg5_type = ARG_CONST_STACK_SIZE,
+};
+
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -242,6 +291,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_perf_event_output_proto;
default:
return NULL;
}
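[Editorial note] Combined with the arraymap and perf changes above, bpf_perf_event_output() lets a kprobe program push arbitrary payloads through a BPF_MAP_TYPE_PERF_EVENT_ARRAY slot that userspace reads as an ordinary perf ring buffer. A minimal program-side sketch modeled on the samples/bpf helpers; map and section names are illustrative:

#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"	/* SEC(), bpf_map_def, helper stubs from samples/bpf */

struct bpf_map_def SEC("maps") events = {
	.type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size    = sizeof(int),
	.value_size  = sizeof(u32),
	.max_entries = 2,	/* one PERF_COUNT_SW_BPF_OUTPUT event per slot */
};

SEC("kprobe/sys_write")
int trace_sys_write(struct pt_regs *ctx)
{
	u64 cookie = 0x12345678;

	/* index 0 assumes userspace stored a CPU-local event in slot 0;
	 * the helper returns -EOPNOTSUPP if the event belongs to another CPU.
	 */
	bpf_perf_event_output(ctx, &events, 0, &cookie, sizeof(cookie));
	return 0;
}

char _license[] SEC("license") = "GPL";	/* both new helpers are gpl_only */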