From 0c93b7d85d40b690f04786ea0f18798b73182e4f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Mar 2016 12:06:51 -0400 Subject: bpf: reject invalid names right in ->lookup() ... and other methods won't see them at all Signed-off-by: Al Viro --- kernel/bpf/inode.c | 37 ++++++++----------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index f2ece3c174a5..35d21c189bb0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -119,18 +119,10 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) return 0; } -static bool bpf_dname_reserved(const struct dentry *dentry) -{ - return strchr(dentry->d_name.name, '.'); -} - static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - if (bpf_dname_reserved(dentry)) - return -EPERM; - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -152,9 +144,6 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, { struct inode *inode; - if (bpf_dname_reserved(dentry)) - return -EPERM; - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -187,31 +176,21 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } -static int bpf_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) +static struct dentry * +bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { - if (bpf_dname_reserved(new_dentry)) - return -EPERM; - - return simple_link(old_dentry, dir, new_dentry); -} - -static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - if (bpf_dname_reserved(new_dentry)) - return -EPERM; - - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + if (strchr(dentry->d_name.name, '.')) + return ERR_PTR(-EPERM); + return simple_lookup(dir, dentry, flags); } static const struct inode_operations bpf_dir_iops = { - .lookup = simple_lookup, + .lookup = bpf_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, - .rename = bpf_rename, - .link = bpf_link, + .rename = simple_rename, + .link = simple_link, .unlink = simple_unlink, }; -- cgit From 9940d67c93b5bb7ddcf862b41b1847cb728186c4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Apr 2016 18:43:27 -0700 Subject: bpf: support bpf_get_stackid() and bpf_perf_event_output() in tracepoint programs needs two wrapper functions to fetch 'struct pt_regs *' to convert tracepoint bpf context into kprobe bpf context to reuse existing helper functions Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/stackmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 499d9e933f8e..35114725cf30 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -116,7 +116,7 @@ free_smap: return ERR_PTR(err); } -static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) { struct pt_regs *regs = (struct pt_regs *) (long) r1; struct bpf_map *map = (struct bpf_map *) (long) r2; -- cgit From 32bbe0078afe86a8bf4c67c6b3477781b15e94dc Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Apr 2016 18:43:28 -0700 Subject: bpf: sanitize bpf tracepoint access during bpf program loading remember the last byte of ctx access and at the time of attaching the program to tracepoint check that the program doesn't access bytes beyond defined in tracepoint fields This also disallows access to __dynamic_array fields, but can be relaxed in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e08f8e9b771..58792fed5678 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, enum bpf_access_type t) { if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t)) + env->prog->aux->ops->is_valid_access(off, size, t)) { + /* remember the offset of last byte accessed in ctx */ + if (env->prog->aux->max_ctx_offset < off + size) + env->prog->aux->max_ctx_offset = off + size; return 0; + } verbose("invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; -- cgit From 07016151a446d25397b24588df4ed5cf777a69bb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 5 Apr 2016 22:33:17 +0200 Subject: bpf, verifier: further improve search pruning The verifier needs to go through every path of the program in order to check that it terminates safely, which can be quite a lot of instructions that need to be processed f.e. in cases with more branchy programs. With search pruning from f1bca824dabb ("bpf: add search pruning optimization to verifier") the search space can already be reduced significantly when the verifier detects that a previously walked path with same register and stack contents terminated already (see verifier's states_equal()), so the search can skip walking those states. When working with larger programs of > ~2000 (out of max 4096) insns, we found that the current limit of 32k instructions is easily hit. For example, a case we ran into is that the search space cannot be pruned due to branches at the beginning of the program that make use of certain stack space slots (STACK_MISC), which are never used in the remaining program (STACK_INVALID). Therefore, the verifier needs to walk paths for the slots in STACK_INVALID state, but also all remaining paths with a stack structure, where the slots are in STACK_MISC, which can nearly double the search space needed. After various experiments, we find that a limit of 64k processed insns is a more reasonable choice when dealing with larger programs in practice. This still allows to reject extreme crafted cases that can have a much higher complexity (f.e. > ~300k) within the 4096 insns limit due to search pruning not being able to take effect. Furthermore, we found that a lot of states can be pruned after a call instruction, f.e. we were able to reduce the search state by ~35% in some cases with this heuristic, trade-off is to keep a bit more states in env->explored_states. Usually, call instructions have a number of preceding register assignments and/or stack stores, where search pruning has a better chance to suceed in states_equal() test. The current code marks the branch targets with STATE_LIST_MARK in case of conditional jumps, and the next (t + 1) instruction in case of unconditional jump so that f.e. a backjump will walk it. We also did experiments with using t + insns[t].off + 1 as a marker in the unconditionally jump case instead of t + 1 with the rationale that these two branches of execution that converge after the label might have more potential of pruning. We found that it was a bit better, but not necessarily significantly better than the current state, perhaps also due to clang not generating back jumps often. Hence, we left that as is for now. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 58792fed5678..8233021538d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -202,6 +202,9 @@ struct verifier_env { bool allow_ptr_leaks; }; +#define BPF_COMPLEXITY_LIMIT_INSNS 65536 +#define BPF_COMPLEXITY_LIMIT_STACK 1024 + /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ @@ -454,7 +457,7 @@ static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, elem->next = env->head; env->head = elem; env->stack_size++; - if (env->stack_size > 1024) { + if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose("BPF program is too complex\n"); goto err; } @@ -1543,6 +1546,8 @@ peek_stack: goto peek_stack; else if (ret < 0) goto err_free; + if (t + 1 < insn_cnt) + env->explored_states[t + 1] = STATE_LIST_MARK; } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -1747,7 +1752,7 @@ static int do_check(struct verifier_env *env) insn = &insns[insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > 32768) { + if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose("BPF program is too large. Proccessed %d insn\n", insn_processed); return -E2BIG; -- cgit From 4923ec0b10d998349c2ac4b38aa4674e539e6f92 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Apr 2016 19:39:21 -0700 Subject: bpf: simplify verifier register state assignments verifier is using the following structure to track the state of registers: struct reg_state { enum bpf_reg_type type; union { int imm; struct bpf_map *map_ptr; }; }; and later on in states_equal() does memcmp(&old->regs[i], &cur->regs[i],..) to find equivalent states. Throughout the code of verifier there are assignements to 'imm' and 'map_ptr' fields and it's not obvious that most of the assignments into 'imm' don't need to clear extra 4 bytes (like mark_reg_unknown_value() does) to make sure that memcmp doesn't go over junk left from 'map_ptr' assignment. Simplify the code by converting 'int' into 'long' Suggested-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8233021538d3..6c5d7cd4cb0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -142,7 +142,7 @@ struct reg_state { enum bpf_reg_type type; union { /* valid when type == CONST_IMM | PTR_TO_STACK */ - int imm; + long imm; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL @@ -263,7 +263,7 @@ static void print_verifier_state(struct verifier_env *env) continue; verbose(" R%d=%s", i, reg_type_str[t]); if (t == CONST_IMM || t == PTR_TO_STACK) - verbose("%d", env->cur_state.regs[i].imm); + verbose("%ld", env->cur_state.regs[i].imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose("(ks=%d,vs=%d)", @@ -480,7 +480,6 @@ static void init_reg_state(struct reg_state *regs) for (i = 0; i < MAX_BPF_REG; i++) { regs[i].type = NOT_INIT; regs[i].imm = 0; - regs[i].map_ptr = NULL; } /* frame pointer */ @@ -495,7 +494,6 @@ static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; regs[regno].imm = 0; - regs[regno].map_ptr = NULL; } enum reg_arg_type { -- cgit From 33ff9823c569f3aceb071071914919177a6bed6a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 13 Apr 2016 00:10:50 +0200 Subject: bpf, verifier: add bpf_call_arg_meta for passing meta data Currently, when the verifier checks calls in check_call() function, we call check_func_arg() for all 5 arguments e.g. to make sure expected types are correct. In some cases, we collect meta data (here: map pointer) to perform additional checks such as checking stack boundary on key/value sizes for subsequent arguments. As we're going to extend the meta data, add a generic struct bpf_call_arg_meta that we can use for passing into check_func_arg(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6c5d7cd4cb0e..202f8f738542 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -205,6 +205,10 @@ struct verifier_env { #define BPF_COMPLEXITY_LIMIT_INSNS 65536 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +struct bpf_call_arg_meta { + struct bpf_map *map_ptr; +}; + /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ @@ -822,7 +826,8 @@ static int check_stack_boundary(struct verifier_env *env, int regno, } static int check_func_arg(struct verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, struct bpf_map **mapp) + enum bpf_arg_type arg_type, + struct bpf_call_arg_meta *meta) { struct reg_state *reg = env->cur_state.regs + regno; enum bpf_reg_type expected_type; @@ -875,14 +880,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - *mapp = reg->map_ptr; - + meta->map_ptr = reg->map_ptr; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized */ - if (!*mapp) { + if (!meta->map_ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. Otherwise it means @@ -891,19 +895,19 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->key_size, + err = check_stack_boundary(env, regno, meta->map_ptr->key_size, false); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ - if (!*mapp) { + if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->value_size, - false); + err = check_stack_boundary(env, regno, + meta->map_ptr->value_size, false); } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); @@ -954,8 +958,8 @@ static int check_call(struct verifier_env *env, int func_id) struct verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct reg_state *regs = state->regs; - struct bpf_map *map = NULL; struct reg_state *reg; + struct bpf_call_arg_meta meta; int i, err; /* find function prototype */ @@ -978,20 +982,22 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } + memset(&meta, 0, sizeof(meta)); + /* check args */ - err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta); if (err) return err; @@ -1013,18 +1019,18 @@ static int check_call(struct verifier_env *env, int func_id) * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ - if (map == NULL) { + if (meta.map_ptr == NULL) { verbose("kernel subsystem misconfigured verifier\n"); return -EINVAL; } - regs[BPF_REG_0].map_ptr = map; + regs[BPF_REG_0].map_ptr = meta.map_ptr; } else { verbose("unknown return type %d of func %d\n", fn->ret_type, func_id); return -EINVAL; } - err = check_map_func_compatibility(map, func_id); + err = check_map_func_compatibility(meta.map_ptr, func_id); if (err) return err; -- cgit From 435faee1aae9c1ac231f89e4faf0437bfe29f425 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 13 Apr 2016 00:10:51 +0200 Subject: bpf, verifier: add ARG_PTR_TO_RAW_STACK type When passing buffers from eBPF stack space into a helper function, we have ARG_PTR_TO_STACK argument type for helpers available. The verifier makes sure that such buffers are initialized, within boundaries, etc. However, the downside with this is that we have a couple of helper functions such as bpf_skb_load_bytes() that fill out the passed buffer in the expected success case anyway, so zero initializing them prior to the helper call is unneeded/wasted instructions in the eBPF program that can be avoided. Therefore, add a new helper function argument type called ARG_PTR_TO_RAW_STACK. The idea is to skip the STACK_MISC check in check_stack_boundary() and color the related stack slots as STACK_MISC after we checked all call arguments. Helper functions using ARG_PTR_TO_RAW_STACK must make sure that every path of the helper function will fill the provided buffer area, so that we cannot leak any uninitialized stack memory. This f.e. means that error paths need to memset() the buffers, but the expected fast-path doesn't have to do this anymore. Since there's no such helper needing more than at most one ARG_PTR_TO_RAW_STACK argument, we can keep it simple and don't need to check for multiple areas. Should in future such a use-case really appear, we have check_raw_mode() that will make sure we implement support for it first. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 202f8f738542..9c843a5417da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -207,6 +207,9 @@ struct verifier_env { struct bpf_call_arg_meta { struct bpf_map *map_ptr; + bool raw_mode; + int regno; + int access_size; }; /* verbose verifier prints what it's seeing @@ -789,7 +792,8 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * and all elements of stack are initialized */ static int check_stack_boundary(struct verifier_env *env, int regno, - int access_size, bool zero_size_allowed) + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) { struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs; @@ -815,6 +819,12 @@ static int check_stack_boundary(struct verifier_env *env, int regno, return -EACCES; } + if (meta && meta->raw_mode) { + meta->access_size = access_size; + meta->regno = regno; + return 0; + } + for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { verbose("invalid indirect read from stack off %d+%d size %d\n", @@ -859,7 +869,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, expected_type = CONST_PTR_TO_MAP; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; - } else if (arg_type == ARG_PTR_TO_STACK) { + } else if (arg_type == ARG_PTR_TO_STACK || + arg_type == ARG_PTR_TO_RAW_STACK) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a CONST_IMM type. Final test @@ -867,6 +878,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, */ if (reg->type == CONST_IMM && reg->imm == 0) expected_type = CONST_IMM; + meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -896,7 +908,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } err = check_stack_boundary(env, regno, meta->map_ptr->key_size, - false); + false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -907,7 +919,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } err = check_stack_boundary(env, regno, - meta->map_ptr->value_size, false); + meta->map_ptr->value_size, + false, NULL); } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); @@ -922,7 +935,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed); + zero_size_allowed, meta); } return err; @@ -953,6 +966,24 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; } +static int check_raw_mode(const struct bpf_func_proto *fn) +{ + int count = 0; + + if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg4_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + count++; + + return count > 1 ? -EINVAL : 0; +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = &env->cur_state; @@ -984,6 +1015,15 @@ static int check_call(struct verifier_env *env, int func_id) memset(&meta, 0, sizeof(meta)); + /* We only support one arg being in raw mode at the moment, which + * is sufficient for the helper functions we have right now. + */ + err = check_raw_mode(fn); + if (err) { + verbose("kernel subsystem misconfigured func %d\n", func_id); + return err; + } + /* check args */ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) @@ -1001,6 +1041,15 @@ static int check_call(struct verifier_env *env, int func_id) if (err) return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset + * is inferred from register state. + */ + for (i = 0; i < meta.access_size; i++) { + err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); + if (err) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { reg = regs + caller_saved[i]; -- cgit From 074f528eed408b467516e142fa4c45e5b0d2ba16 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 13 Apr 2016 00:10:52 +0200 Subject: bpf: convert relevant helper args to ARG_PTR_TO_RAW_STACK This patch converts all helpers that can use ARG_PTR_TO_RAW_STACK as argument type. For tc programs this is bpf_skb_load_bytes(), bpf_skb_get_tunnel_key(), bpf_skb_get_tunnel_opt(). For tracing, this optimizes bpf_get_current_comm() and bpf_probe_read(). The check in bpf_skb_load_bytes() for MAX_BPF_STACK can also be removed since the verifier already makes sure we stay within bounds on stack buffers. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/helpers.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 50da680c479f..ad7a0573f71b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -163,17 +163,26 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) struct task_struct *task = current; char *buf = (char *) (long) r1; - if (!task) - return -EINVAL; + if (unlikely(!task)) + goto err_clear; - strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strncpy(buf, task->comm, size); + + /* Verifier guarantees that size > 0. For task->comm exceeding + * size, guarantee that buf is %NUL-terminated. Unconditionally + * done here to save the size test. + */ + buf[size - 1] = 0; return 0; +err_clear: + memset(buf, 0, size); + return -EINVAL; } const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, + .arg1_type = ARG_PTR_TO_RAW_STACK, .arg2_type = ARG_CONST_STACK_SIZE, }; -- cgit From bd570ff970a54df653b48ed0cfb373f2ebed083d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 18 Apr 2016 21:01:24 +0200 Subject: bpf: add event output helper for notifications/sampling/logging This patch adds a new helper for cls/act programs that can push events to user space applications. For networking, this can be f.e. for sampling, debugging, logging purposes or pushing of arbitrary wake-up events. The idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output() helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example"). The eBPF program utilizes a perf event array map that user space populates with fds from perf_event_open(), the eBPF program calls into the helper f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw)) so that the raw data is pushed into the fd f.e. at the map index of the current CPU. User space can poll/mmap/etc on this and has a data channel for receiving events that can be post-processed. The nice thing is that since the eBPF program and user space application making use of it are tightly coupled, they can define their own arbitrary raw data format and what/when they want to push. While f.e. packet headers could be one part of the meta data that is being pushed, this is not a substitute for things like packet sockets as whole packet is not being pushed and push is only done in a single direction. Intention is more of a generically usable, efficient event pipe to applications. Workflow is that tc can pin the map and applications can attach themselves e.g. after cls/act setup to one or multiple map slots, demuxing is done by the eBPF program. Adding this facility is with minimal effort, it reuses the helper introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper") and we get its functionality for free by overloading its BPF_FUNC_ identifier for cls/act programs, ctx is currently unused, but will be made use of in future. Example will be added to iproute2's BPF example files. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index be0abf669ced..e4248fe79513 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -764,14 +764,21 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; + const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; + const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { return NULL; } +const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) +{ + return NULL; +} + /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { .func = NULL, -- cgit From 1a0dc1ac1d2928e25739ee82d7e04423b01da563 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 5 May 2016 19:49:09 -0700 Subject: bpf: cleanup verifier code cleanup verifier code and prepare it for addition of "pointer to packet" logic Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 100 ++++++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 47 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 63554b6d4e25..afeb62808902 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -249,28 +249,30 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; -static void print_verifier_state(struct verifier_env *env) +static void print_verifier_state(struct verifier_state *state) { + struct reg_state *reg; enum bpf_reg_type t; int i; for (i = 0; i < MAX_BPF_REG; i++) { - t = env->cur_state.regs[i].type; + reg = &state->regs[i]; + t = reg->type; if (t == NOT_INIT) continue; verbose(" R%d=%s", i, reg_type_str[t]); if (t == CONST_IMM || t == PTR_TO_STACK) - verbose("%ld", env->cur_state.regs[i].imm); + verbose("%ld", reg->imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose("(ks=%d,vs=%d)", - env->cur_state.regs[i].map_ptr->key_size, - env->cur_state.regs[i].map_ptr->value_size); + reg->map_ptr->key_size, + reg->map_ptr->value_size); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (env->cur_state.stack_slot_type[i] == STACK_SPILL) + if (state->stack_slot_type[i] == STACK_SPILL) verbose(" fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); + reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } verbose("\n"); } @@ -686,10 +688,11 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, int value_regno) { struct verifier_state *state = &env->cur_state; + struct reg_state *reg = &state->regs[regno]; int size, err = 0; - if (state->regs[regno].type == PTR_TO_STACK) - off += state->regs[regno].imm; + if (reg->type == PTR_TO_STACK) + off += reg->imm; size = bpf_size_to_bytes(bpf_size); if (size < 0) @@ -700,7 +703,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, return -EACCES; } - if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); @@ -710,7 +713,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); - } else if (state->regs[regno].type == PTR_TO_CTX) { + } else if (reg->type == PTR_TO_CTX) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into ctx\n", value_regno); @@ -720,8 +723,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); - } else if (state->regs[regno].type == FRAME_PTR || - state->regs[regno].type == PTR_TO_STACK) { + } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; @@ -739,7 +741,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, } } else { verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[state->regs[regno].type]); + regno, reg_type_str[reg->type]); return -EACCES; } return err; @@ -1104,7 +1106,7 @@ static int check_call(struct verifier_env *env, int func_id) /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct reg_state *regs = env->cur_state.regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1193,8 +1195,6 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... */ - bool stack_relative = false; - if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { verbose("BPF_ALU uses reserved fields\n"); @@ -1232,11 +1232,19 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } } + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + dst_reg = ®s[insn->dst_reg]; + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && - regs[insn->dst_reg].type == FRAME_PTR && - BPF_SRC(insn->code) == BPF_K) { - stack_relative = true; + dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { + dst_reg->type = PTR_TO_STACK; + dst_reg->imm = insn->imm; + return 0; } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer arithmetic prohibited\n", insn->dst_reg); @@ -1248,15 +1256,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EACCES; } - /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP); - if (err) - return err; - - if (stack_relative) { - regs[insn->dst_reg].type = PTR_TO_STACK; - regs[insn->dst_reg].imm = insn->imm; - } + /* mark dest operand */ + mark_reg_unknown_value(regs, insn->dst_reg); } return 0; @@ -1265,7 +1266,7 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) static int check_cond_jmp_op(struct verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct reg_state *regs = env->cur_state.regs; + struct reg_state *regs = env->cur_state.regs, *dst_reg; struct verifier_state *other_branch; u8 opcode = BPF_OP(insn->code); int err; @@ -1303,11 +1304,12 @@ static int check_cond_jmp_op(struct verifier_env *env, if (err) return err; + dst_reg = ®s[insn->dst_reg]; + /* detect if R == 0 where R was initialized to zero earlier */ if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && - regs[insn->dst_reg].type == CONST_IMM && - regs[insn->dst_reg].imm == insn->imm) { + dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) { if (opcode == BPF_JEQ) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through @@ -1329,9 +1331,8 @@ static int check_cond_jmp_op(struct verifier_env *env, /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && - insn->imm == 0 && (opcode == BPF_JEQ || - opcode == BPF_JNE) && - regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && + dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (opcode == BPF_JEQ) { /* next fallthrough insn can access memory via * this register @@ -1366,7 +1367,7 @@ static int check_cond_jmp_op(struct verifier_env *env, } } if (log_level) - print_verifier_state(env); + print_verifier_state(&env->cur_state); return 0; } @@ -1444,14 +1445,14 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) int i, err; if (!may_access_skb(env->prog->type)) { - verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); + verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_ABS uses reserved fields\n"); + verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -1712,17 +1713,21 @@ err_free: */ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) { + struct reg_state *rold, *rcur; int i; for (i = 0; i < MAX_BPF_REG; i++) { - if (memcmp(&old->regs[i], &cur->regs[i], - sizeof(old->regs[0])) != 0) { - if (old->regs[i].type == NOT_INIT || - (old->regs[i].type == UNKNOWN_VALUE && - cur->regs[i].type != NOT_INIT)) - continue; - return false; - } + rold = &old->regs[i]; + rcur = &cur->regs[i]; + + if (memcmp(rold, rcur, sizeof(*rold)) == 0) + continue; + + if (rold->type == NOT_INIT || + (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) + continue; + + return false; } for (i = 0; i < MAX_BPF_STACK; i++) { @@ -1844,7 +1849,7 @@ static int do_check(struct verifier_env *env) if (log_level && do_print_state) { verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env); + print_verifier_state(&env->cur_state); do_print_state = false; } @@ -2056,6 +2061,7 @@ process_bpf_exit: insn_idx++; } + verbose("processed %d insns\n", insn_processed); return 0; } -- cgit From 969bf05eb3cedd5a8d4b7c346a85c2ede87a6d6d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 5 May 2016 19:49:10 -0700 Subject: bpf: direct packet access Extended BPF carried over two instructions from classic to access packet data: LD_ABS and LD_IND. They're highly optimized in JITs, but due to their design they have to do length check for every access. When BPF is processing 20M packets per second single LD_ABS after JIT is consuming 3% cpu. Hence the need to optimize it further by amortizing the cost of 'off < skb_headlen' over multiple packet accesses. One option is to introduce two new eBPF instructions LD_ABS_DW and LD_IND_DW with similar usage as skb_header_pointer(). The kernel part for interpreter and x64 JIT was implemented in [1], but such new insns behave like old ld_abs and abort the program with 'return 0' if access is beyond linear data. Such hidden control flow is hard to workaround plus changing JITs and rolling out new llvm is incovenient. Therefore allow cls_bpf/act_bpf program access skb->data directly: int bpf_prog(struct __sk_buff *skb) { struct iphdr *ip; if (skb->data + sizeof(struct iphdr) + ETH_HLEN > skb->data_end) /* packet too small */ return 0; ip = skb->data + ETH_HLEN; /* access IP header fields with direct loads */ if (ip->version != 4 || ip->saddr == 0x7f000001) return 1; [...] } This solution avoids introduction of new instructions. llvm stays the same and all JITs stay the same, but verifier has to work extra hard to prove safety of the above program. For XDP the direct store instructions can be allowed as well. The skb->data is NET_IP_ALIGNED, so for common cases the verifier can check the alignment. The complex packet parsers where packet pointer is adjusted incrementally cannot be tracked for alignment, so allow byte access in such cases and misaligned access on architectures that define efficient_unaligned_access [1] https://git.kernel.org/cgit/linux/kernel/git/ast/bpf.git/?h=ld_abs_dw Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 5 + kernel/bpf/verifier.c | 441 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 438 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e4248fe79513..d781b077431f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -794,6 +794,11 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog) { } +bool __weak bpf_helper_changes_skb_data(void *func) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index afeb62808902..6338c61fc2a1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -136,13 +137,32 @@ enum bpf_reg_type { FRAME_PTR, /* reg == frame_pointer */ PTR_TO_STACK, /* reg == frame_pointer + imm */ CONST_IMM, /* constant integer value */ + + /* PTR_TO_PACKET represents: + * skb->data + * skb->data + imm + * skb->data + (u16) var + * skb->data + (u16) var + imm + * if (range > 0) then [ptr, ptr + range - off) is safe to access + * if (id > 0) means that some 'var' was added + * if (off > 0) menas that 'imm' was added + */ + PTR_TO_PACKET, + PTR_TO_PACKET_END, /* skb->data + headlen */ }; struct reg_state { enum bpf_reg_type type; union { - /* valid when type == CONST_IMM | PTR_TO_STACK */ - long imm; + /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ + s64 imm; + + /* valid when type == PTR_TO_PACKET* */ + struct { + u32 id; + u16 off; + u16 range; + }; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL @@ -247,6 +267,8 @@ static const char * const reg_type_str[] = { [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", [CONST_IMM] = "imm", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_END] = "pkt_end", }; static void print_verifier_state(struct verifier_state *state) @@ -262,7 +284,12 @@ static void print_verifier_state(struct verifier_state *state) continue; verbose(" R%d=%s", i, reg_type_str[t]); if (t == CONST_IMM || t == PTR_TO_STACK) - verbose("%ld", reg->imm); + verbose("%lld", reg->imm); + else if (t == PTR_TO_PACKET) + verbose("(id=%d,off=%d,r=%d)", + reg->id, reg->off, reg->range); + else if (t == UNKNOWN_VALUE && reg->imm) + verbose("%lld", reg->imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose("(ks=%d,vs=%d)", @@ -548,6 +575,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: + case PTR_TO_PACKET: + case PTR_TO_PACKET_END: case FRAME_PTR: case CONST_PTR_TO_MAP: return true; @@ -647,6 +676,27 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, return 0; } +#define MAX_PACKET_OFF 0xffff + +static int check_packet_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *reg = ®s[regno]; + int linear_size = (int) reg->range - (int) reg->off; + + if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) { + verbose("verifier bug\n"); + return -EFAULT; + } + if (off < 0 || off + size > linear_size) { + verbose("invalid access to packet, off=%d size=%d, allowed=%d\n", + off, size, linear_size); + return -EACCES; + } + return 0; +} + /* check access to 'struct bpf_context' fields */ static int check_ctx_access(struct verifier_env *env, int off, int size, enum bpf_access_type t) @@ -677,6 +727,45 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } +static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, + int off, int size) +{ + if (reg->type != PTR_TO_PACKET) { + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } else { + return 0; + } + } + + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + break; + default: + verbose("verifier is misconfigured\n"); + return -EACCES; + } + + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + /* misaligned access to packet is ok on x86,arm,arm64 */ + return 0; + + if (reg->id && size != 1) { + verbose("Unknown packet alignment. Only byte-sized access allowed\n"); + return -EACCES; + } + + /* skb->data is NET_IP_ALIGN-ed */ + if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + verbose("misaligned packet access off %d+%d+%d size %d\n", + NET_IP_ALIGN, reg->off, off, size); + return -EACCES; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -698,10 +787,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (size < 0) return size; - if (off % size != 0) { - verbose("misaligned access off %d size %d\n", off, size); - return -EACCES; - } + err = check_ptr_alignment(env, reg, off, size); + if (err) + return err; if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && @@ -720,8 +808,16 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, return -EACCES; } err = check_ctx_access(env, off, size, t); - if (!err && t == BPF_READ && value_regno >= 0) + if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value(state->regs, value_regno); + if (off == offsetof(struct __sk_buff, data) && + env->allow_ptr_leaks) + /* note that reg.[id|off|range] == 0 */ + state->regs[value_regno].type = PTR_TO_PACKET; + else if (off == offsetof(struct __sk_buff, data_end) && + env->allow_ptr_leaks) + state->regs[value_regno].type = PTR_TO_PACKET_END; + } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { if (off >= 0 || off < -MAX_BPF_STACK) { @@ -739,11 +835,28 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, } else { err = check_stack_read(state, off, size, value_regno); } + } else if (state->regs[regno].type == PTR_TO_PACKET) { + if (t == BPF_WRITE) { + verbose("cannot write into packet\n"); + return -EACCES; + } + err = check_packet_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); return -EACCES; } + + if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks && + state->regs[value_regno].type == UNKNOWN_VALUE) { + /* 1 or 2 byte load zero-extends, determine the number of + * zero upper bits. Not doing it fo 4 byte load, since + * such values cannot be added to ptr_to_packet anyway. + */ + state->regs[value_regno].imm = 64 - size * 8; + } return err; } @@ -1001,6 +1114,29 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } +static void clear_all_pkt_pointers(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs, *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].type == PTR_TO_PACKET || + regs[i].type == PTR_TO_PACKET_END) + mark_reg_unknown_value(regs, i); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + reg = &state->spilled_regs[i / BPF_REG_SIZE]; + if (reg->type != PTR_TO_PACKET && + reg->type != PTR_TO_PACKET_END) + continue; + reg->type = UNKNOWN_VALUE; + reg->imm = 0; + } +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = &env->cur_state; @@ -1008,6 +1144,7 @@ static int check_call(struct verifier_env *env, int func_id) struct reg_state *regs = state->regs; struct reg_state *reg; struct bpf_call_arg_meta meta; + bool changes_data; int i, err; /* find function prototype */ @@ -1030,6 +1167,8 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } + changes_data = bpf_helper_changes_skb_data(fn->func); + memset(&meta, 0, sizeof(meta)); /* We only support one arg being in raw mode at the moment, which @@ -1100,6 +1239,189 @@ static int check_call(struct verifier_env *env, int func_id) if (err) return err; + if (changes_data) + clear_all_pkt_pointers(env); + return 0; +} + +static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct reg_state *src_reg = ®s[insn->src_reg]; + s32 imm; + + if (BPF_SRC(insn->code) == BPF_K) { + /* pkt_ptr += imm */ + imm = insn->imm; + +add_imm: + if (imm <= 0) { + verbose("addition of negative constant to packet pointer is not allowed\n"); + return -EACCES; + } + if (imm >= MAX_PACKET_OFF || + imm + dst_reg->off >= MAX_PACKET_OFF) { + verbose("constant %d is too large to add to packet pointer\n", + imm); + return -EACCES; + } + /* a constant was added to pkt_ptr. + * Remember it while keeping the same 'id' + */ + dst_reg->off += imm; + } else { + if (src_reg->type == CONST_IMM) { + /* pkt_ptr += reg where reg is known constant */ + imm = src_reg->imm; + goto add_imm; + } + /* disallow pkt_ptr += reg + * if reg is not uknown_value with guaranteed zero upper bits + * otherwise pkt_ptr may overflow and addition will become + * subtraction which is not allowed + */ + if (src_reg->type != UNKNOWN_VALUE) { + verbose("cannot add '%s' to ptr_to_packet\n", + reg_type_str[src_reg->type]); + return -EACCES; + } + if (src_reg->imm < 48) { + verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n", + src_reg->imm); + return -EACCES; + } + /* dst_reg stays as pkt_ptr type and since some positive + * integer value was added to the pointer, increment its 'id' + */ + dst_reg->id++; + + /* something was added to pkt_ptr, set range and off to zero */ + dst_reg->off = 0; + dst_reg->range = 0; + } + return 0; +} + +static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = ®s[insn->dst_reg]; + u8 opcode = BPF_OP(insn->code); + s64 imm_log2; + + /* for type == UNKNOWN_VALUE: + * imm > 0 -> number of zero upper bits + * imm == 0 -> don't track which is the same as all bits can be non-zero + */ + + if (BPF_SRC(insn->code) == BPF_X) { + struct reg_state *src_reg = ®s[insn->src_reg]; + + if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && + dst_reg->imm && opcode == BPF_ADD) { + /* dreg += sreg + * where both have zero upper bits. Adding them + * can only result making one more bit non-zero + * in the larger value. + * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) + * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) + */ + dst_reg->imm = min(dst_reg->imm, src_reg->imm); + dst_reg->imm--; + return 0; + } + if (src_reg->type == CONST_IMM && src_reg->imm > 0 && + dst_reg->imm && opcode == BPF_ADD) { + /* dreg += sreg + * where dreg has zero upper bits and sreg is const. + * Adding them can only result making one more bit + * non-zero in the larger value. + */ + imm_log2 = __ilog2_u64((long long)src_reg->imm); + dst_reg->imm = min(dst_reg->imm, 63 - imm_log2); + dst_reg->imm--; + return 0; + } + /* all other cases non supported yet, just mark dst_reg */ + dst_reg->imm = 0; + return 0; + } + + /* sign extend 32-bit imm into 64-bit to make sure that + * negative values occupy bit 63. Note ilog2() would have + * been incorrect, since sizeof(insn->imm) == 4 + */ + imm_log2 = __ilog2_u64((long long)insn->imm); + + if (dst_reg->imm && opcode == BPF_LSH) { + /* reg <<= imm + * if reg was a result of 2 byte load, then its imm == 48 + * which means that upper 48 bits are zero and shifting this reg + * left by 4 would mean that upper 44 bits are still zero + */ + dst_reg->imm -= insn->imm; + } else if (dst_reg->imm && opcode == BPF_MUL) { + /* reg *= imm + * if multiplying by 14 subtract 4 + * This is conservative calculation of upper zero bits. + * It's not trying to special case insn->imm == 1 or 0 cases + */ + dst_reg->imm -= imm_log2 + 1; + } else if (opcode == BPF_AND) { + /* reg &= imm */ + dst_reg->imm = 63 - imm_log2; + } else if (dst_reg->imm && opcode == BPF_ADD) { + /* reg += imm */ + dst_reg->imm = min(dst_reg->imm, 63 - imm_log2); + dst_reg->imm--; + } else if (opcode == BPF_RSH) { + /* reg >>= imm + * which means that after right shift, upper bits will be zero + * note that verifier already checked that + * 0 <= imm < 64 for shift insn + */ + dst_reg->imm += insn->imm; + if (unlikely(dst_reg->imm > 64)) + /* some dumb code did: + * r2 = *(u32 *)mem; + * r2 >>= 32; + * and all bits are zero now */ + dst_reg->imm = 64; + } else { + /* all other alu ops, means that we don't know what will + * happen to the value, mark it with unknown number of zero bits + */ + dst_reg->imm = 0; + } + + if (dst_reg->imm < 0) { + /* all 64 bits of the register can contain non-zero bits + * and such value cannot be added to ptr_to_packet, since it + * may overflow, mark it as unknown to avoid further eval + */ + dst_reg->imm = 0; + } + return 0; +} + +static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct reg_state *src_reg = ®s[insn->src_reg]; + u8 opcode = BPF_OP(insn->code); + + /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. + * Don't care about overflow or negative values, just add them + */ + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) + dst_reg->imm += insn->imm; + else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) + dst_reg->imm += src_reg->imm; + else + mark_reg_unknown_value(regs, insn->dst_reg); return 0; } @@ -1245,6 +1567,21 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) dst_reg->type = PTR_TO_STACK; dst_reg->imm = insn->imm; return 0; + } else if (opcode == BPF_ADD && + BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == PTR_TO_PACKET) { + /* ptr_to_packet += K|X */ + return check_packet_ptr_add(env, insn); + } else if (BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == UNKNOWN_VALUE && + env->allow_ptr_leaks) { + /* unknown += K|X */ + return evaluate_reg_alu(env, insn); + } else if (BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == CONST_IMM && + env->allow_ptr_leaks) { + /* reg_imm += K|X */ + return evaluate_reg_imm_alu(env, insn); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer arithmetic prohibited\n", insn->dst_reg); @@ -1263,6 +1600,34 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return 0; } +static void find_good_pkt_pointers(struct verifier_env *env, + struct reg_state *dst_reg) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs, *reg; + int i; + /* r2 = r3; + * r2 += 8 + * if (r2 > pkt_end) goto somewhere + * r2 == dst_reg, pkt_end == src_reg, + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) + * so that range of bytes [r3, r3 + 8) is safe to access + */ + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + regs[i].range = dst_reg->off; + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + reg = &state->spilled_regs[i / BPF_REG_SIZE]; + if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + reg->range = dst_reg->off; + } +} + static int check_cond_jmp_op(struct verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -1346,6 +1711,10 @@ static int check_cond_jmp_op(struct verifier_env *env, regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = 0; } + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + find_good_pkt_pointers(env, dst_reg); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; @@ -1685,6 +2054,58 @@ err_free: return ret; } +/* the following conditions reduce the number of explored insns + * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet + */ +static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) +{ + if (old->id != cur->id) + return false; + + /* old ptr_to_packet is more conservative, since it allows smaller + * range. Ex: + * old(off=0,r=10) is equal to cur(off=0,r=20), because + * old(off=0,r=10) means that with range=10 the verifier proceeded + * further and found no issues with the program. Now we're in the same + * spot with cur(off=0,r=20), so we're safe too, since anything further + * will only be looking at most 10 bytes after this pointer. + */ + if (old->off == cur->off && old->range < cur->range) + return true; + + /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0) + * since both cannot be used for packet access and safe(old) + * pointer has smaller off that could be used for further + * 'if (ptr > data_end)' check + * Ex: + * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean + * that we cannot access the packet. + * The safe range is: + * [ptr, ptr + range - off) + * so whenever off >=range, it means no safe bytes from this pointer. + * When comparing old->off <= cur->off, it means that older code + * went with smaller offset and that offset was later + * used to figure out the safe range after 'if (ptr > data_end)' check + * Say, 'old' state was explored like: + * ... R3(off=0, r=0) + * R4 = R3 + 20 + * ... now R4(off=20,r=0) <-- here + * if (R4 > data_end) + * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access. + * ... the code further went all the way to bpf_exit. + * Now the 'cur' state at the mark 'here' has R4(off=30,r=0). + * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier + * goes further, such cur_R4 will give larger safe packet range after + * 'if (R4 > data_end)' and all further insn were already good with r=20, + * so they will be good with r=30 and we can prune the search. + */ + if (old->off <= cur->off && + old->off >= old->range && cur->off >= cur->range) + return true; + + return false; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -1727,6 +2148,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) continue; + if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && + compare_ptrs_to_packet(rold, rcur)) + continue; + return false; } -- cgit From 735b433397ea2f97d59240cbe4ea770aa7c88eef Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 5 May 2016 19:49:11 -0700 Subject: bpf: improve verifier state equivalence since UNKNOWN_VALUE type is weaker than CONST_IMM we can un-teach verifier its recognition of constants in conditional branches without affecting safety. Ex: if (reg == 123) { .. here verifier was marking reg->type as CONST_IMM instead keep reg as UNKNOWN_VALUE } Two verifier states with UNKNOWN_VALUE are equivalent, whereas CONST_IMM_X != CONST_IMM_Y, since CONST_IMM is used for stack range verification and other cases. So help search pruning by marking registers as UNKNOWN_VALUE where possible instead of CONST_IMM. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6338c61fc2a1..84bff68cf80e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1704,12 +1704,11 @@ static int check_cond_jmp_op(struct verifier_env *env, */ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; /* branch targer cannot access it, since reg == 0 */ - other_branch->regs[insn->dst_reg].type = CONST_IMM; - other_branch->regs[insn->dst_reg].imm = 0; + mark_reg_unknown_value(other_branch->regs, + insn->dst_reg); } else { other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = 0; + mark_reg_unknown_value(regs, insn->dst_reg); } } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && @@ -1718,22 +1717,6 @@ static int check_cond_jmp_op(struct verifier_env *env, } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; - } else if (BPF_SRC(insn->code) == BPF_K && - (opcode == BPF_JEQ || opcode == BPF_JNE)) { - - if (opcode == BPF_JEQ) { - /* detect if (R == imm) goto - * and in the target state recognize that R = imm - */ - other_branch->regs[insn->dst_reg].type = CONST_IMM; - other_branch->regs[insn->dst_reg].imm = insn->imm; - } else { - /* detect if (R != imm) goto - * and in the fall-through state recognize that R = imm - */ - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = insn->imm; - } } if (log_level) print_verifier_state(&env->cur_state); -- cgit From 4936e3528e3e272c567fe4ff0abb7ce3e1500575 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:26 +0200 Subject: bpf: minor cleanups in ebpf code Besides others, remove redundant comments where the code is self documenting enough, and properly indent various bpf_verifier_ops and bpf_prog_type_list declarations. Moreover, remove two exports that actually have no module user. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d781b077431f..5313d09d4b62 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -129,14 +129,12 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, return fp; } -EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { kfree(fp->aux); vfree(fp); } -EXPORT_SYMBOL_GPL(__bpf_prog_free); #ifdef CONFIG_BPF_JIT struct bpf_binary_header * -- cgit From c237ee5eb33bf19fe0591c04ff8db19da7323a83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:30 +0200 Subject: bpf: add bpf_patch_insn_single helper Move the functionality to patch instructions out of the verifier code and into the core as the new bpf_patch_insn_single() helper will be needed later on for blinding as well. No changes in functionality. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 53 +++++++------------------------------- 2 files changed, 80 insertions(+), 44 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5313d09d4b62..49b5538a5301 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,77 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } +static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + /* Call and Exit are both special jumps with no + * target inside the BPF instruction image. + */ + BPF_OP(insn->code) != BPF_CALL && + BPF_OP(insn->code) != BPF_EXIT; +} + +static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 i, insn_cnt = prog->len; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_is_jmp_and_has_target(insn)) + continue; + + /* Adjust offset of jmps if we cross boundaries. */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) + insn->off -= delta; + } +} + +struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, + const struct bpf_insn *patch, u32 len) +{ + u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; + struct bpf_prog *prog_adj; + + /* Since our patchlet doesn't expand the image, we're done. */ + if (insn_delta == 0) { + memcpy(prog->insnsi + off, patch, sizeof(*patch)); + return prog; + } + + insn_adj_cnt = prog->len + insn_delta; + + /* Several new instructions need to be inserted. Make room + * for them. Likely, there's no need for a new allocation as + * last page could have large enough tailroom. + */ + prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), + GFP_USER); + if (!prog_adj) + return NULL; + + prog_adj->len = insn_adj_cnt; + + /* Patching happens in 3 steps: + * + * 1) Move over tail of insnsi from next instruction onwards, + * so we can patch the single target insn with one or more + * new ones (patching is always from 1 to n insns, n > 0). + * 2) Inject new instructions at the target location. + * 3) Adjust branch offsets if necessary. + */ + insn_rest = insn_adj_cnt - off - len; + + memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, + sizeof(*patch) * insn_rest); + memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); + + bpf_adj_branches(prog_adj, off, insn_delta); + + return prog_adj; +} + #ifdef CONFIG_BPF_JIT struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 84bff68cf80e..a08d66215245 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2587,26 +2587,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } -static void adjust_branches(struct bpf_prog *prog, int pos, int delta) -{ - struct bpf_insn *insn = prog->insnsi; - int insn_cnt = prog->len; - int i; - - for (i = 0; i < insn_cnt; i++, insn++) { - if (BPF_CLASS(insn->code) != BPF_JMP || - BPF_OP(insn->code) == BPF_CALL || - BPF_OP(insn->code) == BPF_EXIT) - continue; - - /* adjust offset of jmps if necessary */ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; - } -} - /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -2616,14 +2596,15 @@ static int convert_ctx_accesses(struct verifier_env *env) int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; - u32 cnt; - int i; enum bpf_access_type type; + int i; if (!env->prog->aux->ops->convert_ctx_access) return 0; for (i = 0; i < insn_cnt; i++, insn++) { + u32 insn_delta, cnt; + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) type = BPF_READ; else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) @@ -2645,34 +2626,18 @@ static int convert_ctx_accesses(struct verifier_env *env) return -EINVAL; } - if (cnt == 1) { - memcpy(insn, insn_buf, sizeof(*insn)); - continue; - } - - /* several new insns need to be inserted. Make room for them */ - insn_cnt += cnt - 1; - new_prog = bpf_prog_realloc(env->prog, - bpf_prog_size(insn_cnt), - GFP_USER); + new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); if (!new_prog) return -ENOMEM; - new_prog->len = insn_cnt; - - memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, - sizeof(*insn) * (insn_cnt - i - cnt)); - - /* copy substitute insns in place of load instruction */ - memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); - - /* adjust branches in the whole program */ - adjust_branches(new_prog, i, cnt - 1); + insn_delta = cnt - 1; /* keep walking new program and skip insns we just inserted */ env->prog = new_prog; - insn = new_prog->insnsi + i + cnt - 1; - i += cnt - 1; + insn = new_prog->insnsi + i + insn_delta; + + insn_cnt += insn_delta; + i += insn_delta; } return 0; -- cgit From d1c55ab5e41fcd72cb0a8bef86d3f652ad9ad9f5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:31 +0200 Subject: bpf: prepare bpf_int_jit_compile/bpf_prog_select_runtime apis Since the blinding is strictly only called from inside eBPF JITs, we need to change signatures for bpf_int_jit_compile() and bpf_prog_select_runtime() first in order to prepare that the eBPF program we're dealing with can change underneath. Hence, for call sites, we need to return the latest prog. No functional change in this patch. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 18 ++++++++++++++---- kernel/bpf/syscall.c | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 49b5538a5301..70f0821aca47 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -761,15 +761,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program + * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via BPF_PROG_RUN() macro. */ -int bpf_prog_select_runtime(struct bpf_prog *fp) +struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { fp->bpf_func = (void *) __bpf_prog_run; - bpf_int_jit_compile(fp); + /* eBPF JITs can rewrite the program in case constant + * blinding is active. However, in case of error during + * blinding, bpf_int_jit_compile() must always return a + * valid program, which in this case would simply not + * be JITed, but falls back to the interpreter. + */ + fp = bpf_int_jit_compile(fp); bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -777,7 +784,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp) * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. */ - return bpf_check_tail_call(fp); + *err = bpf_check_tail_call(fp); + + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -859,8 +868,9 @@ const struct bpf_func_proto bpf_tail_call_proto = { }; /* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ -void __weak bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { + return prog; } bool __weak bpf_helper_changes_skb_data(void *func) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf5e9f7ad13a..46ecce4b79ed 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -762,7 +762,7 @@ static int bpf_prog_load(union bpf_attr *attr) fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ - err = bpf_prog_select_runtime(prog); + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; -- cgit From 4f3446bb809f20ad56cadf712e6006815ae7a8f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 May 2016 19:08:32 +0200 Subject: bpf: add generic constant blinding for use in jits This work adds a generic facility for use from eBPF JIT compilers that allows for further hardening of JIT generated images through blinding constants. In response to the original work on BPF JIT spraying published by Keegan McAllister [1], most BPF JITs were changed to make images read-only and start at a randomized offset in the page, where the rest was filled with trap instructions. We have this nowadays in x86, arm, arm64 and s390 JIT compilers. Additionally, later work also made eBPF interpreter images read only for kernels supporting DEBUG_SET_MODULE_RONX, that is, x86, arm, arm64 and s390 archs as well currently. This is done by default for mentioned JITs when JITing is enabled. Furthermore, we had a generic and configurable constant blinding facility on our todo for quite some time now to further make spraying harder, and first implementation since around netconf 2016. We found that for systems where untrusted users can load cBPF/eBPF code where JIT is enabled, start offset randomization helps a bit to make jumps into crafted payload harder, but in case where larger programs that cross page boundary are injected, we again have some part of the program opcodes at a page start offset. With improved guessing and more reliable payload injection, chances can increase to jump into such payload. Elena Reshetova recently wrote a test case for it [2, 3]. Moreover, eBPF comes with 64 bit constants, which can leave some more room for payloads. Note that for all this, additional bugs in the kernel are still required to make the jump (and of course to guess right, to not jump into a trap) and naturally the JIT must be enabled, which is disabled by default. For helping mitigation, the general idea is to provide an option bpf_jit_harden that admins can tweak along with bpf_jit_enable, so that for cases where JIT should be enabled for performance reasons, the generated image can be further hardened with blinding constants for unpriviledged users (bpf_jit_harden == 1), with trading off performance for these, but not for privileged ones. We also added the option of blinding for all users (bpf_jit_harden == 2), which is quite helpful for testing f.e. with test_bpf.ko. There are no further e.g. hardening levels of bpf_jit_harden switch intended, rationale is to have it dead simple to use as on/off. Since this functionality would need to be duplicated over and over for JIT compilers to use, which are already complex enough, we provide a generic eBPF byte-code level based blinding implementation, which is then just transparently JITed. JIT compilers need to make only a few changes to integrate this facility and can be migrated one by one. This option is for eBPF JITs and will be used in x86, arm64, s390 without too much effort, and soon ppc64 JITs, thus that native eBPF can be blinded as well as cBPF to eBPF migrations, so that both can be covered with a single implementation. The rule for JITs is that bpf_jit_blind_constants() must be called from bpf_int_jit_compile(), and in case blinding is disabled, we follow normally with JITing the passed program. In case blinding is enabled and we fail during the process of blinding itself, we must return with the interpreter. Similarly, in case the JITing process after the blinding failed, we return normally to the interpreter with the non-blinded code. Meaning, interpreter doesn't change in any way and operates on eBPF code as usual. For doing this pre-JIT blinding step, we need to make use of a helper/auxiliary register, here BPF_REG_AX. This is strictly internal to the JIT and not in any way part of the eBPF architecture. Just like in the same way as JITs internally make use of some helper registers when emitting code, only that here the helper register is one abstraction level higher in eBPF bytecode, but nevertheless in JIT phase. That helper register is needed since f.e. manually written program can issue loads to all registers of eBPF architecture. The core concept with the additional register is: blind out all 32 and 64 bit constants by converting BPF_K based instructions into a small sequence from K_VAL into ((RND ^ K_VAL) ^ RND). Therefore, this is transformed into: BPF_REG_AX := (RND ^ K_VAL), BPF_REG_AX ^= RND, and REG BPF_REG_AX, so actual operation on the target register is translated from BPF_K into BPF_X one that is operating on BPF_REG_AX's content. During rewriting phase when blinding, RND is newly generated via prandom_u32() for each processed instruction. 64 bit loads are split into two 32 bit loads to make translation and patching not too complex. Only basic thing required by JITs is to call the helper bpf_jit_blind_constants()/bpf_jit_prog_release_other() pair, and to map BPF_REG_AX into an unused register. Small bpf_jit_disasm extract from [2] when applied to x86 JIT: echo 0 > /proc/sys/net/core/bpf_jit_harden ffffffffa034f5e9 + : [...] 39: mov $0xa8909090,%eax 3e: mov $0xa8909090,%eax 43: mov $0xa8ff3148,%eax 48: mov $0xa89081b4,%eax 4d: mov $0xa8900bb0,%eax 52: mov $0xa810e0c1,%eax 57: mov $0xa8908eb4,%eax 5c: mov $0xa89020b0,%eax [...] echo 1 > /proc/sys/net/core/bpf_jit_harden ffffffffa034f1e5 + : [...] 39: mov $0xe1192563,%r10d 3f: xor $0x4989b5f3,%r10d 46: mov %r10d,%eax 49: mov $0xb8296d93,%r10d 4f: xor $0x10b9fd03,%r10d 56: mov %r10d,%eax 59: mov $0x8c381146,%r10d 5f: xor $0x24c7200e,%r10d 66: mov %r10d,%eax 69: mov $0xeb2a830e,%r10d 6f: xor $0x43ba02ba,%r10d 76: mov %r10d,%eax 79: mov $0xd9730af,%r10d 7f: xor $0xa5073b1f,%r10d 86: mov %r10d,%eax 89: mov $0x9a45662b,%r10d 8f: xor $0x325586ea,%r10d 96: mov %r10d,%eax [...] As can be seen, original constants that carry payload are hidden when enabled, actual operations are transformed from constant-based to register-based ones, making jumps into constants ineffective. Above extract/example uses single BPF load instruction over and over, but of course all instructions with constants are blinded. Performance wise, JIT with blinding performs a bit slower than just JIT and faster than interpreter case. This is expected, since we still get all the performance benefits from JITing and in normal use-cases not every single instruction needs to be blinded. Summing up all 296 test cases averaged over multiple runs from test_bpf.ko suite, interpreter was 55% slower than JIT only and JIT with blinding was 8% slower than JIT only. Since there are also some extremes in the test suite, I expect for ordinary workloads that the performance for the JIT with blinding case is even closer to JIT only case, f.e. nmap test case from suite has averaged timings in ns 29 (JIT), 35 (+ blinding), and 151 (interpreter). BPF test suite, seccomp test suite, eBPF sample code and various bigger networking eBPF programs have been tested with this and were running fine. For testing purposes, I also adapted interpreter and redirected blinded eBPF image to interpreter and also here all tests pass. [1] http://mainisusuallyafunction.blogspot.com/2012/11/attacking-hardened-linux-systems-with.html [2] https://github.com/01org/jit-spray-poc-for-ksp/ [3] http://www.openwall.com/lists/kernel-hardening/2016/05/03/5 Signed-off-by: Daniel Borkmann Reviewed-by: Elena Reshetova Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 70f0821aca47..f1e8a0def99b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -243,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { module_memfree(hdr); } + +int bpf_jit_harden __read_mostly; + +static int bpf_jit_blind_insn(const struct bpf_insn *from, + const struct bpf_insn *aux, + struct bpf_insn *to_buff) +{ + struct bpf_insn *to = to_buff; + u32 imm_rnd = prandom_u32(); + s16 off; + + BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + + if (from->imm == 0 && + (from->code == (BPF_ALU | BPF_MOV | BPF_K) || + from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { + *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); + goto out; + } + + switch (from->code) { + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MOV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_MUL | BPF_K: + case BPF_ALU64 | BPF_MOV | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_K: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. */ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); + break; + + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IMM | BPF_DW: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); + break; + case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); + break; + + case BPF_ST | BPF_MEM | BPF_DW: + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); + break; + } +out: + return to - to_buff; +} + +static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + + /* aux->prog still points to the fp_other one, so + * when promoting the clone to the real program, + * this still needs to be adapted. + */ + memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); + } + + return fp; +} + +static void bpf_prog_clone_free(struct bpf_prog *fp) +{ + /* aux was stolen by the other clone, so we cannot free + * it from this path! It will be freed eventually by the + * other program on release. + * + * At this point, we don't need a deferred release since + * clone is guaranteed to not be locked. + */ + fp->aux = NULL; + __bpf_prog_free(fp); +} + +void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) +{ + /* We have to repoint aux->prog to self, as we don't + * know whether fp here is the clone or the original. + */ + fp->aux->prog = fp; + bpf_prog_clone_free(fp_other); +} + +struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) +{ + struct bpf_insn insn_buff[16], aux[2]; + struct bpf_prog *clone, *tmp; + int insn_delta, insn_cnt; + struct bpf_insn *insn; + int i, rewritten; + + if (!bpf_jit_blinding_enabled()) + return prog; + + clone = bpf_prog_clone_create(prog, GFP_USER); + if (!clone) + return ERR_PTR(-ENOMEM); + + insn_cnt = clone->len; + insn = clone->insnsi; + + for (i = 0; i < insn_cnt; i++, insn++) { + /* We temporarily need to hold the original ld64 insn + * so that we can still access the first part in the + * second blinding run. + */ + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && + insn[1].code == 0) + memcpy(aux, insn, sizeof(aux)); + + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + if (!rewritten) + continue; + + tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); + if (!tmp) { + /* Patching may have repointed aux->prog during + * realloc from the original one, so we need to + * fix it up here on error. + */ + bpf_jit_prog_release_other(prog, clone); + return ERR_PTR(-ENOMEM); + } + + clone = tmp; + insn_delta = rewritten - 1; + + /* Walk new program and skip insns we just inserted. */ + insn = clone->insnsi + i + insn_delta; + insn_cnt += insn_delta; + i += insn_delta; + } + + return clone; +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, -- cgit From b7552e1bccbe3da9c8e7386c6188e8ea4667c8e7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 18 May 2016 14:14:28 +0200 Subject: bpf: rather use get_random_int for randomizations Start address randomization and blinding in BPF currently use prandom_u32(). prandom_u32() values are not exposed to unpriviledged user space to my knowledge, but given other kernel facilities such as ASLR, stack canaries, etc make use of stronger get_random_int(), we better make use of it here as well given blinding requests successively new random values. get_random_int() has minimal entropy pool depletion, is not cryptographically secure, but doesn't need to be for our use cases here. Suggested-by: Hannes Frederic Sowa Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f1e8a0def99b..b94a36550591 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -231,7 +231,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->pages = size / PAGE_SIZE; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = (prandom_u32() % hole) & ~(alignment - 1); + start = (get_random_int() % hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@ -251,7 +251,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, struct bpf_insn *to_buff) { struct bpf_insn *to = to_buff; - u32 imm_rnd = prandom_u32(); + u32 imm_rnd = get_random_int(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); -- cgit From e27f4a942a0ee4b84567a3c6cfa84f273e55cbb7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 May 2016 17:22:48 -0500 Subject: bpf: Use mount_nodev not mount_ns to mount the bpf filesystem While reviewing the filesystems that set FS_USERNS_MOUNT I spotted the bpf filesystem. Looking at the code I saw a broken usage of mount_ns with current->nsproxy->mnt_ns. As the code does not acquire a reference to the mount namespace it can not possibly be correct to store the mount namespace on the superblock as it does. Replace mount_ns with mount_nodev so that each mount of the bpf filesystem returns a distinct instance, and the code is not buggy. In discussion with Hannes Frederic Sowa it was reported that the use of mount_ns was an attempt to have one bpf instance per mount namespace, in an attempt to keep resources that pin resources from hiding. That intent simply does not work, the vfs is not built to allow that kind of behavior. Which means that the bpf filesystem really is buggy both semantically and in it's implemenation as it does not nor can it implement the original intent. This change is userspace visible, but my experience with similar filesystems leads me to believe nothing will break with a model of each mount of the bpf filesystem is distinct from all others. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Cc: Hannes Frederic Sowa Acked-by: Daniel Borkmann Signed-off-by: "Eric W. Biederman" Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 71b75d9c81da..04be7021f848 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -357,7 +357,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) static struct dentry *bpf_mount(struct file_system_type *type, int flags, const char *dev_name, void *data) { - return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); + return mount_nodev(type, flags, data, bpf_fill_super); } static struct file_system_type bpf_fs_type = { -- cgit From d91b28ed42de99217efb2e8cb0357263d6fb737c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 19 May 2016 18:17:13 -0700 Subject: bpf: support decreasing order in direct packet access when packet headers are accessed in 'decreasing' order (like TCP port may be fetched before the program reads IP src) the llvm may generate the following code: [...] // R7=pkt(id=0,off=22,r=70) r2 = *(u32 *)(r7 +0) // good access [...] r7 += 40 // R7=pkt(id=0,off=62,r=70) r8 = *(u32 *)(r7 +0) // good access [...] r1 = *(u32 *)(r7 -20) // this one will fail though it's within a safe range // it's doing *(u32*)(skb->data + 42) Fix verifier to recognize such code pattern Alos turned out that 'off > range' condition is not a verifier bug. It's a buggy program that may do something like: if (ptr + 50 > data_end) return 0; ptr += 60; *(u32*)ptr; in such case emit "invalid access to packet, off=0 size=4, R1(id=0,off=60,r=50)" error message, so all information is available for the program author to fix the program. Fixes: 969bf05eb3ce ("bpf: direct packet access") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a08d66215245..d54e34874579 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -683,15 +683,11 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, { struct reg_state *regs = env->cur_state.regs; struct reg_state *reg = ®s[regno]; - int linear_size = (int) reg->range - (int) reg->off; - if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) { - verbose("verifier bug\n"); - return -EFAULT; - } - if (off < 0 || off + size > linear_size) { - verbose("invalid access to packet, off=%d size=%d, allowed=%d\n", - off, size, linear_size); + off += reg->off; + if (off < 0 || off + size > reg->range) { + verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } return 0; -- cgit From 1b9b69ecb3a5236d4d3da0f0fa11af916371841e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 19 May 2016 18:17:14 -0700 Subject: bpf: teach verifier to recognize imm += ptr pattern Humans don't write C code like: u8 *ptr = skb->data; int imm = 4; imm += ptr; but from llvm backend point of view 'imm' and 'ptr' are registers and imm += ptr may be preferred vs ptr += imm depending which register value will be used further in the code, while verifier can only recognize ptr += imm. That caused small unrelated changes in the C code of the bpf program to trigger rejection by the verifier. Therefore teach the verifier to recognize both ptr += imm and imm += ptr. For example: when R6=pkt(id=0,off=0,r=62) R7=imm22 after r7 += r6 instruction will be R6=pkt(id=0,off=0,r=62) R7=pkt(id=0,off=22,r=62) Fixes: 969bf05eb3ce ("bpf: direct packet access") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d54e34874579..668e07903c8f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1245,6 +1245,7 @@ static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *regs = env->cur_state.regs; struct reg_state *dst_reg = ®s[insn->dst_reg]; struct reg_state *src_reg = ®s[insn->src_reg]; + struct reg_state tmp_reg; s32 imm; if (BPF_SRC(insn->code) == BPF_K) { @@ -1267,6 +1268,19 @@ add_imm: */ dst_reg->off += imm; } else { + if (src_reg->type == PTR_TO_PACKET) { + /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ + tmp_reg = *dst_reg; /* save r7 state */ + *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */ + src_reg = &tmp_reg; /* pretend it's src_reg state */ + /* if the checks below reject it, the copy won't matter, + * since we're rejecting the whole program. If all ok, + * then imm22 state will be added to r7 + * and r7 will be pkt(id=0,off=22,r=62) while + * r6 will stay as pkt(id=0,off=0,r=62) + */ + } + if (src_reg->type == CONST_IMM) { /* pkt_ptr += reg where reg is known constant */ imm = src_reg->imm; @@ -1565,7 +1579,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return 0; } else if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == PTR_TO_PACKET) { + (dst_reg->type == PTR_TO_PACKET || + (BPF_SRC(insn->code) == BPF_X && + regs[insn->src_reg].type == PTR_TO_PACKET))) { /* ptr_to_packet += K|X */ return check_packet_ptr_add(env, insn); } else if (BPF_CLASS(insn->code) == BPF_ALU64 && -- cgit From 612bacad78ba6d0a91166fc4487af114bac172a8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 22 May 2016 23:16:18 +0200 Subject: bpf, inode: disallow userns mounts Follow-up to commit e27f4a942a0e ("bpf: Use mount_nodev not mount_ns to mount the bpf filesystem"), which removes the FS_USERNS_MOUNT flag. The original idea was to have a per mountns instance instead of a single global fs instance, but that didn't work out and we had to switch to mount_nodev() model. The intent of that middle ground was that we avoid users who don't play nice to create endless instances of bpf fs which are difficult to control and discover from an admin point of view, but at the same time it would have allowed us to be more flexible with regard to namespaces. Therefore, since we now did the switch to mount_nodev() as a fix where individual instances are created, we also need to remove userns mount flag along with it to avoid running into mentioned situation. I don't expect any breakage at this early point in time with removing the flag and we can revisit this later should the requirement for this come up with future users. This and commit e27f4a942a0e have been split to facilitate tracking should any of them run into the unlikely case of causing a regression. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 04be7021f848..318858edb1cd 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -365,7 +365,6 @@ static struct file_system_type bpf_fs_type = { .name = "bpf", .mount = bpf_mount, .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, }; MODULE_ALIAS_FS("bpf"); -- cgit