From 0c93b7d85d40b690f04786ea0f18798b73182e4f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 12:06:51 -0400
Subject: bpf: reject invalid names right in ->lookup()

... and other methods won't see them at all

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/bpf/inode.c | 37 ++++++++-----------------------------
 1 file changed, 8 insertions(+), 29 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index f2ece3c174a5..35d21c189bb0 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -119,18 +119,10 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
 	return 0;
 }
 
-static bool bpf_dname_reserved(const struct dentry *dentry)
-{
-	return strchr(dentry->d_name.name, '.');
-}
-
 static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 
-	if (bpf_dname_reserved(dentry))
-		return -EPERM;
-
 	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -152,9 +144,6 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
 {
 	struct inode *inode;
 
-	if (bpf_dname_reserved(dentry))
-		return -EPERM;
-
 	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -187,31 +176,21 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
 	}
 }
 
-static int bpf_link(struct dentry *old_dentry, struct inode *dir,
-		    struct dentry *new_dentry)
+static struct dentry *
+bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
 {
-	if (bpf_dname_reserved(new_dentry))
-		return -EPERM;
-
-	return simple_link(old_dentry, dir, new_dentry);
-}
-
-static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry)
-{
-	if (bpf_dname_reserved(new_dentry))
-		return -EPERM;
-
-	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (strchr(dentry->d_name.name, '.'))
+		return ERR_PTR(-EPERM);
+	return simple_lookup(dir, dentry, flags);
 }
 
 static const struct inode_operations bpf_dir_iops = {
-	.lookup		= simple_lookup,
+	.lookup		= bpf_lookup,
 	.mknod		= bpf_mkobj,
 	.mkdir		= bpf_mkdir,
 	.rmdir		= simple_rmdir,
-	.rename		= bpf_rename,
-	.link		= bpf_link,
+	.rename		= simple_rename,
+	.link		= simple_link,
 	.unlink		= simple_unlink,
 };
 
-- 
cgit 


From 9940d67c93b5bb7ddcf862b41b1847cb728186c4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:27 -0700
Subject: bpf: support bpf_get_stackid() and bpf_perf_event_output() in
 tracepoint programs

needs two wrapper functions to fetch 'struct pt_regs *' to convert
tracepoint bpf context into kprobe bpf context to reuse existing
helper functions

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/stackmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..35114725cf30 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -116,7 +116,7 @@ free_smap:
 	return ERR_PTR(err);
 }
 
-static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
-- 
cgit 


From 32bbe0078afe86a8bf4c67c6b3477781b15e94dc Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:28 -0700
Subject: bpf: sanitize bpf tracepoint access

during bpf program loading remember the last byte of ctx access
and at the time of attaching the program to tracepoint check that
the program doesn't access bytes beyond those defined in tracepoint fields

This also disallows access to __dynamic_array fields, but can be
relaxed in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e9b771..58792fed5678 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
 			    enum bpf_access_type t)
 {
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t))
+	    env->prog->aux->ops->is_valid_access(off, size, t)) {
+		/* remember the offset of last byte accessed in ctx */
+		if (env->prog->aux->max_ctx_offset < off + size)
+			env->prog->aux->max_ctx_offset = off + size;
 		return 0;
+	}
 
 	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
-- 
cgit 


From 07016151a446d25397b24588df4ed5cf777a69bb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 5 Apr 2016 22:33:17 +0200
Subject: bpf, verifier: further improve search pruning

The verifier needs to go through every path of the program in
order to check that it terminates safely, which can be quite a
lot of instructions that need to be processed f.e. in cases with
more branchy programs. With search pruning from f1bca824dabb ("bpf:
add search pruning optimization to verifier") the search space can
already be reduced significantly when the verifier detects that
a previously walked path with same register and stack contents
terminated already (see verifier's states_equal()), so the search
can skip walking those states.

When working with larger programs of > ~2000 (out of max 4096)
insns, we found that the current limit of 32k instructions is easily
hit. For example, a case we ran into is that the search space cannot
be pruned due to branches at the beginning of the program that make
use of certain stack space slots (STACK_MISC), which are never used
in the remaining program (STACK_INVALID). Therefore, the verifier
needs to walk paths for the slots in STACK_INVALID state, but also
all remaining paths with a stack structure, where the slots are in
STACK_MISC, which can nearly double the search space needed. After
various experiments, we find that a limit of 64k processed insns is
a more reasonable choice when dealing with larger programs in practice.
This still allows to reject extreme crafted cases that can have a
much higher complexity (f.e. > ~300k) within the 4096 insns limit
due to search pruning not being able to take effect.

Furthermore, we found that a lot of states can be pruned after a
call instruction, f.e. we were able to reduce the search state by
~35% in some cases with this heuristic, trade-off is to keep a bit
more states in env->explored_states. Usually, call instructions
have a number of preceding register assignments and/or stack stores,
where search pruning has a better chance to succeed in states_equal()
test. The current code marks the branch targets with STATE_LIST_MARK
in case of conditional jumps, and the next (t + 1) instruction in
case of unconditional jump so that f.e. a backjump will walk it. We
also did experiments with using t + insns[t].off + 1 as a marker in
the unconditional jump case instead of t + 1 with the rationale
that these two branches of execution that converge after the label
might have more potential of pruning. We found that it was a bit
better, but not necessarily significantly better than the current
state, perhaps also due to clang not generating back jumps often.
Hence, we left that as is for now.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 58792fed5678..8233021538d3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -202,6 +202,9 @@ struct verifier_env {
 	bool allow_ptr_leaks;
 };
 
+#define BPF_COMPLEXITY_LIMIT_INSNS	65536
+#define BPF_COMPLEXITY_LIMIT_STACK	1024
+
 /* verbose verifier prints what it's seeing
  * bpf_check() is called under lock, so no race to access these global vars
  */
@@ -454,7 +457,7 @@ static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
 	elem->next = env->head;
 	env->head = elem;
 	env->stack_size++;
-	if (env->stack_size > 1024) {
+	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
 		verbose("BPF program is too complex\n");
 		goto err;
 	}
@@ -1543,6 +1546,8 @@ peek_stack:
 				goto peek_stack;
 			else if (ret < 0)
 				goto err_free;
+			if (t + 1 < insn_cnt)
+				env->explored_states[t + 1] = STATE_LIST_MARK;
 		} else if (opcode == BPF_JA) {
 			if (BPF_SRC(insns[t].code) != BPF_K) {
 				ret = -EINVAL;
@@ -1747,7 +1752,7 @@ static int do_check(struct verifier_env *env)
 		insn = &insns[insn_idx];
 		class = BPF_CLASS(insn->code);
 
-		if (++insn_processed > 32768) {
+		if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
 			verbose("BPF program is too large. Proccessed %d insn\n",
 				insn_processed);
 			return -E2BIG;
-- 
cgit 


From 4923ec0b10d998349c2ac4b38aa4674e539e6f92 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 19:39:21 -0700
Subject: bpf: simplify verifier register state assignments

verifier is using the following structure to track the state of registers:
struct reg_state {
    enum bpf_reg_type type;
    union {
        int imm;
        struct bpf_map *map_ptr;
    };
};
and later on in states_equal() does memcmp(&old->regs[i], &cur->regs[i],..)
to find equivalent states.
Throughout the code of verifier there are assignments to 'imm' and 'map_ptr'
fields and it's not obvious that most of the assignments into 'imm' don't
need to clear extra 4 bytes (like mark_reg_unknown_value() does) to make sure
that memcmp doesn't go over junk left from 'map_ptr' assignment.

Simplify the code by converting 'int' into 'long'

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8233021538d3..6c5d7cd4cb0e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -142,7 +142,7 @@ struct reg_state {
 	enum bpf_reg_type type;
 	union {
 		/* valid when type == CONST_IMM | PTR_TO_STACK */
-		int imm;
+		long imm;
 
 		/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
 		 *   PTR_TO_MAP_VALUE_OR_NULL
@@ -263,7 +263,7 @@ static void print_verifier_state(struct verifier_env *env)
 			continue;
 		verbose(" R%d=%s", i, reg_type_str[t]);
 		if (t == CONST_IMM || t == PTR_TO_STACK)
-			verbose("%d", env->cur_state.regs[i].imm);
+			verbose("%ld", env->cur_state.regs[i].imm);
 		else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
 			 t == PTR_TO_MAP_VALUE_OR_NULL)
 			verbose("(ks=%d,vs=%d)",
@@ -480,7 +480,6 @@ static void init_reg_state(struct reg_state *regs)
 	for (i = 0; i < MAX_BPF_REG; i++) {
 		regs[i].type = NOT_INIT;
 		regs[i].imm = 0;
-		regs[i].map_ptr = NULL;
 	}
 
 	/* frame pointer */
@@ -495,7 +494,6 @@ static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
 	BUG_ON(regno >= MAX_BPF_REG);
 	regs[regno].type = UNKNOWN_VALUE;
 	regs[regno].imm = 0;
-	regs[regno].map_ptr = NULL;
 }
 
 enum reg_arg_type {
-- 
cgit 


From 33ff9823c569f3aceb071071914919177a6bed6a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 13 Apr 2016 00:10:50 +0200
Subject: bpf, verifier: add bpf_call_arg_meta for passing meta data

Currently, when the verifier checks calls in check_call() function, we
call check_func_arg() for all 5 arguments e.g. to make sure expected types
are correct. In some cases, we collect meta data (here: map pointer) to
perform additional checks such as checking stack boundary on key/value
sizes for subsequent arguments. As we're going to extend the meta data,
add a generic struct bpf_call_arg_meta that we can use for passing into
check_func_arg().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c5d7cd4cb0e..202f8f738542 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -205,6 +205,10 @@ struct verifier_env {
 #define BPF_COMPLEXITY_LIMIT_INSNS	65536
 #define BPF_COMPLEXITY_LIMIT_STACK	1024
 
+struct bpf_call_arg_meta {
+	struct bpf_map *map_ptr;
+};
+
 /* verbose verifier prints what it's seeing
  * bpf_check() is called under lock, so no race to access these global vars
  */
@@ -822,7 +826,8 @@ static int check_stack_boundary(struct verifier_env *env, int regno,
 }
 
 static int check_func_arg(struct verifier_env *env, u32 regno,
-			  enum bpf_arg_type arg_type, struct bpf_map **mapp)
+			  enum bpf_arg_type arg_type,
+			  struct bpf_call_arg_meta *meta)
 {
 	struct reg_state *reg = env->cur_state.regs + regno;
 	enum bpf_reg_type expected_type;
@@ -875,14 +880,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 
 	if (arg_type == ARG_CONST_MAP_PTR) {
 		/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
-		*mapp = reg->map_ptr;
-
+		meta->map_ptr = reg->map_ptr;
 	} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
 		/* bpf_map_xxx(..., map_ptr, ..., key) call:
 		 * check that [key, key + map->key_size) are within
 		 * stack limits and initialized
 		 */
-		if (!*mapp) {
+		if (!meta->map_ptr) {
 			/* in function declaration map_ptr must come before
 			 * map_key, so that it's verified and known before
 			 * we have to check map_key here. Otherwise it means
@@ -891,19 +895,19 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		err = check_stack_boundary(env, regno, (*mapp)->key_size,
+		err = check_stack_boundary(env, regno, meta->map_ptr->key_size,
 					   false);
 	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
 		 */
-		if (!*mapp) {
+		if (!meta->map_ptr) {
 			/* kernel subsystem misconfigured verifier */
 			verbose("invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		err = check_stack_boundary(env, regno, (*mapp)->value_size,
-					   false);
+		err = check_stack_boundary(env, regno,
+					   meta->map_ptr->value_size, false);
 	} else if (arg_type == ARG_CONST_STACK_SIZE ||
 		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
@@ -954,8 +958,8 @@ static int check_call(struct verifier_env *env, int func_id)
 	struct verifier_state *state = &env->cur_state;
 	const struct bpf_func_proto *fn = NULL;
 	struct reg_state *regs = state->regs;
-	struct bpf_map *map = NULL;
 	struct reg_state *reg;
+	struct bpf_call_arg_meta meta;
 	int i, err;
 
 	/* find function prototype */
@@ -978,20 +982,22 @@ static int check_call(struct verifier_env *env, int func_id)
 		return -EINVAL;
 	}
 
+	memset(&meta, 0, sizeof(meta));
+
 	/* check args */
-	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
 	if (err)
 		return err;
-	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
 	if (err)
 		return err;
-	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
 	if (err)
 		return err;
-	err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+	err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta);
 	if (err)
 		return err;
-	err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+	err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta);
 	if (err)
 		return err;
 
@@ -1013,18 +1019,18 @@ static int check_call(struct verifier_env *env, int func_id)
 		 * can check 'value_size' boundary of memory access
 		 * to map element returned from bpf_map_lookup_elem()
 		 */
-		if (map == NULL) {
+		if (meta.map_ptr == NULL) {
 			verbose("kernel subsystem misconfigured verifier\n");
 			return -EINVAL;
 		}
-		regs[BPF_REG_0].map_ptr = map;
+		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 	} else {
 		verbose("unknown return type %d of func %d\n",
 			fn->ret_type, func_id);
 		return -EINVAL;
 	}
 
-	err = check_map_func_compatibility(map, func_id);
+	err = check_map_func_compatibility(meta.map_ptr, func_id);
 	if (err)
 		return err;
 
-- 
cgit 


From 435faee1aae9c1ac231f89e4faf0437bfe29f425 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 13 Apr 2016 00:10:51 +0200
Subject: bpf, verifier: add ARG_PTR_TO_RAW_STACK type

When passing buffers from eBPF stack space into a helper function, we have
ARG_PTR_TO_STACK argument type for helpers available. The verifier makes sure
that such buffers are initialized, within boundaries, etc.

However, the downside with this is that we have a couple of helper functions
such as bpf_skb_load_bytes() that fill out the passed buffer in the expected
success case anyway, so zero initializing them prior to the helper call is
unneeded/wasted instructions in the eBPF program that can be avoided.

Therefore, add a new helper function argument type called ARG_PTR_TO_RAW_STACK.
The idea is to skip the STACK_MISC check in check_stack_boundary() and color
the related stack slots as STACK_MISC after we checked all call arguments.

Helper functions using ARG_PTR_TO_RAW_STACK must make sure that every path of
the helper function will fill the provided buffer area, so that we cannot leak
any uninitialized stack memory. This f.e. means that error paths need to
memset() the buffers, but the expected fast-path doesn't have to do this
anymore.

Since there's no such helper needing more than at most one ARG_PTR_TO_RAW_STACK
argument, we can keep it simple and don't need to check for multiple areas.
Should in future such a use-case really appear, we have check_raw_mode() that
will make sure we implement support for it first.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 5 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 202f8f738542..9c843a5417da 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -207,6 +207,9 @@ struct verifier_env {
 
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
+	bool raw_mode;
+	int regno;
+	int access_size;
 };
 
 /* verbose verifier prints what it's seeing
@@ -789,7 +792,8 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
  * and all elements of stack are initialized
  */
 static int check_stack_boundary(struct verifier_env *env, int regno,
-				int access_size, bool zero_size_allowed)
+				int access_size, bool zero_size_allowed,
+				struct bpf_call_arg_meta *meta)
 {
 	struct verifier_state *state = &env->cur_state;
 	struct reg_state *regs = state->regs;
@@ -815,6 +819,12 @@ static int check_stack_boundary(struct verifier_env *env, int regno,
 		return -EACCES;
 	}
 
+	if (meta && meta->raw_mode) {
+		meta->access_size = access_size;
+		meta->regno = regno;
+		return 0;
+	}
+
 	for (i = 0; i < access_size; i++) {
 		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
 			verbose("invalid indirect read from stack off %d+%d size %d\n",
@@ -859,7 +869,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		expected_type = CONST_PTR_TO_MAP;
 	} else if (arg_type == ARG_PTR_TO_CTX) {
 		expected_type = PTR_TO_CTX;
-	} else if (arg_type == ARG_PTR_TO_STACK) {
+	} else if (arg_type == ARG_PTR_TO_STACK ||
+		   arg_type == ARG_PTR_TO_RAW_STACK) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a CONST_IMM type. Final test
@@ -867,6 +878,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		 */
 		if (reg->type == CONST_IMM && reg->imm == 0)
 			expected_type = CONST_IMM;
+		meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -896,7 +908,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno, meta->map_ptr->key_size,
-					   false);
+					   false, NULL);
 	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
@@ -907,7 +919,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno,
-					   meta->map_ptr->value_size, false);
+					   meta->map_ptr->value_size,
+					   false, NULL);
 	} else if (arg_type == ARG_CONST_STACK_SIZE ||
 		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
@@ -922,7 +935,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno - 1, reg->imm,
-					   zero_size_allowed);
+					   zero_size_allowed, meta);
 	}
 
 	return err;
@@ -953,6 +966,24 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 	return 0;
 }
 
+static int check_raw_mode(const struct bpf_func_proto *fn)
+{
+	int count = 0;
+
+	if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+
+	return count > 1 ? -EINVAL : 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -984,6 +1015,15 @@ static int check_call(struct verifier_env *env, int func_id)
 
 	memset(&meta, 0, sizeof(meta));
 
+	/* We only support one arg being in raw mode at the moment, which
+	 * is sufficient for the helper functions we have right now.
+	 */
+	err = check_raw_mode(fn);
+	if (err) {
+		verbose("kernel subsystem misconfigured func %d\n", func_id);
+		return err;
+	}
+
 	/* check args */
 	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
 	if (err)
@@ -1001,6 +1041,15 @@ static int check_call(struct verifier_env *env, int func_id)
 	if (err)
 		return err;
 
+	/* Mark slots with STACK_MISC in case of raw mode, stack offset
+	 * is inferred from register state.
+	 */
+	for (i = 0; i < meta.access_size; i++) {
+		err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+		if (err)
+			return err;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		reg = regs + caller_saved[i];
-- 
cgit 


From 074f528eed408b467516e142fa4c45e5b0d2ba16 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 13 Apr 2016 00:10:52 +0200
Subject: bpf: convert relevant helper args to ARG_PTR_TO_RAW_STACK

This patch converts all helpers that can use ARG_PTR_TO_RAW_STACK as argument
type. For tc programs this is bpf_skb_load_bytes(), bpf_skb_get_tunnel_key(),
bpf_skb_get_tunnel_opt(). For tracing, this optimizes bpf_get_current_comm()
and bpf_probe_read(). The check in bpf_skb_load_bytes() for MAX_BPF_STACK can
also be removed since the verifier already makes sure we stay within bounds
on stack buffers.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/helpers.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 50da680c479f..ad7a0573f71b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -163,17 +163,26 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
 	struct task_struct *task = current;
 	char *buf = (char *) (long) r1;
 
-	if (!task)
-		return -EINVAL;
+	if (unlikely(!task))
+		goto err_clear;
 
-	strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+	strncpy(buf, task->comm, size);
+
+	/* Verifier guarantees that size > 0. For task->comm exceeding
+	 * size, guarantee that buf is %NUL-terminated. Unconditionally
+	 * done here to save the size test.
+	 */
+	buf[size - 1] = 0;
 	return 0;
+err_clear:
+	memset(buf, 0, size);
+	return -EINVAL;
 }
 
 const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.func		= bpf_get_current_comm,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg1_type	= ARG_PTR_TO_RAW_STACK,
 	.arg2_type	= ARG_CONST_STACK_SIZE,
 };
-- 
cgit 


From bd570ff970a54df653b48ed0cfb373f2ebed083d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 18 Apr 2016 21:01:24 +0200
Subject: bpf: add event output helper for notifications/sampling/logging

This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").

The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.

User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.

While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.

Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index be0abf669ced..e4248fe79513 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -764,14 +764,21 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
 const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
 	return NULL;
 }
 
+const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+{
+	return NULL;
+}
+
 /* Always built-in helper functions. */
 const struct bpf_func_proto bpf_tail_call_proto = {
 	.func		= NULL,
-- 
cgit 


From 1a0dc1ac1d2928e25739ee82d7e04423b01da563 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 5 May 2016 19:49:09 -0700
Subject: bpf: cleanup verifier code

cleanup verifier code and prepare it for addition of "pointer to packet" logic

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 100 ++++++++++++++++++++++++++------------------------
 1 file changed, 53 insertions(+), 47 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 63554b6d4e25..afeb62808902 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -249,28 +249,30 @@ static const char * const reg_type_str[] = {
 	[CONST_IMM]		= "imm",
 };
 
-static void print_verifier_state(struct verifier_env *env)
+static void print_verifier_state(struct verifier_state *state)
 {
+	struct reg_state *reg;
 	enum bpf_reg_type t;
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++) {
-		t = env->cur_state.regs[i].type;
+		reg = &state->regs[i];
+		t = reg->type;
 		if (t == NOT_INIT)
 			continue;
 		verbose(" R%d=%s", i, reg_type_str[t]);
 		if (t == CONST_IMM || t == PTR_TO_STACK)
-			verbose("%ld", env->cur_state.regs[i].imm);
+			verbose("%ld", reg->imm);
 		else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
 			 t == PTR_TO_MAP_VALUE_OR_NULL)
 			verbose("(ks=%d,vs=%d)",
-				env->cur_state.regs[i].map_ptr->key_size,
-				env->cur_state.regs[i].map_ptr->value_size);
+				reg->map_ptr->key_size,
+				reg->map_ptr->value_size);
 	}
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
+		if (state->stack_slot_type[i] == STACK_SPILL)
 			verbose(" fp%d=%s", -MAX_BPF_STACK + i,
-				reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
+				reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
 	}
 	verbose("\n");
 }
@@ -686,10 +688,11 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			    int value_regno)
 {
 	struct verifier_state *state = &env->cur_state;
+	struct reg_state *reg = &state->regs[regno];
 	int size, err = 0;
 
-	if (state->regs[regno].type == PTR_TO_STACK)
-		off += state->regs[regno].imm;
+	if (reg->type == PTR_TO_STACK)
+		off += reg->imm;
 
 	size = bpf_size_to_bytes(bpf_size);
 	if (size < 0)
@@ -700,7 +703,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		return -EACCES;
 	}
 
-	if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+	if (reg->type == PTR_TO_MAP_VALUE) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
 			verbose("R%d leaks addr into map\n", value_regno);
@@ -710,7 +713,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
 
-	} else if (state->regs[regno].type == PTR_TO_CTX) {
+	} else if (reg->type == PTR_TO_CTX) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
 			verbose("R%d leaks addr into ctx\n", value_regno);
@@ -720,8 +723,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
 
-	} else if (state->regs[regno].type == FRAME_PTR ||
-		   state->regs[regno].type == PTR_TO_STACK) {
+	} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
 		if (off >= 0 || off < -MAX_BPF_STACK) {
 			verbose("invalid stack off=%d size=%d\n", off, size);
 			return -EACCES;
@@ -739,7 +741,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		}
 	} else {
 		verbose("R%d invalid mem access '%s'\n",
-			regno, reg_type_str[state->regs[regno].type]);
+			regno, reg_type_str[reg->type]);
 		return -EACCES;
 	}
 	return err;
@@ -1104,7 +1106,7 @@ static int check_call(struct verifier_env *env, int func_id)
 /* check validity of 32-bit and 64-bit arithmetic operations */
 static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 {
-	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *regs = env->cur_state.regs, *dst_reg;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
 
@@ -1193,8 +1195,6 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 
 	} else {	/* all other ALU ops: and, sub, xor, add, ... */
 
-		bool stack_relative = false;
-
 		if (BPF_SRC(insn->code) == BPF_X) {
 			if (insn->imm != 0 || insn->off != 0) {
 				verbose("BPF_ALU uses reserved fields\n");
@@ -1232,11 +1232,19 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			}
 		}
 
+		/* check dest operand */
+		err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+		if (err)
+			return err;
+
+		dst_reg = &regs[insn->dst_reg];
+
 		/* pattern match 'bpf_add Rx, imm' instruction */
 		if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
-		    regs[insn->dst_reg].type == FRAME_PTR &&
-		    BPF_SRC(insn->code) == BPF_K) {
-			stack_relative = true;
+		    dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
+			dst_reg->type = PTR_TO_STACK;
+			dst_reg->imm = insn->imm;
+			return 0;
 		} else if (is_pointer_value(env, insn->dst_reg)) {
 			verbose("R%d pointer arithmetic prohibited\n",
 				insn->dst_reg);
@@ -1248,15 +1256,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			return -EACCES;
 		}
 
-		/* check dest operand */
-		err = check_reg_arg(regs, insn->dst_reg, DST_OP);
-		if (err)
-			return err;
-
-		if (stack_relative) {
-			regs[insn->dst_reg].type = PTR_TO_STACK;
-			regs[insn->dst_reg].imm = insn->imm;
-		}
+		/* mark dest operand */
+		mark_reg_unknown_value(regs, insn->dst_reg);
 	}
 
 	return 0;
@@ -1265,7 +1266,7 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 static int check_cond_jmp_op(struct verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
-	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *regs = env->cur_state.regs, *dst_reg;
 	struct verifier_state *other_branch;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
@@ -1303,11 +1304,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
 	if (err)
 		return err;
 
+	dst_reg = &regs[insn->dst_reg];
+
 	/* detect if R == 0 where R was initialized to zero earlier */
 	if (BPF_SRC(insn->code) == BPF_K &&
 	    (opcode == BPF_JEQ || opcode == BPF_JNE) &&
-	    regs[insn->dst_reg].type == CONST_IMM &&
-	    regs[insn->dst_reg].imm == insn->imm) {
+	    dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) {
 		if (opcode == BPF_JEQ) {
 			/* if (imm == imm) goto pc+off;
 			 * only follow the goto, ignore fall-through
@@ -1329,9 +1331,8 @@ static int check_cond_jmp_op(struct verifier_env *env,
 
 	/* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
 	if (BPF_SRC(insn->code) == BPF_K &&
-	    insn->imm == 0 && (opcode == BPF_JEQ ||
-			       opcode == BPF_JNE) &&
-	    regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+	    dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
 		if (opcode == BPF_JEQ) {
 			/* next fallthrough insn can access memory via
 			 * this register
@@ -1366,7 +1367,7 @@ static int check_cond_jmp_op(struct verifier_env *env,
 		}
 	}
 	if (log_level)
-		print_verifier_state(env);
+		print_verifier_state(&env->cur_state);
 	return 0;
 }
 
@@ -1444,14 +1445,14 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
 	int i, err;
 
 	if (!may_access_skb(env->prog->type)) {
-		verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
+		verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
 		return -EINVAL;
 	}
 
 	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
 	    BPF_SIZE(insn->code) == BPF_DW ||
 	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
-		verbose("BPF_LD_ABS uses reserved fields\n");
+		verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -1712,17 +1713,21 @@ err_free:
  */
 static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 {
+	struct reg_state *rold, *rcur;
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++) {
-		if (memcmp(&old->regs[i], &cur->regs[i],
-			   sizeof(old->regs[0])) != 0) {
-			if (old->regs[i].type == NOT_INIT ||
-			    (old->regs[i].type == UNKNOWN_VALUE &&
-			     cur->regs[i].type != NOT_INIT))
-				continue;
-			return false;
-		}
+		rold = &old->regs[i];
+		rcur = &cur->regs[i];
+
+		if (memcmp(rold, rcur, sizeof(*rold)) == 0)
+			continue;
+
+		if (rold->type == NOT_INIT ||
+		    (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
+			continue;
+
+		return false;
 	}
 
 	for (i = 0; i < MAX_BPF_STACK; i++) {
@@ -1844,7 +1849,7 @@ static int do_check(struct verifier_env *env)
 
 		if (log_level && do_print_state) {
 			verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
-			print_verifier_state(env);
+			print_verifier_state(&env->cur_state);
 			do_print_state = false;
 		}
 
@@ -2056,6 +2061,7 @@ process_bpf_exit:
 		insn_idx++;
 	}
 
+	verbose("processed %d insns\n", insn_processed);
 	return 0;
 }
 
-- 
cgit 


From 969bf05eb3cedd5a8d4b7c346a85c2ede87a6d6d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 5 May 2016 19:49:10 -0700
Subject: bpf: direct packet access

Extended BPF carried over two instructions from classic to access
packet data: LD_ABS and LD_IND. They're highly optimized in JITs,
but due to their design they have to do a length check for every access.
When BPF is processing 20M packets per second, a single LD_ABS after JIT
is consuming 3% cpu. Hence the need to optimize it further by amortizing
the cost of 'off < skb_headlen' over multiple packet accesses.
One option is to introduce two new eBPF instructions LD_ABS_DW and LD_IND_DW
with similar usage as skb_header_pointer().
The kernel part for interpreter and x64 JIT was implemented in [1], but such
new insns behave like old ld_abs and abort the program with 'return 0' if
access is beyond linear data. Such hidden control flow is hard to work around,
plus changing JITs and rolling out new llvm is inconvenient.

Therefore allow cls_bpf/act_bpf program access skb->data directly:
int bpf_prog(struct __sk_buff *skb)
{
  struct iphdr *ip;

  if (skb->data + sizeof(struct iphdr) + ETH_HLEN > skb->data_end)
      /* packet too small */
      return 0;

  ip = skb->data + ETH_HLEN;

  /* access IP header fields with direct loads */
  if (ip->version != 4 || ip->saddr == 0x7f000001)
      return 1;
  [...]
}

This solution avoids introduction of new instructions. llvm stays
the same and all JITs stay the same, but verifier has to work extra hard
to prove safety of the above program.

For XDP the direct store instructions can be allowed as well.

The skb->data is NET_IP_ALIGNED, so for common cases the verifier can check
the alignment. The complex packet parsers where packet pointer is adjusted
incrementally cannot be tracked for alignment, so allow only byte access in
such cases, and allow misaligned access on architectures that define
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.

[1] https://git.kernel.org/cgit/linux/kernel/git/ast/bpf.git/?h=ld_abs_dw

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c     |   5 +
 kernel/bpf/verifier.c | 441 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 438 insertions(+), 8 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e4248fe79513..d781b077431f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -794,6 +794,11 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
 }
 
+bool __weak bpf_helper_changes_skb_data(void *func)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index afeb62808902..6338c61fc2a1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -136,13 +137,32 @@ enum bpf_reg_type {
 	FRAME_PTR,		 /* reg == frame_pointer */
 	PTR_TO_STACK,		 /* reg == frame_pointer + imm */
 	CONST_IMM,		 /* constant integer value */
+
+	/* PTR_TO_PACKET represents:
+	 * skb->data
+	 * skb->data + imm
+	 * skb->data + (u16) var
+	 * skb->data + (u16) var + imm
+	 * if (range > 0) then [ptr, ptr + range - off) is safe to access
+	 * if (id > 0) means that some 'var' was added
+	 * if (off > 0) means that 'imm' was added
+	 */
+	PTR_TO_PACKET,
+	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
 
 struct reg_state {
 	enum bpf_reg_type type;
 	union {
-		/* valid when type == CONST_IMM | PTR_TO_STACK */
-		long imm;
+		/* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
+		s64 imm;
+
+		/* valid when type == PTR_TO_PACKET* */
+		struct {
+			u32 id;
+			u16 off;
+			u16 range;
+		};
 
 		/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
 		 *   PTR_TO_MAP_VALUE_OR_NULL
@@ -247,6 +267,8 @@ static const char * const reg_type_str[] = {
 	[FRAME_PTR]		= "fp",
 	[PTR_TO_STACK]		= "fp",
 	[CONST_IMM]		= "imm",
+	[PTR_TO_PACKET]		= "pkt",
+	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
 static void print_verifier_state(struct verifier_state *state)
@@ -262,7 +284,12 @@ static void print_verifier_state(struct verifier_state *state)
 			continue;
 		verbose(" R%d=%s", i, reg_type_str[t]);
 		if (t == CONST_IMM || t == PTR_TO_STACK)
-			verbose("%ld", reg->imm);
+			verbose("%lld", reg->imm);
+		else if (t == PTR_TO_PACKET)
+			verbose("(id=%d,off=%d,r=%d)",
+				reg->id, reg->off, reg->range);
+		else if (t == UNKNOWN_VALUE && reg->imm)
+			verbose("%lld", reg->imm);
 		else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
 			 t == PTR_TO_MAP_VALUE_OR_NULL)
 			verbose("(ks=%d,vs=%d)",
@@ -548,6 +575,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_MAP_VALUE_OR_NULL:
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
+	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_END:
 	case FRAME_PTR:
 	case CONST_PTR_TO_MAP:
 		return true;
@@ -647,6 +676,27 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
 	return 0;
 }
 
+#define MAX_PACKET_OFF 0xffff
+
+static int check_packet_access(struct verifier_env *env, u32 regno, int off,
+			       int size)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *reg = &regs[regno];
+	int linear_size = (int) reg->range - (int) reg->off;
+
+	if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) {
+		verbose("verifier bug\n");
+		return -EFAULT;
+	}
+	if (off < 0 || off + size > linear_size) {
+		verbose("invalid access to packet, off=%d size=%d, allowed=%d\n",
+			off, size, linear_size);
+		return -EACCES;
+	}
+	return 0;
+}
+
 /* check access to 'struct bpf_context' fields */
 static int check_ctx_access(struct verifier_env *env, int off, int size,
 			    enum bpf_access_type t)
@@ -677,6 +727,45 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
 	}
 }
 
+static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
+			       int off, int size)
+{
+	if (reg->type != PTR_TO_PACKET) {
+		if (off % size != 0) {
+			verbose("misaligned access off %d size %d\n", off, size);
+			return -EACCES;
+		} else {
+			return 0;
+		}
+	}
+
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_SCHED_CLS:
+	case BPF_PROG_TYPE_SCHED_ACT:
+		break;
+	default:
+		verbose("verifier is misconfigured\n");
+		return -EACCES;
+	}
+
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+		/* misaligned access to packet is ok on x86,arm,arm64 */
+		return 0;
+
+	if (reg->id && size != 1) {
+		verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+		return -EACCES;
+	}
+
+	/* skb->data is NET_IP_ALIGN-ed */
+	if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
+		verbose("misaligned packet access off %d+%d+%d size %d\n",
+			NET_IP_ALIGN, reg->off, off, size);
+		return -EACCES;
+	}
+	return 0;
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -698,10 +787,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 	if (size < 0)
 		return size;
 
-	if (off % size != 0) {
-		verbose("misaligned access off %d size %d\n", off, size);
-		return -EACCES;
-	}
+	err = check_ptr_alignment(env, reg, off, size);
+	if (err)
+		return err;
 
 	if (reg->type == PTR_TO_MAP_VALUE) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
@@ -720,8 +808,16 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			return -EACCES;
 		}
 		err = check_ctx_access(env, off, size, t);
-		if (!err && t == BPF_READ && value_regno >= 0)
+		if (!err && t == BPF_READ && value_regno >= 0) {
 			mark_reg_unknown_value(state->regs, value_regno);
+			if (off == offsetof(struct __sk_buff, data) &&
+			    env->allow_ptr_leaks)
+				/* note that reg.[id|off|range] == 0 */
+				state->regs[value_regno].type = PTR_TO_PACKET;
+			else if (off == offsetof(struct __sk_buff, data_end) &&
+				 env->allow_ptr_leaks)
+				state->regs[value_regno].type = PTR_TO_PACKET_END;
+		}
 
 	} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
 		if (off >= 0 || off < -MAX_BPF_STACK) {
@@ -739,11 +835,28 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		} else {
 			err = check_stack_read(state, off, size, value_regno);
 		}
+	} else if (state->regs[regno].type == PTR_TO_PACKET) {
+		if (t == BPF_WRITE) {
+			verbose("cannot write into packet\n");
+			return -EACCES;
+		}
+		err = check_packet_access(env, regno, off, size);
+		if (!err && t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown_value(state->regs, value_regno);
 	} else {
 		verbose("R%d invalid mem access '%s'\n",
 			regno, reg_type_str[reg->type]);
 		return -EACCES;
 	}
+
+	if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks &&
+	    state->regs[value_regno].type == UNKNOWN_VALUE) {
+		/* 1 or 2 byte load zero-extends, determine the number of
+		 * zero upper bits. Not doing it for 4 byte load, since
+		 * such values cannot be added to ptr_to_packet anyway.
+		 */
+		state->regs[value_regno].imm = 64 - size * 8;
+	}
 	return err;
 }
 
@@ -1001,6 +1114,29 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	return count > 1 ? -EINVAL : 0;
 }
 
+static void clear_all_pkt_pointers(struct verifier_env *env)
+{
+	struct verifier_state *state = &env->cur_state;
+	struct reg_state *regs = state->regs, *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (regs[i].type == PTR_TO_PACKET ||
+		    regs[i].type == PTR_TO_PACKET_END)
+			mark_reg_unknown_value(regs, i);
+
+	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+		if (state->stack_slot_type[i] != STACK_SPILL)
+			continue;
+		reg = &state->spilled_regs[i / BPF_REG_SIZE];
+		if (reg->type != PTR_TO_PACKET &&
+		    reg->type != PTR_TO_PACKET_END)
+			continue;
+		reg->type = UNKNOWN_VALUE;
+		reg->imm = 0;
+	}
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -1008,6 +1144,7 @@ static int check_call(struct verifier_env *env, int func_id)
 	struct reg_state *regs = state->regs;
 	struct reg_state *reg;
 	struct bpf_call_arg_meta meta;
+	bool changes_data;
 	int i, err;
 
 	/* find function prototype */
@@ -1030,6 +1167,8 @@ static int check_call(struct verifier_env *env, int func_id)
 		return -EINVAL;
 	}
 
+	changes_data = bpf_helper_changes_skb_data(fn->func);
+
 	memset(&meta, 0, sizeof(meta));
 
 	/* We only support one arg being in raw mode at the moment, which
@@ -1100,6 +1239,189 @@ static int check_call(struct verifier_env *env, int func_id)
 	if (err)
 		return err;
 
+	if (changes_data)
+		clear_all_pkt_pointers(env);
+	return 0;
+}
+
+static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *dst_reg = &regs[insn->dst_reg];
+	struct reg_state *src_reg = &regs[insn->src_reg];
+	s32 imm;
+
+	if (BPF_SRC(insn->code) == BPF_K) {
+		/* pkt_ptr += imm */
+		imm = insn->imm;
+
+add_imm:
+		if (imm <= 0) {
+			verbose("addition of negative constant to packet pointer is not allowed\n");
+			return -EACCES;
+		}
+		if (imm >= MAX_PACKET_OFF ||
+		    imm + dst_reg->off >= MAX_PACKET_OFF) {
+			verbose("constant %d is too large to add to packet pointer\n",
+				imm);
+			return -EACCES;
+		}
+		/* a constant was added to pkt_ptr.
+		 * Remember it while keeping the same 'id'
+		 */
+		dst_reg->off += imm;
+	} else {
+		if (src_reg->type == CONST_IMM) {
+			/* pkt_ptr += reg where reg is known constant */
+			imm = src_reg->imm;
+			goto add_imm;
+		}
+		/* disallow pkt_ptr += reg
+		 * if reg is not unknown_value with guaranteed zero upper bits
+		 * otherwise pkt_ptr may overflow and addition will become
+		 * subtraction which is not allowed
+		 */
+		if (src_reg->type != UNKNOWN_VALUE) {
+			verbose("cannot add '%s' to ptr_to_packet\n",
+				reg_type_str[src_reg->type]);
+			return -EACCES;
+		}
+		if (src_reg->imm < 48) {
+			verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n",
+				src_reg->imm);
+			return -EACCES;
+		}
+		/* dst_reg stays as pkt_ptr type and since some positive
+		 * integer value was added to the pointer, increment its 'id'
+		 */
+		dst_reg->id++;
+
+		/* something was added to pkt_ptr, set range and off to zero */
+		dst_reg->off = 0;
+		dst_reg->range = 0;
+	}
+	return 0;
+}
+
+static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *dst_reg = &regs[insn->dst_reg];
+	u8 opcode = BPF_OP(insn->code);
+	s64 imm_log2;
+
+	/* for type == UNKNOWN_VALUE:
+	 * imm > 0 -> number of zero upper bits
+	 * imm == 0 -> don't track which is the same as all bits can be non-zero
+	 */
+
+	if (BPF_SRC(insn->code) == BPF_X) {
+		struct reg_state *src_reg = &regs[insn->src_reg];
+
+		if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
+		    dst_reg->imm && opcode == BPF_ADD) {
+			/* dreg += sreg
+			 * where both have zero upper bits. Adding them
+			 * can only result making one more bit non-zero
+			 * in the larger value.
+			 * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+			 *     0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+			 */
+			dst_reg->imm = min(dst_reg->imm, src_reg->imm);
+			dst_reg->imm--;
+			return 0;
+		}
+		if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
+		    dst_reg->imm && opcode == BPF_ADD) {
+			/* dreg += sreg
+			 * where dreg has zero upper bits and sreg is const.
+			 * Adding them can only result making one more bit
+			 * non-zero in the larger value.
+			 */
+			imm_log2 = __ilog2_u64((long long)src_reg->imm);
+			dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+			dst_reg->imm--;
+			return 0;
+		}
+		/* all other cases not supported yet, just mark dst_reg */
+		dst_reg->imm = 0;
+		return 0;
+	}
+
+	/* sign extend 32-bit imm into 64-bit to make sure that
+	 * negative values occupy bit 63. Note ilog2() would have
+	 * been incorrect, since sizeof(insn->imm) == 4
+	 */
+	imm_log2 = __ilog2_u64((long long)insn->imm);
+
+	if (dst_reg->imm && opcode == BPF_LSH) {
+		/* reg <<= imm
+		 * if reg was a result of 2 byte load, then its imm == 48
+		 * which means that upper 48 bits are zero and shifting this reg
+		 * left by 4 would mean that upper 44 bits are still zero
+		 */
+		dst_reg->imm -= insn->imm;
+	} else if (dst_reg->imm && opcode == BPF_MUL) {
+		/* reg *= imm
+		 * if multiplying by 14 subtract 4
+		 * This is conservative calculation of upper zero bits.
+		 * It's not trying to special case insn->imm == 1 or 0 cases
+		 */
+		dst_reg->imm -= imm_log2 + 1;
+	} else if (opcode == BPF_AND) {
+		/* reg &= imm */
+		dst_reg->imm = 63 - imm_log2;
+	} else if (dst_reg->imm && opcode == BPF_ADD) {
+		/* reg += imm */
+		dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+		dst_reg->imm--;
+	} else if (opcode == BPF_RSH) {
+		/* reg >>= imm
+		 * which means that after right shift, upper bits will be zero
+		 * note that verifier already checked that
+		 * 0 <= imm < 64 for shift insn
+		 */
+		dst_reg->imm += insn->imm;
+		if (unlikely(dst_reg->imm > 64))
+			/* some dumb code did:
+			 * r2 = *(u32 *)mem;
+			 * r2 >>= 32;
+			 * and all bits are zero now */
+			dst_reg->imm = 64;
+	} else {
+		/* all other alu ops, means that we don't know what will
+		 * happen to the value, mark it with unknown number of zero bits
+		 */
+		dst_reg->imm = 0;
+	}
+
+	if (dst_reg->imm < 0) {
+		/* all 64 bits of the register can contain non-zero bits
+		 * and such value cannot be added to ptr_to_packet, since it
+		 * may overflow, mark it as unknown to avoid further eval
+		 */
+		dst_reg->imm = 0;
+	}
+	return 0;
+}
+
+static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *dst_reg = &regs[insn->dst_reg];
+	struct reg_state *src_reg = &regs[insn->src_reg];
+	u8 opcode = BPF_OP(insn->code);
+
+	/* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
+	 * Don't care about overflow or negative values, just add them
+	 */
+	if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
+		dst_reg->imm += insn->imm;
+	else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
+		 src_reg->type == CONST_IMM)
+		dst_reg->imm += src_reg->imm;
+	else
+		mark_reg_unknown_value(regs, insn->dst_reg);
 	return 0;
 }
 
@@ -1245,6 +1567,21 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			dst_reg->type = PTR_TO_STACK;
 			dst_reg->imm = insn->imm;
 			return 0;
+		} else if (opcode == BPF_ADD &&
+			   BPF_CLASS(insn->code) == BPF_ALU64 &&
+			   dst_reg->type == PTR_TO_PACKET) {
+			/* ptr_to_packet += K|X */
+			return check_packet_ptr_add(env, insn);
+		} else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+			   dst_reg->type == UNKNOWN_VALUE &&
+			   env->allow_ptr_leaks) {
+			/* unknown += K|X */
+			return evaluate_reg_alu(env, insn);
+		} else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+			   dst_reg->type == CONST_IMM &&
+			   env->allow_ptr_leaks) {
+			/* reg_imm += K|X */
+			return evaluate_reg_imm_alu(env, insn);
 		} else if (is_pointer_value(env, insn->dst_reg)) {
 			verbose("R%d pointer arithmetic prohibited\n",
 				insn->dst_reg);
@@ -1263,6 +1600,34 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static void find_good_pkt_pointers(struct verifier_env *env,
+				   struct reg_state *dst_reg)
+{
+	struct verifier_state *state = &env->cur_state;
+	struct reg_state *regs = state->regs, *reg;
+	int i;
+	/* r2 = r3;
+	 * r2 += 8
+	 * if (r2 > pkt_end) goto somewhere
+	 * r2 == dst_reg, pkt_end == src_reg,
+	 * r2=pkt(id=n,off=8,r=0)
+	 * r3=pkt(id=n,off=0,r=0)
+	 * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
+	 * so that range of bytes [r3, r3 + 8) is safe to access
+	 */
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+			regs[i].range = dst_reg->off;
+
+	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+		if (state->stack_slot_type[i] != STACK_SPILL)
+			continue;
+		reg = &state->spilled_regs[i / BPF_REG_SIZE];
+		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+			reg->range = dst_reg->off;
+	}
+}
+
 static int check_cond_jmp_op(struct verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
@@ -1346,6 +1711,10 @@ static int check_cond_jmp_op(struct verifier_env *env,
 			regs[insn->dst_reg].type = CONST_IMM;
 			regs[insn->dst_reg].imm = 0;
 		}
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		find_good_pkt_pointers(env, dst_reg);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
@@ -1685,6 +2054,58 @@ err_free:
 	return ret;
 }
 
+/* the following conditions reduce the number of explored insns
+ * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
+ */
+static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur)
+{
+	if (old->id != cur->id)
+		return false;
+
+	/* old ptr_to_packet is more conservative, since it allows smaller
+	 * range. Ex:
+	 * old(off=0,r=10) is equal to cur(off=0,r=20), because
+	 * old(off=0,r=10) means that with range=10 the verifier proceeded
+	 * further and found no issues with the program. Now we're in the same
+	 * spot with cur(off=0,r=20), so we're safe too, since anything further
+	 * will only be looking at most 10 bytes after this pointer.
+	 */
+	if (old->off == cur->off && old->range < cur->range)
+		return true;
+
+	/* old(off=20,r=10) is equal to cur(off=22,r=22 or 5 or 0)
+	 * since both cannot be used for packet access and safe(old)
+	 * pointer has smaller off that could be used for further
+	 * 'if (ptr > data_end)' check
+	 * Ex:
+	 * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean
+	 * that we cannot access the packet.
+	 * The safe range is:
+	 * [ptr, ptr + range - off)
+	 * so whenever off >=range, it means no safe bytes from this pointer.
+	 * When comparing old->off <= cur->off, it means that older code
+	 * went with smaller offset and that offset was later
+	 * used to figure out the safe range after 'if (ptr > data_end)' check
+	 * Say, 'old' state was explored like:
+	 * ... R3(off=0, r=0)
+	 * R4 = R3 + 20
+	 * ... now R4(off=20,r=0)  <-- here
+	 * if (R4 > data_end)
+	 * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access.
+	 * ... the code further went all the way to bpf_exit.
+	 * Now the 'cur' state at the mark 'here' has R4(off=30,r=0).
+	 * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier
+	 * goes further, such cur_R4 will give larger safe packet range after
+	 * 'if (R4 > data_end)' and all further insn were already good with r=20,
+	 * so they will be good with r=30 and we can prune the search.
+	 */
+	if (old->off <= cur->off &&
+	    old->off >= old->range && cur->off >= cur->range)
+		return true;
+
+	return false;
+}
+
 /* compare two verifier states
  *
  * all states stored in state_list are known to be valid, since
@@ -1727,6 +2148,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 		    (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
 			continue;
 
+		if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
+		    compare_ptrs_to_packet(rold, rcur))
+			continue;
+
 		return false;
 	}
 
-- 
cgit 


From 735b433397ea2f97d59240cbe4ea770aa7c88eef Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 5 May 2016 19:49:11 -0700
Subject: bpf: improve verifier state equivalence

since UNKNOWN_VALUE type is weaker than CONST_IMM we can un-teach
verifier its recognition of constants in conditional branches
without affecting safety.
Ex:
if (reg == 123) {
  .. here verifier was marking reg->type as CONST_IMM
     instead keep reg as UNKNOWN_VALUE
}

Two verifier states with UNKNOWN_VALUE are equivalent, whereas
CONST_IMM_X != CONST_IMM_Y, since CONST_IMM is used for stack range
verification and other cases.
So help search pruning by marking registers as UNKNOWN_VALUE
where possible instead of CONST_IMM.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6338c61fc2a1..84bff68cf80e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1704,12 +1704,11 @@ static int check_cond_jmp_op(struct verifier_env *env,
 			 */
 			regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
 			/* branch targer cannot access it, since reg == 0 */
-			other_branch->regs[insn->dst_reg].type = CONST_IMM;
-			other_branch->regs[insn->dst_reg].imm = 0;
+			mark_reg_unknown_value(other_branch->regs,
+					       insn->dst_reg);
 		} else {
 			other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
-			regs[insn->dst_reg].type = CONST_IMM;
-			regs[insn->dst_reg].imm = 0;
+			mark_reg_unknown_value(regs, insn->dst_reg);
 		}
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
@@ -1718,22 +1717,6 @@ static int check_cond_jmp_op(struct verifier_env *env,
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
-	} else if (BPF_SRC(insn->code) == BPF_K &&
-		   (opcode == BPF_JEQ || opcode == BPF_JNE)) {
-
-		if (opcode == BPF_JEQ) {
-			/* detect if (R == imm) goto
-			 * and in the target state recognize that R = imm
-			 */
-			other_branch->regs[insn->dst_reg].type = CONST_IMM;
-			other_branch->regs[insn->dst_reg].imm = insn->imm;
-		} else {
-			/* detect if (R != imm) goto
-			 * and in the fall-through state recognize that R = imm
-			 */
-			regs[insn->dst_reg].type = CONST_IMM;
-			regs[insn->dst_reg].imm = insn->imm;
-		}
 	}
 	if (log_level)
 		print_verifier_state(&env->cur_state);
-- 
cgit 


From 4936e3528e3e272c567fe4ff0abb7ce3e1500575 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:26 +0200
Subject: bpf: minor cleanups in ebpf code

Besides others, remove redundant comments where the code is self
documenting enough, and properly indent various bpf_verifier_ops
and bpf_prog_type_list declarations. Moreover, remove two exports
that actually have no module user.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d781b077431f..5313d09d4b62 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -129,14 +129,12 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 
 	return fp;
 }
-EXPORT_SYMBOL_GPL(bpf_prog_realloc);
 
 void __bpf_prog_free(struct bpf_prog *fp)
 {
 	kfree(fp->aux);
 	vfree(fp);
 }
-EXPORT_SYMBOL_GPL(__bpf_prog_free);
 
 #ifdef CONFIG_BPF_JIT
 struct bpf_binary_header *
-- 
cgit 


From c237ee5eb33bf19fe0591c04ff8db19da7323a83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:30 +0200
Subject: bpf: add bpf_patch_insn_single helper

Move the functionality to patch instructions out of the verifier
code and into the core as the new bpf_patch_insn_single() helper
will be needed later on for blinding as well. No changes in
functionality.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c     | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c | 53 +++++++-------------------------------
 2 files changed, 80 insertions(+), 44 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5313d09d4b62..49b5538a5301 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,6 +136,77 @@ void __bpf_prog_free(struct bpf_prog *fp)
 	vfree(fp);
 }
 
+static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
+{
+	return BPF_CLASS(insn->code) == BPF_JMP  &&
+	       /* Call and Exit are both special jumps with no
+		* target inside the BPF instruction image.
+		*/
+	       BPF_OP(insn->code) != BPF_CALL &&
+	       BPF_OP(insn->code) != BPF_EXIT;
+}
+
+static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+{
+	struct bpf_insn *insn = prog->insnsi;
+	u32 i, insn_cnt = prog->len;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (!bpf_is_jmp_and_has_target(insn))
+			continue;
+
+		/* Adjust offset of jmps if we cross boundaries. */
+		if (i < pos && i + insn->off + 1 > pos)
+			insn->off += delta;
+		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
+			insn->off -= delta;
+	}
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+				       const struct bpf_insn *patch, u32 len)
+{
+	u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+	struct bpf_prog *prog_adj;
+
+	/* Since our patchlet doesn't expand the image, we're done. */
+	if (insn_delta == 0) {
+		memcpy(prog->insnsi + off, patch, sizeof(*patch));
+		return prog;
+	}
+
+	insn_adj_cnt = prog->len + insn_delta;
+
+	/* Several new instructions need to be inserted. Make room
+	 * for them. Likely, there's no need for a new allocation as
+	 * last page could have large enough tailroom.
+	 */
+	prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+				    GFP_USER);
+	if (!prog_adj)
+		return NULL;
+
+	prog_adj->len = insn_adj_cnt;
+
+	/* Patching happens in 3 steps:
+	 *
+	 * 1) Move over tail of insnsi from next instruction onwards,
+	 *    so we can patch the single target insn with one or more
+	 *    new ones (patching is always from 1 to n insns, n > 0).
+	 * 2) Inject new instructions at the target location.
+	 * 3) Adjust branch offsets if necessary.
+	 */
+	insn_rest = insn_adj_cnt - off - len;
+
+	memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+		sizeof(*patch) * insn_rest);
+	memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+	bpf_adj_branches(prog_adj, off, insn_delta);
+
+	return prog_adj;
+}
+
 #ifdef CONFIG_BPF_JIT
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 84bff68cf80e..a08d66215245 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2587,26 +2587,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
 			insn->src_reg = 0;
 }
 
-static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
-{
-	struct bpf_insn *insn = prog->insnsi;
-	int insn_cnt = prog->len;
-	int i;
-
-	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (BPF_CLASS(insn->code) != BPF_JMP ||
-		    BPF_OP(insn->code) == BPF_CALL ||
-		    BPF_OP(insn->code) == BPF_EXIT)
-			continue;
-
-		/* adjust offset of jmps if necessary */
-		if (i < pos && i + insn->off + 1 > pos)
-			insn->off += delta;
-		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
-			insn->off -= delta;
-	}
-}
-
 /* convert load instructions that access fields of 'struct __sk_buff'
  * into sequence of instructions that access fields of 'struct sk_buff'
  */
@@ -2616,14 +2596,15 @@ static int convert_ctx_accesses(struct verifier_env *env)
 	int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
-	u32 cnt;
-	int i;
 	enum bpf_access_type type;
+	int i;
 
 	if (!env->prog->aux->ops->convert_ctx_access)
 		return 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		u32 insn_delta, cnt;
+
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
 			type = BPF_READ;
 		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
@@ -2645,34 +2626,18 @@ static int convert_ctx_accesses(struct verifier_env *env)
 			return -EINVAL;
 		}
 
-		if (cnt == 1) {
-			memcpy(insn, insn_buf, sizeof(*insn));
-			continue;
-		}
-
-		/* several new insns need to be inserted. Make room for them */
-		insn_cnt += cnt - 1;
-		new_prog = bpf_prog_realloc(env->prog,
-					    bpf_prog_size(insn_cnt),
-					    GFP_USER);
+		new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt);
 		if (!new_prog)
 			return -ENOMEM;
 
-		new_prog->len = insn_cnt;
-
-		memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
-			sizeof(*insn) * (insn_cnt - i - cnt));
-
-		/* copy substitute insns in place of load instruction */
-		memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
-
-		/* adjust branches in the whole program */
-		adjust_branches(new_prog, i, cnt - 1);
+		insn_delta = cnt - 1;
 
 		/* keep walking new program and skip insns we just inserted */
 		env->prog = new_prog;
-		insn = new_prog->insnsi + i + cnt - 1;
-		i += cnt - 1;
+		insn      = new_prog->insnsi + i + insn_delta;
+
+		insn_cnt += insn_delta;
+		i        += insn_delta;
 	}
 
 	return 0;
-- 
cgit 


From d1c55ab5e41fcd72cb0a8bef86d3f652ad9ad9f5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:31 +0200
Subject: bpf: prepare bpf_int_jit_compile/bpf_prog_select_runtime apis

Since the blinding is strictly only called from inside eBPF JITs,
we need to change signatures for bpf_int_jit_compile() and
bpf_prog_select_runtime() first in order to prepare that the
eBPF program we're dealing with can change underneath. Hence,
for call sites, we need to return the latest prog. No functional
change in this patch.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c    | 18 ++++++++++++++----
 kernel/bpf/syscall.c |  2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 49b5538a5301..70f0821aca47 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -761,15 +761,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 /**
  *	bpf_prog_select_runtime - select exec runtime for BPF program
  *	@fp: bpf_prog populated with internal BPF program
+ *	@err: pointer to error variable
  *
  * Try to JIT eBPF program, if JIT is not available, use interpreter.
  * The BPF program will be executed via BPF_PROG_RUN() macro.
  */
-int bpf_prog_select_runtime(struct bpf_prog *fp)
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;
 
-	bpf_int_jit_compile(fp);
+	/* eBPF JITs can rewrite the program in case constant
+	 * blinding is active. However, in case of error during
+	 * blinding, bpf_int_jit_compile() must always return a
+	 * valid program, which in this case would simply not
+	 * be JITed, but falls back to the interpreter.
+	 */
+	fp = bpf_int_jit_compile(fp);
 	bpf_prog_lock_ro(fp);
 
 	/* The tail call compatibility check can only be done at
@@ -777,7 +784,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp)
 	 * with JITed or non JITed program concatenations and not
 	 * all eBPF JITs might immediately support all features.
 	 */
-	return bpf_check_tail_call(fp);
+	*err = bpf_check_tail_call(fp);
+
+	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
@@ -859,8 +868,9 @@ const struct bpf_func_proto bpf_tail_call_proto = {
 };
 
 /* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
+	return prog;
 }
 
 bool __weak bpf_helper_changes_skb_data(void *func)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cf5e9f7ad13a..46ecce4b79ed 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -762,7 +762,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);
 
 	/* eBPF program is ready to be JITed */
-	err = bpf_prog_select_runtime(prog);
+	prog = bpf_prog_select_runtime(prog, &err);
 	if (err < 0)
 		goto free_used_maps;
 
-- 
cgit 


From 4f3446bb809f20ad56cadf712e6006815ae7a8f9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:32 +0200
Subject: bpf: add generic constant blinding for use in jits

This work adds a generic facility for use from eBPF JIT compilers
that allows for further hardening of JIT generated images through
blinding constants. In response to the original work on BPF JIT
spraying published by Keegan McAllister [1], most BPF JITs were
changed to make images read-only and start at a randomized offset
in the page, where the rest was filled with trap instructions. We
have this nowadays in x86, arm, arm64 and s390 JIT compilers.
Additionally, later work also made eBPF interpreter images read
only for kernels supporting DEBUG_SET_MODULE_RONX, that is, x86,
arm, arm64 and s390 archs as well currently. This is done by
default for mentioned JITs when JITing is enabled. Furthermore,
we had a generic and configurable constant blinding facility on our
todo for quite some time now to further make spraying harder, and
first implementation since around netconf 2016.

We found that for systems where untrusted users can load cBPF/eBPF
code where JIT is enabled, start offset randomization helps a bit
to make jumps into crafted payload harder, but in case where larger
programs that cross page boundary are injected, we again have some
part of the program opcodes at a page start offset. With improved
guessing and more reliable payload injection, chances can increase
to jump into such payload. Elena Reshetova recently wrote a test
case for it [2, 3]. Moreover, eBPF comes with 64 bit constants, which
can leave some more room for payloads. Note that for all this,
additional bugs in the kernel are still required to make the jump
(and of course to guess right, to not jump into a trap) and naturally
the JIT must be enabled, which is disabled by default.

For helping mitigation, the general idea is to provide an option
bpf_jit_harden that admins can tweak along with bpf_jit_enable, so
that for cases where JIT should be enabled for performance reasons,
the generated image can be further hardened with blinding constants
for unprivileged users (bpf_jit_harden == 1), with trading off
performance for these, but not for privileged ones. We also added
the option of blinding for all users (bpf_jit_harden == 2), which
is quite helpful for testing f.e. with test_bpf.ko. There are no
further hardening levels of the bpf_jit_harden switch intended; the
rationale is to have it dead simple to use as on/off. Since this
functionality would need to be duplicated over and over for JIT
compilers to use, which are already complex enough, we provide a
generic eBPF byte-code level based blinding implementation, which is
then just transparently JITed. JIT compilers need to make only a few
changes to integrate this facility and can be migrated one by one.

This option is for eBPF JITs and will be used in x86, arm64, s390
without too much effort, and soon ppc64 JITs, thus that native eBPF
can be blinded as well as cBPF to eBPF migrations, so that both can
be covered with a single implementation. The rule for JITs is that
bpf_jit_blind_constants() must be called from bpf_int_jit_compile(),
and in case blinding is disabled, we follow normally with JITing the
passed program. In case blinding is enabled and we fail during the
process of blinding itself, we must return with the interpreter.
Similarly, in case the JITing process after the blinding failed, we
return normally to the interpreter with the non-blinded code. Meaning,
interpreter doesn't change in any way and operates on eBPF code as
usual. For doing this pre-JIT blinding step, we need to make use of
a helper/auxiliary register, here BPF_REG_AX. This is strictly internal
to the JIT and not in any way part of the eBPF architecture. This is
the same way that JITs internally make use of some helper registers
when emitting code, only that here the helper register is one
abstraction level higher in eBPF bytecode, but nevertheless in JIT
phase. That helper register is needed since f.e. manually written
program can issue loads to all registers of eBPF architecture.

The core concept with the additional register is: blind out all 32
and 64 bit constants by converting BPF_K based instructions into a
small sequence from K_VAL into ((RND ^ K_VAL) ^ RND). Therefore, this
is transformed into: BPF_REG_AX := (RND ^ K_VAL), BPF_REG_AX ^= RND,
and REG <OP> BPF_REG_AX, so actual operation on the target register
is translated from BPF_K into BPF_X one that is operating on
BPF_REG_AX's content. During rewriting phase when blinding, RND is
newly generated via prandom_u32() for each processed instruction.
64 bit loads are split into two 32 bit loads to make translation and
patching not too complex. Only basic thing required by JITs is to
call the helper bpf_jit_blind_constants()/bpf_jit_prog_release_other()
pair, and to map BPF_REG_AX into an unused register.

Small bpf_jit_disasm extract from [2] when applied to x86 JIT:

echo 0 > /proc/sys/net/core/bpf_jit_harden

  ffffffffa034f5e9 + <x>:
  [...]
  39:   mov    $0xa8909090,%eax
  3e:   mov    $0xa8909090,%eax
  43:   mov    $0xa8ff3148,%eax
  48:   mov    $0xa89081b4,%eax
  4d:   mov    $0xa8900bb0,%eax
  52:   mov    $0xa810e0c1,%eax
  57:   mov    $0xa8908eb4,%eax
  5c:   mov    $0xa89020b0,%eax
  [...]

echo 1 > /proc/sys/net/core/bpf_jit_harden

  ffffffffa034f1e5 + <x>:
  [...]
  39:   mov    $0xe1192563,%r10d
  3f:   xor    $0x4989b5f3,%r10d
  46:   mov    %r10d,%eax
  49:   mov    $0xb8296d93,%r10d
  4f:   xor    $0x10b9fd03,%r10d
  56:   mov    %r10d,%eax
  59:   mov    $0x8c381146,%r10d
  5f:   xor    $0x24c7200e,%r10d
  66:   mov    %r10d,%eax
  69:   mov    $0xeb2a830e,%r10d
  6f:   xor    $0x43ba02ba,%r10d
  76:   mov    %r10d,%eax
  79:   mov    $0xd9730af,%r10d
  7f:   xor    $0xa5073b1f,%r10d
  86:   mov    %r10d,%eax
  89:   mov    $0x9a45662b,%r10d
  8f:   xor    $0x325586ea,%r10d
  96:   mov    %r10d,%eax
  [...]

As can be seen, original constants that carry payload are hidden
when enabled, actual operations are transformed from constant-based
to register-based ones, making jumps into constants ineffective.
Above extract/example uses single BPF load instruction over and
over, but of course all instructions with constants are blinded.

Performance wise, JIT with blinding performs a bit slower than just
JIT and faster than interpreter case. This is expected, since we
still get all the performance benefits from JITing and in normal
use-cases not every single instruction needs to be blinded. Summing
up all 296 test cases averaged over multiple runs from test_bpf.ko
suite, interpreter was 55% slower than JIT only and JIT with blinding
was 8% slower than JIT only. Since there are also some extremes in
the test suite, I expect for ordinary workloads that the performance
for the JIT with blinding case is even closer to JIT only case,
f.e. nmap test case from suite has averaged timings in ns 29 (JIT),
35 (+ blinding), and 151 (interpreter).

BPF test suite, seccomp test suite, eBPF sample code and various
bigger networking eBPF programs have been tested with this and were
running fine. For testing purposes, I also adapted interpreter and
redirected blinded eBPF image to interpreter and also here all tests
pass.

  [1] http://mainisusuallyafunction.blogspot.com/2012/11/attacking-hardened-linux-systems-with.html
  [2] https://github.com/01org/jit-spray-poc-for-ksp/
  [3] http://www.openwall.com/lists/kernel-hardening/2016/05/03/5

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Elena Reshetova <elena.reshetova@intel.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 203 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 70f0821aca47..f1e8a0def99b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -243,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
 	module_memfree(hdr);
 }
+
+int bpf_jit_harden __read_mostly;
+
+static int bpf_jit_blind_insn(const struct bpf_insn *from,
+			      const struct bpf_insn *aux,
+			      struct bpf_insn *to_buff)
+{
+	struct bpf_insn *to = to_buff;
+	u32 imm_rnd = prandom_u32();
+	s16 off;
+
+	BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
+	BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+
+	if (from->imm == 0 &&
+	    (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
+	     from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
+		*to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
+		goto out;
+	}
+
+	switch (from->code) {
+	case BPF_ALU | BPF_ADD | BPF_K:
+	case BPF_ALU | BPF_SUB | BPF_K:
+	case BPF_ALU | BPF_AND | BPF_K:
+	case BPF_ALU | BPF_OR  | BPF_K:
+	case BPF_ALU | BPF_XOR | BPF_K:
+	case BPF_ALU | BPF_MUL | BPF_K:
+	case BPF_ALU | BPF_MOV | BPF_K:
+	case BPF_ALU | BPF_DIV | BPF_K:
+	case BPF_ALU | BPF_MOD | BPF_K:
+		*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_ALU64 | BPF_ADD | BPF_K:
+	case BPF_ALU64 | BPF_SUB | BPF_K:
+	case BPF_ALU64 | BPF_AND | BPF_K:
+	case BPF_ALU64 | BPF_OR  | BPF_K:
+	case BPF_ALU64 | BPF_XOR | BPF_K:
+	case BPF_ALU64 | BPF_MUL | BPF_K:
+	case BPF_ALU64 | BPF_MOV | BPF_K:
+	case BPF_ALU64 | BPF_DIV | BPF_K:
+	case BPF_ALU64 | BPF_MOD | BPF_K:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_JMP | BPF_JEQ  | BPF_K:
+	case BPF_JMP | BPF_JNE  | BPF_K:
+	case BPF_JMP | BPF_JGT  | BPF_K:
+	case BPF_JMP | BPF_JGE  | BPF_K:
+	case BPF_JMP | BPF_JSGT | BPF_K:
+	case BPF_JMP | BPF_JSGE | BPF_K:
+	case BPF_JMP | BPF_JSET | BPF_K:
+		/* Accommodate for extra offset in case of a backjump. */
+		off = from->off;
+		if (off < 0)
+			off -= 2;
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
+		break;
+
+	case BPF_LD | BPF_ABS | BPF_W:
+	case BPF_LD | BPF_ABS | BPF_H:
+	case BPF_LD | BPF_ABS | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+		break;
+
+	case BPF_LD | BPF_IND | BPF_W:
+	case BPF_LD | BPF_IND | BPF_H:
+	case BPF_LD | BPF_IND | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
+		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+		break;
+
+	case BPF_LD | BPF_IMM | BPF_DW:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+		*to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
+		break;
+	case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
+		*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
+		*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_REG(BPF_OR,  aux[0].dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_ST | BPF_MEM | BPF_DW:
+	case BPF_ST | BPF_MEM | BPF_W:
+	case BPF_ST | BPF_MEM | BPF_H:
+	case BPF_ST | BPF_MEM | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
+		break;
+	}
+out:
+	return to - to_buff;
+}
+
+static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
+					      gfp_t gfp_extra_flags)
+{
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+			  gfp_extra_flags;
+	struct bpf_prog *fp;
+
+	fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
+	if (fp != NULL) {
+		kmemcheck_annotate_bitfield(fp, meta);
+
+		/* aux->prog still points to the fp_other one, so
+		 * when promoting the clone to the real program,
+		 * this still needs to be adapted.
+		 */
+		memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
+	}
+
+	return fp;
+}
+
+static void bpf_prog_clone_free(struct bpf_prog *fp)
+{
+	/* aux was stolen by the other clone, so we cannot free
+	 * it from this path! It will be freed eventually by the
+	 * other program on release.
+	 *
+	 * At this point, we don't need a deferred release since
+	 * clone is guaranteed to not be locked.
+	 */
+	fp->aux = NULL;
+	__bpf_prog_free(fp);
+}
+
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
+{
+	/* We have to repoint aux->prog to self, as we don't
+	 * know whether fp here is the clone or the original.
+	 */
+	fp->aux->prog = fp;
+	bpf_prog_clone_free(fp_other);
+}
+
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
+{
+	struct bpf_insn insn_buff[16], aux[2];
+	struct bpf_prog *clone, *tmp;
+	int insn_delta, insn_cnt;
+	struct bpf_insn *insn;
+	int i, rewritten;
+
+	if (!bpf_jit_blinding_enabled())
+		return prog;
+
+	clone = bpf_prog_clone_create(prog, GFP_USER);
+	if (!clone)
+		return ERR_PTR(-ENOMEM);
+
+	insn_cnt = clone->len;
+	insn = clone->insnsi;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		/* We temporarily need to hold the original ld64 insn
+		 * so that we can still access the first part in the
+		 * second blinding run.
+		 */
+		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+		    insn[1].code == 0)
+			memcpy(aux, insn, sizeof(aux));
+
+		rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+		if (!rewritten)
+			continue;
+
+		tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
+		if (!tmp) {
+			/* Patching may have repointed aux->prog during
+			 * realloc from the original one, so we need to
+			 * fix it up here on error.
+			 */
+			bpf_jit_prog_release_other(prog, clone);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		clone = tmp;
+		insn_delta = rewritten - 1;
+
+		/* Walk new program and skip insns we just inserted. */
+		insn = clone->insnsi + i + insn_delta;
+		insn_cnt += insn_delta;
+		i        += insn_delta;
+	}
+
+	return clone;
+}
 #endif /* CONFIG_BPF_JIT */
 
 /* Base function for offset calculation. Needs to go into .text section,
-- 
cgit 


From b7552e1bccbe3da9c8e7386c6188e8ea4667c8e7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 18 May 2016 14:14:28 +0200
Subject: bpf: rather use get_random_int for randomizations

Start address randomization and blinding in BPF currently use
prandom_u32(). prandom_u32() values are not exposed to unprivileged
user space to my knowledge, but given other kernel facilities such as
ASLR, stack canaries, etc make use of stronger get_random_int(), we
better make use of it here as well given blinding requests successively
new random values. get_random_int() has minimal entropy pool depletion,
is not cryptographically secure, but doesn't need to be for our use
cases here.

Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f1e8a0def99b..b94a36550591 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -231,7 +231,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	hdr->pages = size / PAGE_SIZE;
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
-	start = (prandom_u32() % hole) & ~(alignment - 1);
+	start = (get_random_int() % hole) & ~(alignment - 1);
 
 	/* Leave a random number of instructions before BPF code. */
 	*image_ptr = &hdr->image[start];
@@ -251,7 +251,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      struct bpf_insn *to_buff)
 {
 	struct bpf_insn *to = to_buff;
-	u32 imm_rnd = prandom_u32();
+	u32 imm_rnd = get_random_int();
 	s16 off;
 
 	BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
-- 
cgit 


From e27f4a942a0ee4b84567a3c6cfa84f273e55cbb7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 20 May 2016 17:22:48 -0500
Subject: bpf: Use mount_nodev not mount_ns to mount the bpf filesystem

While reviewing the filesystems that set FS_USERNS_MOUNT I spotted the
bpf filesystem.  Looking at the code I saw a broken usage of mount_ns
with current->nsproxy->mnt_ns. As the code does not acquire a
reference to the mount namespace it can not possibly be correct to
store the mount namespace on the superblock as it does.

Replace mount_ns with mount_nodev so that each mount of the bpf
filesystem returns a distinct instance, and the code is not buggy.

In discussion with Hannes Frederic Sowa it was reported that the use
of mount_ns was an attempt to have one bpf instance per mount
namespace, in an attempt to keep resources that pin resources from
hiding.  That intent simply does not work, the vfs is not built to
allow that kind of behavior.  Which means that the bpf filesystem
really is buggy both semantically and in its implementation as it does
not nor can it implement the original intent.

This change is userspace visible, but my experience with similar
filesystems leads me to believe nothing will break with a model of each
mount of the bpf filesystem is distinct from all others.

Fixes: b2197755b263 ("bpf: add support for persistent maps/progs")
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 71b75d9c81da..04be7021f848 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -357,7 +357,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
 static struct dentry *bpf_mount(struct file_system_type *type, int flags,
 				const char *dev_name, void *data)
 {
-	return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+	return mount_nodev(type, flags, data, bpf_fill_super);
 }
 
 static struct file_system_type bpf_fs_type = {
-- 
cgit 


From d91b28ed42de99217efb2e8cb0357263d6fb737c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 19 May 2016 18:17:13 -0700
Subject: bpf: support decreasing order in direct packet access

when packet headers are accessed in 'decreasing' order (like TCP port
may be fetched before the program reads IP src) the llvm may generate
the following code:
[...]                // R7=pkt(id=0,off=22,r=70)
r2 = *(u32 *)(r7 +0) // good access
[...]
r7 += 40             // R7=pkt(id=0,off=62,r=70)
r8 = *(u32 *)(r7 +0) // good access
[...]
r1 = *(u32 *)(r7 -20) // this one will fail though it's within a safe range
                      // it's doing *(u32*)(skb->data + 42)
Fix verifier to recognize such code pattern

Also, it turned out that the 'off > range' condition is not a verifier bug.
It's a buggy program that may do something like:
if (ptr + 50 > data_end)
  return 0;
ptr += 60;
*(u32*)ptr;
in such case emit
"invalid access to packet, off=0 size=4, R1(id=0,off=60,r=50)" error message,
so all information is available for the program author to fix the program.

Fixes: 969bf05eb3ce ("bpf: direct packet access")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a08d66215245..d54e34874579 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -683,15 +683,11 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 {
 	struct reg_state *regs = env->cur_state.regs;
 	struct reg_state *reg = &regs[regno];
-	int linear_size = (int) reg->range - (int) reg->off;
 
-	if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) {
-		verbose("verifier bug\n");
-		return -EFAULT;
-	}
-	if (off < 0 || off + size > linear_size) {
-		verbose("invalid access to packet, off=%d size=%d, allowed=%d\n",
-			off, size, linear_size);
+	off += reg->off;
+	if (off < 0 || off + size > reg->range) {
+		verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+			off, size, regno, reg->id, reg->off, reg->range);
 		return -EACCES;
 	}
 	return 0;
-- 
cgit 


From 1b9b69ecb3a5236d4d3da0f0fa11af916371841e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 19 May 2016 18:17:14 -0700
Subject: bpf: teach verifier to recognize imm += ptr pattern

Humans don't write C code like:
  u8 *ptr = skb->data;
  int imm = 4;
  imm += ptr;
but from llvm backend point of view 'imm' and 'ptr' are registers and
imm += ptr may be preferred vs ptr += imm depending which register value
will be used further in the code, while verifier can only recognize ptr += imm.
That caused small unrelated changes in the C code of the bpf program to
trigger rejection by the verifier. Therefore teach the verifier to recognize
both ptr += imm and imm += ptr.
For example:
when R6=pkt(id=0,off=0,r=62) R7=imm22
after r7 += r6 instruction
will be R6=pkt(id=0,off=0,r=62) R7=pkt(id=0,off=22,r=62)

Fixes: 969bf05eb3ce ("bpf: direct packet access")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d54e34874579..668e07903c8f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1245,6 +1245,7 @@ static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn)
 	struct reg_state *regs = env->cur_state.regs;
 	struct reg_state *dst_reg = &regs[insn->dst_reg];
 	struct reg_state *src_reg = &regs[insn->src_reg];
+	struct reg_state tmp_reg;
 	s32 imm;
 
 	if (BPF_SRC(insn->code) == BPF_K) {
@@ -1267,6 +1268,19 @@ add_imm:
 		 */
 		dst_reg->off += imm;
 	} else {
+		if (src_reg->type == PTR_TO_PACKET) {
+			/* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
+			tmp_reg = *dst_reg;  /* save r7 state */
+			*dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
+			src_reg = &tmp_reg;  /* pretend it's src_reg state */
+			/* if the checks below reject it, the copy won't matter,
+			 * since we're rejecting the whole program. If all ok,
+			 * then imm22 state will be added to r7
+			 * and r7 will be pkt(id=0,off=22,r=62) while
+			 * r6 will stay as pkt(id=0,off=0,r=62)
+			 */
+		}
+
 		if (src_reg->type == CONST_IMM) {
 			/* pkt_ptr += reg where reg is known constant */
 			imm = src_reg->imm;
@@ -1565,7 +1579,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			return 0;
 		} else if (opcode == BPF_ADD &&
 			   BPF_CLASS(insn->code) == BPF_ALU64 &&
-			   dst_reg->type == PTR_TO_PACKET) {
+			   (dst_reg->type == PTR_TO_PACKET ||
+			    (BPF_SRC(insn->code) == BPF_X &&
+			     regs[insn->src_reg].type == PTR_TO_PACKET))) {
 			/* ptr_to_packet += K|X */
 			return check_packet_ptr_add(env, insn);
 		} else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
-- 
cgit 


From 612bacad78ba6d0a91166fc4487af114bac172a8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 22 May 2016 23:16:18 +0200
Subject: bpf, inode: disallow userns mounts

This is a follow-up to commit e27f4a942a0e ("bpf: Use mount_nodev not
mount_ns to mount the bpf filesystem"); it removes the FS_USERNS_MOUNT flag.

The original idea was to have a per-mountns instance instead of a
single global fs instance, but that didn't work out and we had to
switch to the mount_nodev() model. The intent of that middle ground was
to prevent users who don't play nice from creating endless instances
of bpf fs, which are difficult to control and discover from an admin
point of view, while at the same time allowing us to be more flexible
with regard to namespaces.

Therefore, since we have now switched to mount_nodev() as a fix, where
individual instances are created, we also need to remove the userns
mount flag along with it to avoid running into the mentioned situation.
I don't expect any breakage at this early point in time from removing
the flag, and we can revisit this later should the requirement for
it come up with future users. This and commit e27f4a942a0e have
been split to facilitate tracking should either of them run into the
unlikely case of causing a regression.

Fixes: b2197755b263 ("bpf: add support for persistent maps/progs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 04be7021f848..318858edb1cd 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -365,7 +365,6 @@ static struct file_system_type bpf_fs_type = {
 	.name		= "bpf",
 	.mount		= bpf_mount,
 	.kill_sb	= kill_litter_super,
-	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 MODULE_ALIAS_FS("bpf");
-- 
cgit