From 06ee7115b0d1742de745ad143fb5e06d77d27fba Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:40 -0700 Subject: bpf: add verifier stats and log_level bit 2 In order to understand the verifier bottlenecks add various stats and extend log_level: log_level 1 and 2 are kept as-is: bit 0 - level=1 - print every insn and verifier state at branch points bit 1 - level=2 - print every insn and verifier state at every insn bit 2 - level=4 - print verifier error and stats at the end of verification When verifier rejects the program the libbpf is trying to load the program twice. Once with log_level=0 (no messages, only error code is reported to user space) and second time with log_level=1 to tell the user why the verifier rejected it. With introduction of bit 2 - level=4 the libbpf can choose to always use that level and load programs once, since the verification speed is not affected and in case of error the verbose message will be available. Note that the verifier stats are not part of uapi just like all other verbose messages. They're expected to change in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7d8228d1c898..f7e15eeb60bb 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -248,6 +248,12 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) return log->len_used >= log->len_total - 1; } +#define BPF_LOG_LEVEL1 1 +#define BPF_LOG_LEVEL2 2 +#define BPF_LOG_STATS 4 +#define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) +#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) + static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log->level && log->ubuf && !bpf_verifier_log_full(log); @@ -284,6 +290,21 @@ struct bpf_verifier_env { struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; + /* number of instructions analyzed by the verifier */ + u32 insn_processed; + /* total verification time */ + u64 verification_time; + /* maximum number of verifier states kept in 'branching' instructions */ + u32 max_states_per_insn; + /* total number of allocated verifier states */ + u32 total_states; + /* some states are freed during program analysis. + * this is peak number of states. this number dominates kernel + * memory consumption during verification + */ + u32 peak_states; + /* longest register parentage chain walked for liveness marking */ + u32 longest_mark_read_walk; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, -- cgit From 9f4686c41bdff051f557accb531af79dd1773687 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:41 -0700 Subject: bpf: improve verification speed by droping states Branch instructions, branch targets and calls in a bpf program are the places where the verifier remembers states that led to successful verification of the program. These states are used to prune brute force program analysis. For unprivileged programs there is a limit of 64 states per such 'branching' instructions (maximum length is tracked by max_states_per_insn counter introduced in the previous patch). Simply reducing this threshold to 32 or lower increases insn_processed metric to the point that small valid programs get rejected. For root programs there is no limit and cilium programs can have max_states_per_insn to be 100 or higher. Walking 100+ states multiplied by number of 'branching' insns during verification consumes significant amount of cpu time. Turned out simple LRU-like mechanism can be used to remove states that unlikely will be helpful in future search pruning. This patch introduces hit_cnt and miss_cnt counters: hit_cnt - this many times this state successfully pruned the search miss_cnt - this many times this state was not equivalent to other states (and that other states were added to state list) The heuristic introduced in this patch is: if (sl->miss_cnt > sl->hit_cnt * 3 + 3) /* drop this state from future considerations */ Higher numbers increase max_states_per_insn (allow more states to be considered for pruning) and slow verification speed, but do not meaningfully reduce insn_processed metric. Lower numbers drop too many states and insn_processed increases too much. Many different formulas were considered. This one is simple and works well enough in practice. (the analysis was done on selftests/progs/* and on cilium programs) The end result is this heuristic improves verification speed by 10 times. Large synthetic programs that used to take a second more now take 1/10 of a second. In cases where max_states_per_insn used to be 100 or more, now it's ~10. There is a slight increase in insn_processed for cilium progs: before after bpf_lb-DLB_L3.o 1831 1838 bpf_lb-DLB_L4.o 3029 3218 bpf_lb-DUNKNOWN.o 1064 1064 bpf_lxc-DDROP_ALL.o 26309 26935 bpf_lxc-DUNKNOWN.o 33517 34439 bpf_netdev.o 9713 9721 bpf_overlay.o 6184 6184 bpf_lcx_jit.o 37335 39389 And 2-3 times improvement in the verification speed. Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f7e15eeb60bb..fc8254d6b569 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -207,6 +207,7 @@ struct bpf_verifier_state { struct bpf_verifier_state_list { struct bpf_verifier_state state; struct bpf_verifier_state_list *next; + int miss_cnt, hit_cnt; }; /* Possible states for alu_state member. */ @@ -280,6 +281,7 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_verifier_state_list *free_list; struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ -- cgit From c04c0d2b968ac45d6ef020316808ef6c82325a82 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:45 -0700 Subject: bpf: increase complexity limit and maximum program size Large verifier speed improvements allow to increase verifier complexity limit. Now regardless of the program composition and its size it takes little time for the verifier to hit insn_processed limit. On typical x86 machine non-debug kernel processes 1M instructions in 1/10 of a second. (before these speed improvements specially crafted programs could be hitting multi-second verification times) Full kasan kernel with debug takes ~1 second for the same 1M insns. Hence bump the BPF_COMPLEXITY_LIMIT_INSNS limit to 1M. Also increase the number of instructions per program from 4k to internal BPF_COMPLEXITY_LIMIT_INSNS limit. 4k limit was confusing to users, since small programs with hundreds of insns could be hitting BPF_COMPLEXITY_LIMIT_INSNS limit. Sometimes adding more insns and bpf_trace_printk debug statements would make the verifier accept the program while removing code would make the verifier reject it. Some user space application started to add #define MAX_FOO to their programs and do: MAX_FOO=100; again: compile with MAX_FOO; try to load; if (fails_to_load) { reduce MAX_FOO; goto again; } to be able to fit maximum amount of processing into single program. Other users artificially split their single program into a set of programs and use all 32 iterations of tail_calls to increase compute limits. And the most advanced folks used unlimited tc-bpf filter list to execute many bpf programs. Essentially the users managed to workaround 4k insn limit. This patch removes the limit for root programs from uapi. BPF_COMPLEXITY_LIMIT_INSNS is the kernel internal limit and success to load the program no longer depends on program size, but on 'smartness' of the verifier only. The verifier will continue to get smarter with every kernel release. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f62897198844..a445194b5fb6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -421,6 +421,7 @@ struct bpf_array { }; }; +#define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 32 struct bpf_event_entry { -- cgit From d8eca5bbb2be9bc7546f9e733786fa2f1a594c67 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:03 +0200 Subject: bpf: implement lookup-free direct value access for maps This generic extension to BPF maps allows for directly loading an address residing inside a BPF map value as a single BPF ldimm64 instruction! The idea is similar to what BPF_PSEUDO_MAP_FD does today, which is a special src_reg flag for ldimm64 instruction that indicates that inside the first part of the double insns's imm field is a file descriptor which the verifier then replaces as a full 64bit address of the map into both imm parts. For the newly added BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following: the first part of the double insns's imm field is again a file descriptor corresponding to the map, and the second part of the imm field is an offset into the value. The verifier will then replace both imm parts with an address that points into the BPF map value at the given value offset for maps that support this operation. Currently supported is array map with single entry. It is possible to support more than just single map element by reusing both 16bit off fields of the insns as a map index, so full array map lookup could be expressed that way. It hasn't been implemented here due to lack of concrete use case, but could easily be done so in future in a compatible way, since both off fields right now have to be 0 and would correctly denote a map index 0. The BPF_PSEUDO_MAP_VALUE is a distinct flag as otherwise with BPF_PSEUDO_MAP_FD we could not differ offset 0 between load of map pointer versus load of map's value at offset 0, and changing BPF_PSEUDO_MAP_FD's encoding into off by one to differ between regular map pointer and map value pointer would add unnecessary complexity and increases barrier for debugability thus less suitable. Using the second part of the imm field as an offset into the value does /not/ come with limitations since maximum possible value size is in u32 universe anyway. This optimization allows for efficiently retrieving an address to a map value memory area without having to issue a helper call which needs to prepare registers according to calling convention, etc, without needing the extra NULL test, and without having to add the offset in an additional instruction to the value base pointer. The verifier then treats the destination register as PTR_TO_MAP_VALUE with constant reg->off from the user passed offset from the second imm field, and guarantees that this is within bounds of the map value. Any subsequent operations are normally treated as typical map value handling without anything extra needed from verification side. The two map operations for direct value access have been added to array map for now. In future other types could be supported as well depending on the use case. The main use case for this commit is to allow for BPF loader support for global variables that reside in .data/.rodata/.bss sections such that we can directly load the address of them with minimal additional infrastructure required. Loader support has been added in subsequent commits for libbpf library. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/linux/bpf_verifier.h | 4 ++++ include/uapi/linux/bpf.h | 13 ++++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a445194b5fb6..bd93a592dd29 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -57,6 +57,12 @@ struct bpf_map_ops { const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); + + /* Direct value access helpers. */ + int (*map_direct_value_addr)(const struct bpf_map *map, + u64 *imm, u32 off); + int (*map_direct_value_meta)(const struct bpf_map *map, + u64 imm, u32 *off); }; struct bpf_map { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fc8254d6b569..b3ab61fe1932 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -224,6 +224,10 @@ struct bpf_insn_aux_data { unsigned long map_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ + struct { + u32 map_index; /* index into used_maps[] */ + u32 map_off; /* offset from value base address */ + }; }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 837024512baf..26cfb5b2c964 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -255,8 +255,19 @@ enum bpf_attach_type { */ #define BPF_F_ANY_ALIGNMENT (1U << 1) -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * two extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd map fd + * insn[1].imm: 0 offset into value + * insn[0].off: 0 0 + * insn[1].off: 0 0 + * ldimm64 rewrite: address of map address of map[0]+offset + * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_VALUE 2 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function -- cgit From 591fe9888d7809d9ee5c828020b6c6ae27c37229 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:05 +0200 Subject: bpf: add program side {rd, wr}only support for maps This work adds two new map creation flags BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG in order to allow for read-only or write-only BPF maps from a BPF program side. Today we have BPF_F_RDONLY and BPF_F_WRONLY, but this only applies to system call side, meaning the BPF program has full read/write access to the map as usual while bpf(2) calls with map fd can either only read or write into the map depending on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allows for the exact opposite such that verifier is going to reject program loads if write into a read-only map or a read into a write-only map is detected. For read-only map case also some helpers are forbidden for programs that would alter the map state such as map deletion, update, etc. As opposed to the two BPF_F_RDONLY / BPF_F_WRONLY flags, BPF_F_RDONLY_PROG as well as BPF_F_WRONLY_PROG really do correspond to the map lifetime. We've enabled this generic map extension to various non-special maps holding normal user data: array, hash, lru, lpm, local storage, queue and stack. Further generic map types could be followed up in future depending on use-case. Main use case here is to forbid writes into .rodata map values from verifier side. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 29 +++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 6 +++++- 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bd93a592dd29..be20804631b5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -430,6 +430,35 @@ struct bpf_array { #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 32 +#define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ + BPF_F_RDONLY_PROG | \ + BPF_F_WRONLY | \ + BPF_F_WRONLY_PROG) + +#define BPF_MAP_CAN_READ BIT(0) +#define BPF_MAP_CAN_WRITE BIT(1) + +static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) +{ + u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); + + /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is + * not possible. + */ + if (access_flags & BPF_F_RDONLY_PROG) + return BPF_MAP_CAN_READ; + else if (access_flags & BPF_F_WRONLY_PROG) + return BPF_MAP_CAN_WRITE; + else + return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; +} + +static inline bool bpf_map_flags_access_ok(u32 access_flags) +{ + return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != + (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); +} + struct bpf_event_entry { struct perf_event *event; struct file *perf_file; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 26cfb5b2c964..d275446d807c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -294,7 +294,7 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U -/* Flags for accessing BPF object */ +/* Flags for accessing BPF object from syscall side. */ #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) @@ -304,6 +304,10 @@ enum bpf_attach_type { /* Zero-initialize hash function seed. This should only be used for testing. */ #define BPF_F_ZERO_SEED (1U << 6) +/* Flags for accessing BPF object from program side. */ +#define BPF_F_RDONLY_PROG (1U << 7) +#define BPF_F_WRONLY_PROG (1U << 8) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) -- cgit From 87df15de441bd4add7876ef584da8cabdd9a042a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:06 +0200 Subject: bpf: add syscall side map freeze support This patch adds a new BPF_MAP_FREEZE command which allows to "freeze" the map globally as read-only / immutable from syscall side. Map permission handling has been refactored into map_get_sys_perms() and drops FMODE_CAN_WRITE in case of locked map. Main use case is to allow for setting up .rodata sections from the BPF ELF which are loaded into the kernel, meaning BPF loader first allocates map, sets up map value by copying .rodata section into it and once complete, it calls BPF_MAP_FREEZE on the map fd to prevent further modifications. Right now BPF_MAP_FREEZE only takes map fd as argument while remaining bpf_attr members are required to be zero. I didn't add write-only locking here as counterpart since I don't have a concrete use-case for it on my side, and I think it makes probably more sense to wait once there is actually one. In that case bpf_attr can be extended as usual with a flag field and/or others where flag 0 means that we lock the map read-only hence this doesn't prevent to add further extensions to BPF_MAP_FREEZE upon need. A map creation flag like BPF_F_WRONCE was not considered for couple of reasons: i) in case of a generic implementation, a map can consist of more than just one element, thus there could be multiple map updates needed to set the map into a state where it can then be made immutable, ii) WRONCE indicates exact one-time write before it is then set immutable. A generic implementation would set a bit atomically on map update entry (if unset), indicating that every subsequent update from then onwards will need to bail out there. However, map updates can fail, so upon failure that flag would need to be unset again and the update attempt would need to be repeated for it to be eventually made immutable. While this can be made race-free, this approach feels less clean and in combination with reason i), it's not generic enough. A dedicated BPF_MAP_FREEZE command directly sets the flag and caller has the guarantee that map is immutable from syscall side upon successful return for any future syscall invocations that would alter the map state, which is also more intuitive from an API point of view. A command name such as BPF_MAP_LOCK has been avoided as it's too close with BPF map spin locks (which already has BPF_F_LOCK flag). BPF_MAP_FREEZE is so far only enabled for privileged users. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- include/uapi/linux/bpf.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be20804631b5..65f7094c40b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -87,7 +87,8 @@ struct bpf_map { struct btf *btf; u32 pages; bool unpriv_array; - /* 51 bytes hole */ + bool frozen; /* write-once */ + /* 48 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d275446d807c..af1cbd951f26 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -105,6 +105,7 @@ enum bpf_cmd { BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, }; enum bpf_map_type { -- cgit From f063c889c9458354a92b235a51cbb60d30321070 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:08 +0200 Subject: bpf: add specification for BTF Var and DataSec kinds This adds the BTF specification and UAPI bits for supporting BTF Var and DataSec kinds. This is following LLVM upstream commit ac4082b77e07 ("[BPF] Add BTF Var and DataSec Support") which has been merged recently. Var itself is for describing a global variable and DataSec to describe ELF sections e.g. data/bss/rodata sections that hold one or multiple global variables. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/btf.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 7b7475ef2f17..9310652ca4f9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -39,11 +39,11 @@ struct btf_type { * struct, union and fwd */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT and UNION. + /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC and FUNC_PROTO. + * FUNC, FUNC_PROTO and VAR. * "type" is a type_id referring to another type. */ union { @@ -70,8 +70,10 @@ struct btf_type { #define BTF_KIND_RESTRICT 11 /* Restrict */ #define BTF_KIND_FUNC 12 /* Function */ #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ -#define BTF_KIND_MAX 13 -#define NR_BTF_KINDS 14 +#define BTF_KIND_VAR 14 /* Variable */ +#define BTF_KIND_DATASEC 15 /* Section */ +#define BTF_KIND_MAX BTF_KIND_DATASEC +#define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -138,4 +140,26 @@ struct btf_param { __u32 type; }; +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with it's + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ -- cgit From 2824ecb7010f6a20e9a4140512b798469ab066cc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:10 +0200 Subject: bpf: allow for key-less BTF in array map Given we'll be reusing BPF array maps for global data/bss/rodata sections, we need a way to associate BTF DataSec type as its map value type. In usual cases we have this ugly BPF_ANNOTATE_KV_PAIR() macro hack e.g. via 38d5d3b3d5db ("bpf: Introduce BPF_ANNOTATE_KV_PAIR") to get initial map to type association going. While more use cases for it are discouraged, this also won't work for global data since the use of array map is a BPF loader detail and therefore unknown at compilation time. For array maps with just a single entry we make an exception in terms of BTF in that key type is declared optional if value type is of DataSec type. The latter LLVM is guaranteed to emit and it also aligns with how we regard global data maps as just a plain buffer area reusing existing map facilities for allowing things like introspection with existing tools. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 455d31b55828..64cdf2a23d42 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -51,6 +51,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); +bool btf_type_is_void(const struct btf_type *t); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); -- cgit From b0b9395d865e3060d97658fbc9ba3f77fecc8da1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 9 Apr 2019 11:49:09 -0700 Subject: bpf: support input __sk_buff context in BPF_PROG_TEST_RUN Add new set of arguments to bpf_attr for BPF_PROG_TEST_RUN: * ctx_in/ctx_size_in - input context * ctx_out/ctx_size_out - output context The intended use case is to pass some meta data to the test runs that operate on skb (this has being brought up on recent LPC). For programs that use bpf_prog_test_run_skb, support __sk_buff input and output. Initially, from input __sk_buff, copy _only_ cb and priority into skb, all other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the potentially modified __sk_buff back to the userspace. We require all fields of input __sk_buff except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. The API is intentionally vague (i.e. we don't explicitly add __sk_buff to bpf_attr, but ctx_in) to potentially let other test_run types use this interface in the future (this can be xdp_md for xdp types for example). v4: * don't copy more than allowed in bpf_ctx_init [Martin] v3: * handle case where ctx_in is NULL, but ctx_out is not [Martin] * convert size==0 checks to ptr==NULL checks and add some extra ptr checks [Martin] v2: * Addressed comments from Martin Lau Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index af1cbd951f26..31a27dd337dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -412,6 +412,13 @@ union bpf_attr { __aligned_u64 data_out; __u32 repeat; __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ -- cgit From c695865c5c9803f14eef2c99d8a49d9ad60a3383 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 11 Apr 2019 09:12:02 -0700 Subject: bpf: fix missing bpf_check_uarg_tail_zero in BPF_PROG_TEST_RUN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") started using bpf_check_uarg_tail_zero in BPF_PROG_TEST_RUN. However, bpf_check_uarg_tail_zero is not defined for !CONFIG_BPF_SYSCALL: net/bpf/test_run.c: In function ‘bpf_ctx_init’: net/bpf/test_run.c:142:9: error: implicit declaration of function ‘bpf_check_uarg_tail_zero’ [-Werror=implicit-function-declaration] err = bpf_check_uarg_tail_zero(data_in, max_size, size); ^~~~~~~~~~~~~~~~~~~~~~~~ Let's not build net/bpf/test_run.c when CONFIG_BPF_SYSCALL is not set. Reported-by: kbuild test robot Fixes: b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 65f7094c40b4..e4d4c1771ab0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -483,14 +483,6 @@ typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); -int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); -int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); -int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, - const union bpf_attr *kattr, - union bpf_attr __user *uattr); - /* an array of programs to be executed under rcu_lock. * * Typical usage: @@ -681,6 +673,13 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); int array_map_alloc_check(union bpf_attr *attr); +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -792,6 +791,27 @@ static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, { return ERR_PTR(-EOPNOTSUPP); } + +static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + +static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + +static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, -- cgit From 58dfc900faff6db7eb9bf01555622e0b6c74c262 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 9 Apr 2019 15:06:41 +0100 Subject: bpf: add layer 2 encap support to bpf_skb_adjust_room commit 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags") introduced support to bpf_skb_adjust_room for GSO-friendly GRE and UDP encapsulation. For GSO to work for skbs, the inner headers (mac and network) need to be marked. For L3 encapsulation using bpf_skb_adjust_room, the mac and network headers are identical. Here we provide a way of specifying the inner mac header length for cases where L2 encap is desired. Such an approach can support encapsulated ethernet headers, MPLS headers etc. For example to convert from a packet of form [eth][ip][tcp] to [eth][ip][udp][inner mac][ip][tcp], something like the following could be done: headroom = sizeof(iph) + sizeof(struct udphdr) + inner_maclen; ret = bpf_skb_adjust_room(skb, headroom, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_ENCAP_L4_UDP | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | BPF_F_ADJ_ROOM_ENCAP_L2(inner_maclen)); Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 31a27dd337dc..2e96d0b4bf65 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1523,6 +1523,10 @@ union bpf_attr { * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: * Use with ENCAP_L3 flags to further specify the tunnel type. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2(len) **: + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; **len** is the length of the inner MAC header. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2664,10 +2668,16 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 + #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { -- cgit