From e9091bb77f6e130e42f2edd2e6bb4e5a2b2f9d77 Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Tue, 8 Sep 2020 21:22:01 +0800 Subject: bpf: Remove duplicate headers Remove duplicate headers which are included twice. Signed-off-by: Chen Zhou Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200908132201.184005-1-chenzhou10@huawei.com --- net/core/bpf_sk_storage.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index a0d1a3265b71..4a86ea34f29e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -12,7 +12,6 @@ #include #include #include -#include DEFINE_BPF_STORAGE_CACHE(sk_cache); -- cgit From 654785a1afe1b1428106c322c7ea650e3f8d29e9 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 9 Sep 2020 17:27:10 +0100 Subject: net: sockmap: Remove unnecessary sk_fullsock checks The lookup paths for sockmap and sockhash currently include a check that returns NULL if the socket we just found is not a full socket. However, this check is not necessary. On insertion we ensure that we have a full socket (caveat around sock_ops), so request sockets are not a problem. Time-wait sockets are allocated separate from the original socket and then fed into the hashdance. They don't affect the sockets already stored in the sockmap. Suggested-by: Jakub Sitnicki Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200909162712.221874-2-lmb@cloudflare.com --- net/core/sock_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 078386d7d9a2..82494810d0ee 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -382,7 +382,7 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); - if (!sk || !sk_fullsock(sk)) + if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; @@ -1110,7 +1110,7 @@ static void *sock_hash_lookup(struct bpf_map *map, void *key) struct sock *sk; sk = __sock_hash_lookup_elem(map, key); - if (!sk || !sk_fullsock(sk)) + if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; -- cgit From 0365351524d7560d8ed7a42801a15252c6c56f41 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 9 Sep 2020 17:27:11 +0100 Subject: net: Allow iterating sockmap and sockhash Add bpf_iter support for sockmap / sockhash, based on the bpf_sk_storage and hashtable implementation. sockmap and sockhash share the same iteration context: a pointer to an arbitrary key and a pointer to a socket. Both pointers may be NULL, and so BPF has to perform a NULL check before accessing them. Technically it's not possible for sockhash iteration to yield a NULL socket, but we ignore this to be able to use a single iteration point. Iteration will visit all keys that remain unmodified during the lifetime of the iterator. It may or may not visit newly added ones. Switch from using rcu_dereference_raw to plain rcu_dereference, so we gain another guard rail if CONFIG_PROVE_RCU is enabled. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200909162712.221874-3-lmb@cloudflare.com --- net/core/sock_map.c | 280 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 278 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 82494810d0ee..e1f05e3fa1d0 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -2,6 +2,7 @@ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include +#include #include #include #include @@ -703,6 +704,109 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +struct sock_map_seq_info { + struct bpf_map *map; + struct sock *sk; + u32 index; +}; + +struct bpf_iter__sockmap { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(struct sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, + struct sock *sk) + +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) +{ + if (unlikely(info->index >= info->map->max_entries)) + return NULL; + + info->sk = __sock_map_lookup_elem(info->map, info->index); + + /* can't return sk directly, since that might be NULL */ + return info; +} + +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct sock_map_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_map_seq_stop */ + rcu_read_lock(); + return sock_map_seq_lookup_elem(info); +} + +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock_map_seq_info *info = seq->private; + + ++*pos; + ++info->index; + + return sock_map_seq_lookup_elem(info); +} + +static int sock_map_seq_show(struct seq_file *seq, void *v) +{ + struct sock_map_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !v); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + ctx.sk = info->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)sock_map_seq_show(seq, NULL); + + /* pairs with sock_map_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_map_seq_ops = { + .start = sock_map_seq_start, + .next = sock_map_seq_next, + .stop = sock_map_seq_stop, + .show = sock_map_seq_show, +}; + +static int sock_map_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_map_seq_info *info = priv_data; + + info->map = aux->map; + return 0; +} + +static const struct bpf_iter_seq_info sock_map_iter_seq_info = { + .seq_ops = &sock_map_seq_ops, + .init_seq_private = sock_map_init_seq_private, + .seq_priv_size = sizeof(struct sock_map_seq_info), +}; + static int sock_map_btf_id; const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -717,6 +821,7 @@ const struct bpf_map_ops sock_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_stab", .map_btf_id = &sock_map_btf_id, + .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { @@ -953,7 +1058,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, if (!elem) goto find_first_elem; - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), + elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); @@ -965,7 +1070,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); @@ -1199,6 +1304,117 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .arg4_type = ARG_ANYTHING, }; +struct sock_hash_seq_info { + struct bpf_map *map; + struct bpf_shtab *htab; + u32 bucket_id; +}; + +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, + struct bpf_shtab_elem *prev_elem) +{ + const struct bpf_shtab *htab = info->htab; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + struct hlist_node *node; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + + /* no more elements, continue in the next bucket */ + info->bucket_id++; + } + + for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { + bucket = &htab->buckets[info->bucket_id]; + node = rcu_dereference(hlist_first_rcu(&bucket->head)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + } + + return NULL; +} + +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct sock_hash_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_hash_seq_stop */ + rcu_read_lock(); + return sock_hash_seq_find_next(info, NULL); +} + +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock_hash_seq_info *info = seq->private; + + ++*pos; + return sock_hash_seq_find_next(info, v); +} + +static int sock_hash_seq_show(struct seq_file *seq, void *v) +{ + struct sock_hash_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_shtab_elem *elem = v; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !elem); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->key; + ctx.sk = elem->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_hash_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)sock_hash_seq_show(seq, NULL); + + /* pairs with sock_hash_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_hash_seq_ops = { + .start = sock_hash_seq_start, + .next = sock_hash_seq_next, + .stop = sock_hash_seq_stop, + .show = sock_hash_seq_show, +}; + +static int sock_hash_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_hash_seq_info *info = priv_data; + + info->map = aux->map; + info->htab = container_of(aux->map, struct bpf_shtab, map); + return 0; +} + +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { + .seq_ops = &sock_hash_seq_ops, + .init_seq_private = sock_hash_init_seq_private, + .seq_priv_size = sizeof(struct sock_hash_seq_info), +}; + static int sock_hash_map_btf_id; const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -1213,6 +1429,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_shtab", .map_btf_id = &sock_hash_map_btf_id, + .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) @@ -1323,3 +1540,62 @@ void sock_map_close(struct sock *sk, long timeout) release_sock(sk); saved_close(sk, timeout); } + +static int sock_map_iter_attach_target(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && + map->map_type != BPF_MAP_TYPE_SOCKHASH) + goto put_map; + + if (prog->aux->max_rdonly_access > map->key_size) { + err = -EACCES; + goto put_map; + } + + aux->map = map; + return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); +} + +static struct bpf_iter_reg sock_map_iter_reg = { + .target = "sockmap", + .attach_target = sock_map_iter_attach_target, + .detach_target = sock_map_iter_detach_target, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__sockmap, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__sockmap, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_sockmap_iter_init(void) +{ + sock_map_iter_reg.ctx_arg_info[1].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&sock_map_iter_reg); +} +late_initcall(bpf_sockmap_iter_init); -- cgit From e7b10a4dd1b13c8fb8ee1686da7d01437c5da1d2 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:33 -0400 Subject: tcp: Simplify EBPF TCP_CONGESTION to always init CC Now that the previous patch ensures we don't initialize the congestion control twice, when EBPF sets the congestion control algorithm at connection establishment we can simplify the code by simply initializing the congestion control module at that time. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- net/core/filter.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 47eef9a0be6a..067f6759a68f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4313,8 +4313,6 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#define SOCKOPT_CC_REINIT (1 << 0) - static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { @@ -4449,13 +4447,12 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; ret = tcp_set_congestion_control(sk, name, false, - reinit, true); + true, true); } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -4652,8 +4649,6 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { u32 flags = 0; - if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - flags |= SOCKOPT_CC_REINIT; return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, flags); } -- cgit From 29a949325c6c90f1431db9af64592275c83d9b2a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:34 -0400 Subject: tcp: simplify tcp_set_congestion_control(): Always reinitialize Now that the previous patches ensure that all call sites for tcp_set_congestion_control() want to initialize congestion control, we can simplify tcp_set_congestion_control() by removing the reinit argument and the code to support it. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- net/core/filter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 067f6759a68f..e89d6d7da03c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4451,8 +4451,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, - true, true); + ret = tcp_set_congestion_control(sk, name, false, true); } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); -- cgit From 5cdc744caab7cb0d0a3ed479246cbab3dbc59c0e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:35 -0400 Subject: tcp: simplify _bpf_setsockopt(): Remove flags argument Now that the previous patches have removed the code that uses the flags argument to _bpf_setsockopt(), we can remove that argument. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- net/core/filter.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e89d6d7da03c..d266c6941967 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4314,7 +4314,7 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { }; static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen, u32 flags) + char *optval, int optlen) { char devname[IFNAMSIZ]; int val, valbool; @@ -4611,9 +4611,7 @@ err_clear: BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { @@ -4647,9 +4645,7 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { -- cgit From 984fe94f94756dacb3c8cc52904a23adf9e04da1 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:39 -0700 Subject: bpf: Mutex protect used_maps array and count To support modifying the used_maps array, we use a mutex to protect the use of the counter and the array. The mutex is initialized right after the prog aux is allocated, and destroyed right before prog aux is freed. This way we guarantee it's initialized for both cBPF and eBPF. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-2-sdf@google.com --- net/core/dev.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d42c9ea0c3c0..75d7f91337d9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5441,15 +5441,20 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) if (new) { u32 i; + mutex_lock(&new->aux->used_maps_mutex); + /* generic XDP does not work with DEVMAPs that can * have a bpf_prog installed on an entry */ for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i])) - return -EINVAL; - if (cpu_map_prog_allowed(new->aux->used_maps[i])) + if (dev_map_can_have_prog(new->aux->used_maps[i]) || + cpu_map_prog_allowed(new->aux->used_maps[i])) { + mutex_unlock(&new->aux->used_maps_mutex); return -EINVAL; + } } + + mutex_unlock(&new->aux->used_maps_mutex); } switch (xdp->command) { -- cgit From 9436ef6e862b9ca22e5b12f87b106e07d5af4cae Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:20 +0100 Subject: bpf: Allow specifying a BTF ID per argument in function protos Function prototypes using ARG_PTR_TO_BTF_ID currently use two ways to signal which BTF IDs are acceptable. First, bpf_func_proto.btf_id is an array of IDs, one for each argument. This array is only accessed up to the highest numbered argument that uses ARG_PTR_TO_BTF_ID and may therefore be less than five arguments long. It usually points at a BTF_ID_LIST. Second, check_btf_id is a function pointer that is called by the verifier if present. It gets the actual BTF ID of the register, and the argument number we're currently checking. It turns out that the only user check_arg_btf_id ignores the argument, and is simply used to check whether the BTF ID has a struct sock_common at it's start. Replace both of these mechanisms with an explicit BTF ID for each argument in a function proto. Thanks to btf_struct_ids_match this is very flexible: check_arg_btf_id can be replaced by requiring struct sock_common. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-5-lmb@cloudflare.com --- net/core/bpf_sk_storage.c | 8 ++------ net/core/filter.c | 31 +++++++++---------------------- 2 files changed, 11 insertions(+), 28 deletions(-) (limited to 'net/core') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4a86ea34f29e..d653a583dbc9 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -378,19 +378,15 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_SOCKET, }; -BTF_ID_LIST(sk_storage_btf_ids) -BTF_ID_UNUSED -BTF_ID(struct, sock) - const struct bpf_func_proto sk_storage_get_btf_proto = { .func = bpf_sk_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, - .btf_id = sk_storage_btf_ids, }; const struct bpf_func_proto sk_storage_delete_btf_proto = { @@ -399,7 +395,7 @@ const struct bpf_func_proto sk_storage_delete_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .btf_id = sk_storage_btf_ids, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], }; struct bpf_sk_storage_diag { diff --git a/net/core/filter.c b/net/core/filter.c index d266c6941967..6014e5f40c58 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3803,19 +3803,18 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -BTF_ID_LIST(bpf_skb_output_btf_ids) -BTF_ID(struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_skb_output_btf_ids, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -4199,19 +4198,18 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -BTF_ID_LIST(bpf_xdp_output_btf_ids) -BTF_ID(struct, xdp_buff) +BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_xdp_output_btf_ids, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) @@ -9897,17 +9895,6 @@ BTF_SOCK_TYPE_xxx u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; #endif -static bool check_arg_btf_id(u32 btf_id, u32 arg) -{ - int i; - - /* only one argument, no need to check arg */ - for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) - if (btf_sock_ids[i] == btf_id) - return true; - return false; -} - BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { /* tcp6_sock type is not generated in dwarf and hence btf, @@ -9926,7 +9913,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; @@ -9943,7 +9930,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], }; @@ -9967,7 +9954,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], }; @@ -9991,7 +9978,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; @@ -10013,6 +10000,6 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; -- cgit From c69d2ddb2072eb7ffca1a31ee5ddc16dcd414ed9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 16 Sep 2020 15:46:45 -0700 Subject: bpf: Using rcu_read_lock for bpf_sk_storage_map iterator If a bucket contains a lot of sockets, during bpf_iter traversing a bucket, concurrent userspace bpf_map_update_elem() and bpf program bpf_sk_storage_{get,delete}() may experience some undesirable delays as they will compete with bpf_iter for bucket lock. Note that the number of buckets for bpf_sk_storage_map is roughly the same as the number of cpus. So if there are lots of sockets in the system, each bucket could contain lots of sockets. Different actual use cases may experience different delays. Here, using selftest bpf_iter subtest bpf_sk_storage_map, I hacked the kernel with ktime_get_mono_fast_ns() to collect the time when a bucket was locked during bpf_iter prog traversing that bucket. This way, the maximum incurred delay was measured w.r.t. the number of elements in a bucket. # elems in each bucket delay(ns) 64 17000 256 72512 2048 875246 The potential delays will be further increased if we have even more elemnts in a bucket. Using rcu_read_lock() is a reasonable compromise here. It may lose some precision, e.g., access stale sockets, but it will not hurt performance of bpf program or user space application which also tries to get/delete or update map elements. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200916224645.720172-1-yhs@fb.com --- net/core/bpf_sk_storage.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d653a583dbc9..838efc682cff 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -674,6 +674,7 @@ struct bpf_iter_seq_sk_storage_map_info { static struct bpf_local_storage_elem * bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, struct bpf_local_storage_elem *prev_selem) + __acquires(RCU) __releases(RCU) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_elem *selem; @@ -692,16 +693,16 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, selem = prev_selem; count = 0; while (selem) { - selem = hlist_entry_safe(selem->map_node.next, + selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)), struct bpf_local_storage_elem, map_node); if (!selem) { /* not found, unlock and go to the next bucket */ b = &smap->buckets[bucket_id++]; - raw_spin_unlock_bh(&b->lock); + rcu_read_unlock(); skip_elems = 0; break; } - sk_storage = rcu_dereference_raw(selem->local_storage); + sk_storage = rcu_dereference(selem->local_storage); if (sk_storage) { info->skip_elems = skip_elems + count; return selem; @@ -711,10 +712,10 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, for (i = bucket_id; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; - raw_spin_lock_bh(&b->lock); + rcu_read_lock(); count = 0; - hlist_for_each_entry(selem, &b->list, map_node) { - sk_storage = rcu_dereference_raw(selem->local_storage); + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + sk_storage = rcu_dereference(selem->local_storage); if (sk_storage && count >= skip_elems) { info->bucket_id = i; info->skip_elems = count; @@ -722,7 +723,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, } count++; } - raw_spin_unlock_bh(&b->lock); + rcu_read_unlock(); skip_elems = 0; } @@ -781,7 +782,7 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, ctx.meta = &meta; ctx.map = info->map; if (selem) { - sk_storage = rcu_dereference_raw(selem->local_storage); + sk_storage = rcu_dereference(selem->local_storage); ctx.sk = sk_storage->owner; ctx.value = SDATA(selem)->data; } @@ -797,18 +798,12 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) } static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - struct bpf_iter_seq_sk_storage_map_info *info = seq->private; - struct bpf_local_storage_map *smap; - struct bpf_local_storage_map_bucket *b; - - if (!v) { + if (!v) (void)__bpf_sk_storage_map_seq_show(seq, v); - } else { - smap = (struct bpf_local_storage_map *)info->map; - b = &smap->buckets[info->bucket_id]; - raw_spin_unlock_bh(&b->lock); - } + else + rcu_read_unlock(); } static int bpf_iter_init_sk_storage_map(void *priv_data, -- cgit