From 9403cf2302588022d06f1878b072d3f6933021f0 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 19 Mar 2019 16:05:44 +0100 Subject: tcp: free request sock directly upon TFO or syncookies error Since the request socket is created locally, it'd make more sense to use reqsk_free() instead of reqsk_put() in TFO and syncookies' error path. However, tcp_get_cookie_sock() may set ->rsk_refcnt before freeing the socket; tcp_conn_request() may also have non-null ->rsk_refcnt because of tcp_try_fastopen(). In both cases 'req' hasn't been exposed to the outside world and is safe to free immediately, but that'd trigger the WARN_ON_ONCE in reqsk_free(). Define __reqsk_free() for these situations where we know nobody's referencing the socket, even though ->rsk_refcnt might be non-null. Now we can consolidate the error path of tcp_get_cookie_sock() and tcp_conn_request(). Signed-off-by: Guillaume Nault Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 17 ++++++++--------- net/ipv4/tcp_input.c | 5 ++--- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index e531344611a0..008545f63667 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -216,16 +216,15 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, refcount_set(&req->rsk_refcnt, 1); tcp_sk(child)->tsoffset = tsoff; sock_rps_save_rxhash(child, skb); - if (!inet_csk_reqsk_queue_add(sk, req, child)) { - bh_unlock_sock(child); - sock_put(child); - child = NULL; - reqsk_put(req); - } - } else { - reqsk_free(req); + if (inet_csk_reqsk_queue_add(sk, req, child)) + return child; + + bh_unlock_sock(child); + sock_put(child); } - return child; + __reqsk_free(req); + + return NULL; } EXPORT_SYMBOL(tcp_get_cookie_sock); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5def3c48870e..5dfbc333e79a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6502,8 +6502,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, reqsk_fastopen_remove(fastopen_sk, req, false); bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); - reqsk_put(req); - goto drop; + goto drop_and_free; } sk->sk_data_ready(sk); bh_unlock_sock(fastopen_sk); @@ -6527,7 +6526,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, drop_and_release: dst_release(dst); drop_and_free: - reqsk_free(req); + __reqsk_free(req); drop: tcp_listendrop(sk); return 0; -- cgit From 9ab948a91b2c2abc8e82845c0e61f4b1683e3a4f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 20 Mar 2019 09:18:59 -0700 Subject: ipv4: Allow amount of dirty memory from fib resizing to be controllable fib_trie implementation calls synchronize_rcu when a certain amount of pages are dirty from freed entries. The number of pages was determined experimentally in 2009 (commit c3059477fce2d). At the current setting, synchronize_rcu is called often -- 51 times in a second in one test with an average of an 8 msec delay adding a fib entry. The total impact is a lot of slow down modifying the fib. This is seen in the output of 'time' - the difference between real time and sys+user. For example, using 720,022 single path routes and 'ip -batch'[1]: $ time ./ip -batch ipv4/routes-1-hops real 0m14.214s user 0m2.513s sys 0m6.783s So roughly 35% of the actual time to install the routes is from the ip command getting scheduled out, most notably due to synchronize_rcu (this is observed using 'perf sched timehist'). This patch makes the amount of dirty memory configurable between 64k where the synchronize_rcu is called often (small, low end systems that are memory sensitive) to 64M where synchronize_rcu is called rarely during a large FIB change (for high end systems with lots of memory). The default is 512kB which corresponds to the current setting of 128 pages with a 4kB page size. As an example, at 16MB the worst interval shows 4 calls to synchronize_rcu in a second blocking for up to 30 msec in a single instance, and a total of almost 100 msec across the 4 calls in the second. The trade off is allowing FIB entries to consume more memory in a given time window but but with much better fib insertion rates (~30% increase in prefixes/sec). With this patch and net.ipv4.fib_sync_mem set to 16MB, the same batch file runs in: $ time ./ip -batch ipv4/routes-1-hops real 0m9.692s user 0m2.491s sys 0m6.769s So the dead time is reduced to about 1/2 second or <5% of the real time. [1] 'ip' modified to not request ACK messages which improves route insertion times by about 20% Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 14 ++++++++------ net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index a573e37e0615..1704f432de1f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -183,14 +183,16 @@ struct trie { }; static struct key_vector *resize(struct trie *t, struct key_vector *tn); -static size_t tnode_free_size; +static unsigned int tnode_free_size; /* - * synchronize_rcu after call_rcu for that many pages; it should be especially - * useful before resizing the root node with PREEMPT_NONE configs; the value was - * obtained experimentally, aiming to avoid visible slowdown. + * synchronize_rcu after call_rcu for outstanding dirty memory; it should be + * especially useful before resizing the root node with PREEMPT_NONE configs; + * the value was obtained experimentally, aiming to avoid visible slowdown. */ -static const int sync_pages = 128; +unsigned int sysctl_fib_sync_mem = 512 * 1024; +unsigned int sysctl_fib_sync_mem_min = 64 * 1024; +unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; static struct kmem_cache *fn_alias_kmem __ro_after_init; static struct kmem_cache *trie_leaf_kmem __ro_after_init; @@ -504,7 +506,7 @@ static void tnode_free(struct key_vector *tn) tn = container_of(head, struct tnode, rcu)->kv; } - if (tnode_free_size >= PAGE_SIZE * sync_pages) { + if (tnode_free_size >= sysctl_fib_sync_mem) { tnode_free_size = 0; synchronize_rcu(); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ba0fc4b18465..2316c08e9591 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -549,6 +549,15 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "fib_sync_mem", + .data = &sysctl_fib_sync_mem, + .maxlen = sizeof(sysctl_fib_sync_mem), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = &sysctl_fib_sync_mem_min, + .extra2 = &sysctl_fib_sync_mem_max, + }, { } }; -- cgit From 02afc7ad45bd6cfc9fd51fdbc132455371b63469 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 20 Mar 2019 20:02:56 +0100 Subject: net: dst: remove gc leftovers Get rid of some obsolete gc-related documentation and macros that were missed in commit 5b7c9a8ff828 ("net: remove dst gc related code"). CC: Wei Wang Signed-off-by: Julian Wiedmann Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a5da63e5faa2..14c7fdacaa72 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1176,7 +1176,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) * * When a PMTU/redirect information update invalidates a route, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or - * DST_OBSOLETE_DEAD by dst_free(). + * DST_OBSOLETE_DEAD. */ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; -- cgit From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 21 Mar 2019 22:51:02 +0100 Subject: genetlink: make policy common to family Since maxattr is common, the policy can't really differ sanely, so make it common as well. The only user that did in fact manage to make a non-common policy is taskstats, which has to be really careful about it (since it's still using a common maxattr!). This is no longer supported, but we can fake it using pre_doit. This reduces the size of e.g. nl80211.o (which has lots of commands): text data bss dec hex filename 398745 14323 2240 415308 6564c net/wireless/nl80211.o (before) 397913 14331 2240 414484 65314 net/wireless/nl80211.o (after) -------------------------------- -832 +8 0 -824 Which is obviously just 8 bytes for each command, and an added 8 bytes for the new policy pointer. I'm not sure why the ops list is counted as .text though. Most of the code transformations were done using the following spatch: @ops@ identifier OPS; expression POLICY; @@ struct genl_ops OPS[] = { ..., { - .policy = POLICY, }, ... }; @@ identifier ops.OPS; expression ops.POLICY; identifier fam; expression M; @@ struct genl_family fam = { .ops = OPS, .maxattr = M, + .policy = POLICY, ... }; This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing the cb->data as ops, which we want to change in a later genl patch. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/ipv4/fou.c | 4 +--- net/ipv4/tcp_metrics.c | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 79e98e21cdd7..a23fbb52d265 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -808,20 +808,17 @@ static const struct genl_ops fou_nl_ops[] = { { .cmd = FOU_CMD_ADD, .doit = fou_nl_cmd_add_port, - .policy = fou_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = FOU_CMD_DEL, .doit = fou_nl_cmd_rm_port, - .policy = fou_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = FOU_CMD_GET, .doit = fou_nl_cmd_get_port, .dumpit = fou_nl_dump, - .policy = fou_nl_policy, }, }; @@ -830,6 +827,7 @@ static struct genl_family fou_nl_family __ro_after_init = { .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, .maxattr = FOU_ATTR_MAX, + .policy = fou_nl_policy, .netnsok = true, .module = THIS_MODULE, .ops = fou_nl_ops, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b467a7cabf40..4ccec4c705f7 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -953,12 +953,10 @@ static const struct genl_ops tcp_metrics_nl_ops[] = { .cmd = TCP_METRICS_CMD_GET, .doit = tcp_metrics_nl_cmd_get, .dumpit = tcp_metrics_nl_dump, - .policy = tcp_metrics_nl_policy, }, { .cmd = TCP_METRICS_CMD_DEL, .doit = tcp_metrics_nl_cmd_del, - .policy = tcp_metrics_nl_policy, .flags = GENL_ADMIN_PERM, }, }; @@ -968,6 +966,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = { .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, .maxattr = TCP_METRICS_ATTR_MAX, + .policy = tcp_metrics_nl_policy, .netnsok = true, .module = THIS_MODULE, .ops = tcp_metrics_nl_ops, -- cgit From e6d1407013a91722ffc89e980d715eb9ce7b57f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2019 06:26:29 -0700 Subject: tcp: remove conditional branches from tcp_mstamp_refresh() tcp_clock_ns() (aka ktime_get_ns()) is using monotonic clock, so the checks we had in tcp_mstamp_refresh() are no longer relevant. This patch removes cpu stall (when the cache line is not hot) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4522579aaca2..e265d1aeeb66 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -52,12 +52,8 @@ void tcp_mstamp_refresh(struct tcp_sock *tp) { u64 val = tcp_clock_ns(); - if (val > tp->tcp_clock_cache) - tp->tcp_clock_cache = val; - - val = div_u64(val, NSEC_PER_USEC); - if (val > tp->tcp_mstamp) - tp->tcp_mstamp = val; + tp->tcp_clock_cache = val; + tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); } static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, -- cgit From 472c2e07eef045145bc1493cc94a01c87140780a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2019 08:56:39 -0700 Subject: tcp: add one skb cache for tx On hosts with a lot of cores, RPC workloads suffer from heavy contention on slab spinlocks. 20.69% [kernel] [k] queued_spin_lock_slowpath 5.64% [kernel] [k] _raw_spin_lock 3.83% [kernel] [k] syscall_return_via_sysret 3.48% [kernel] [k] __entry_text_start 1.76% [kernel] [k] __netif_receive_skb_core 1.64% [kernel] [k] __fget For each sendmsg(), we allocate one skb, and free it at the time ACK packet comes. In many cases, ACK packets are handled by another cpus, and this unfortunately incurs heavy costs for slab layer. This patch uses an extra pointer in socket structure, so that we try to reuse the same skb and avoid these expensive costs. We cache at most one skb per socket so this should be safe as far as memory pressure is concerned. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6baa6dc1b13b..f0b5a5999145 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -865,6 +865,21 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, { struct sk_buff *skb; + skb = sk->sk_tx_skb_cache; + if (skb && !size) { + const struct sk_buff_fclones *fclones; + + fclones = container_of(skb, struct sk_buff_fclones, skb1); + if (refcount_read(&fclones->fclone_ref) == 1) { + sk->sk_wmem_queued -= skb->truesize; + sk_mem_uncharge(sk, skb->truesize); + skb->truesize -= skb->data_len; + sk->sk_tx_skb_cache = NULL; + pskb_trim(skb, 0); + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + return skb; + } + } /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); @@ -1098,30 +1113,6 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, } EXPORT_SYMBOL(tcp_sendpage); -/* Do not bother using a page frag for very small frames. - * But use this heuristic only for the first skb in write queue. - * - * Having no payload in skb->head allows better SACK shifting - * in tcp_shift_skb_data(), reducing sack/rack overhead, because - * write queue has less skbs. - * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB. - * This also speeds up tso_fragment(), since it wont fallback - * to tcp_fragment(). - */ -static int linear_payload_sz(bool first_skb) -{ - if (first_skb) - return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); - return 0; -} - -static int select_size(bool first_skb, bool zc) -{ - if (zc) - return 0; - return linear_payload_sz(first_skb); -} - void tcp_free_fastopen_req(struct tcp_sock *tp) { if (tp->fastopen_req) { @@ -1272,7 +1263,6 @@ restart: if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; - int linear; new_segment: if (!sk_stream_memory_free(sk)) @@ -1283,8 +1273,7 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - linear = select_size(first_skb, zc); - skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, first_skb); if (!skb) goto wait_for_memory; @@ -2552,6 +2541,13 @@ void tcp_write_queue_purge(struct sock *sk) sk_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); + skb = sk->sk_tx_skb_cache; + if (skb) { + sk->sk_wmem_queued -= skb->truesize; + sk_mem_uncharge(sk, skb->truesize); + __kfree_skb(skb); + sk->sk_tx_skb_cache = NULL; + } INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); -- cgit From 8b27dae5a2e89a61c46c6dbc76c040c0e6d0ed4c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2019 08:56:40 -0700 Subject: tcp: add one skb cache for rx Often times, recvmsg() system calls and BH handling for a particular TCP socket are done on different cpus. This means the incoming skb had to be allocated on a cpu, but freed on another. This incurs a high spinlock contention in slab layer for small rpc, but also a high number of cache line ping pongs for larger packets. A full size GRO packet might use 45 page fragments, meaning that up to 45 put_page() can be involved. More over performing the __kfree_skb() in the recvmsg() context adds a latency for user applications, and increase probability of trapping them in backlog processing, since the BH handler might found the socket owned by the user. This patch, combined with the prior one increases the rpc performance by about 10 % on servers with large number of cores. (tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps instead of 8 Mpps) This also increases single bulk flow performance on 40Gbit+ links, since in this case there are often two cpus working in tandem : - CPU handling the NIC rx interrupts, feeding the receive queue, and (after this patch) freeing the skbs that were consumed. - CPU in recvmsg() system call, essentially 100 % busy copying out data to user space. Having at most one skb in a per-socket cache has very little risk of memory exhaustion, and since it is protected by socket lock, its management is essentially free. Note that if rps/rfs is used, we do not enable this feature, because there is high chance that the same cpu is handling both the recvmsg() system call and the TCP rx path, but that another cpu did the skb allocations in the device driver right before the RPS/RFS logic. To properly handle this case, it seems we would need to record on which cpu skb was allocated, and use a different channel to give skbs back to this cpu. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 4 ++++ net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_ipv4.c | 11 +++++++++-- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eab3ebde981e..7f3a984ad618 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -136,6 +136,10 @@ void inet_sock_destruct(struct sock *sk) struct inet_sock *inet = inet_sk(sk); __skb_queue_purge(&sk->sk_receive_queue); + if (sk->sk_rx_skb_cache) { + __kfree_skb(sk->sk_rx_skb_cache); + sk->sk_rx_skb_cache = NULL; + } __skb_queue_purge(&sk->sk_error_queue); sk_mem_reclaim(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f0b5a5999145..29b94edf05f9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2583,6 +2583,10 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); + if (sk->sk_rx_skb_cache) { + __kfree_skb(sk->sk_rx_skb_cache); + sk->sk_rx_skb_cache = NULL; + } tp->copied_seq = tp->rcv_nxt; tp->urg_data = 0; tcp_write_queue_purge(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 277d71239d75..3979939804b7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1774,6 +1774,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct sk_buff *skb_to_free; int sdif = inet_sdif(skb); const struct iphdr *iph; const struct tcphdr *th; @@ -1905,11 +1906,17 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { + skb_to_free = sk->sk_rx_skb_cache; + sk->sk_rx_skb_cache = NULL; ret = tcp_v4_do_rcv(sk, skb); - } else if (tcp_add_backlog(sk, skb)) { - goto discard_and_relse; + } else { + if (tcp_add_backlog(sk, skb)) + goto discard_and_relse; + skb_to_free = NULL; } bh_unlock_sock(sk); + if (skb_to_free) + __kfree_skb(skb_to_free); put_and_return: if (refcounted) -- cgit From 65fd2c2afac31a4b46a80150347a1748fa9101cb Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Thu, 21 Mar 2019 16:41:37 +0200 Subject: xfrm: gso partial offload support This patch introduces support for gso partial ESP offload. Signed-off-by: Boris Pismenny Signed-off-by: Raed Salem Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 8756e0e790d2..c6c84f2bc41c 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -138,9 +138,11 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) + if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) && + !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) + else if (!(features & NETIF_F_HW_ESP_TX_CSUM) && + !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~NETIF_F_CSUM_MASK; xo->flags |= XFRM_GSO_SEGMENT; @@ -181,7 +183,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ if (!xo) return -EINVAL; - if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) { + if ((!(features & NETIF_F_HW_ESP) && + !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) || + x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } -- cgit From f981c57ffd2d7cf2dd4b6d6f8fcb3965df42f54c Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Sat, 23 Mar 2019 14:43:02 +0000 Subject: vti4: eliminated some duplicate code. The ipip tunnel introduced in commit dd9ee3444014 ("vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel") largely duplicated the existing vti_input and vti_recv functions. Refactored to deduplicate the common code. Signed-off-by: Jeremy Sowden Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 60 ++++++++++++++++++++----------------------------------- 1 file changed, 22 insertions(+), 38 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 68a21bf75dd0..a8474799fb79 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -50,7 +50,7 @@ static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) + int encap_type, bool update_skb_dev) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); @@ -65,6 +65,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + if (update_skb_dev) + skb->dev = tunnel->dev; + return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -74,47 +77,28 @@ drop: return 0; } -static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) +static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { - struct ip_tunnel *tunnel; - const struct iphdr *iph = ip_hdr(skb); - struct net *net = dev_net(skb->dev); - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->saddr, iph->daddr, 0); - if (tunnel) { - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto drop; - - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; - - skb->dev = tunnel->dev; - - return xfrm_input(skb, nexthdr, spi, encap_type); - } - - return -EINVAL; -drop: - kfree_skb(skb); - return 0; + return vti_input(skb, nexthdr, spi, encap_type, false); } -static int vti_rcv(struct sk_buff *skb) +static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); + return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev); } -static int vti_rcv_ipip(struct sk_buff *skb) +static int vti_rcv_proto(struct sk_buff *skb) { - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + return vti_rcv(skb, 0, false); +} - return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0); +static int vti_rcv_tunnel(struct sk_buff *skb) +{ + return vti_rcv(skb, ip_hdr(skb)->saddr, true); } static int vti_rcv_cb(struct sk_buff *skb, int err) @@ -447,31 +431,31 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev) } static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm_tunnel ipip_handler __read_mostly = { - .handler = vti_rcv_ipip, + .handler = vti_rcv_tunnel, .err_handler = vti4_err, .priority = 0, }; -- cgit From 1713cb37bf671e5d98919536941a8b56337874fd Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Wed, 27 Mar 2019 11:16:03 +0100 Subject: fou: Support binding FoU socket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An FoU socket is currently bound to the wildcard-address. While this works fine, there are several use-cases where the use of the wildcard-address is not desirable. For example, I use FoU on some multi-homed servers and would like to use FoU on only one of the interfaces. This commit adds support for binding FoU sockets to a given source address/interface, as well as connecting the socket to a given destination address/port. udp_tunnel already provides the required infrastructure, so most of the code added is for exposing and setting the different attributes (local address, peer address, etc.). The lookups performed when we add, delete or get an FoU-socket has also been updated to compare all the attributes a user can set. Since the comparison now involves several elements, I have added a separate comparison-function instead of open-coding. In order to test the code and ensure that the new comparison code works correctly, I started by creating a wildcard socket bound to port 1234 on my machine. I then tried to create a non-wildcarded socket bound to the same port, as well as fetching and deleting the socket (including source address, peer address or interface index in the netlink request). Both the create, fetch and delete request failed. Deleting/fetching the socket was only successful when my netlink request attributes matched those used to create the socket. I then repeated the tests, but with a socket bound to a local ip address, a socket bound to a local address + interface, and a bound socket that was also «connected» to a peer. Add only worked when no socket with the matching source address/interface (or wildcard) existed, while fetch/delete was only successful when all attributes matched. In addition to testing that the new code work, I also checked that the current behavior is kept. If none of the new attributes are provided, then an FoU-socket is configured as before (i.e., wildcarded). If any of the new attributes are provided, the FoU-socket is configured as expected. v1->v2: * Fixed building with IPv6 disabled (kbuild). * Fixed a return type warning and make the ugly comparison function more readable (kbuild). * Describe more in detail what has been tested (thanks David Miller). * Make peer port required if peer address is specified. Signed-off-by: Kristian Evensen Signed-off-by: David S. Miller --- net/ipv4/fou.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 122 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index a23fbb52d265..100e63f57ea6 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -499,15 +499,45 @@ out_unlock: return err; } -static int fou_add_to_port_list(struct net *net, struct fou *fou) +static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg) +{ + struct sock *sk = fou->sock->sk; + struct udp_port_cfg *udp_cfg = &cfg->udp_config; + + if (fou->family != udp_cfg->family || + fou->port != udp_cfg->local_udp_port || + sk->sk_dport != udp_cfg->peer_udp_port || + sk->sk_bound_dev_if != udp_cfg->bind_ifindex) + return false; + + if (fou->family == AF_INET) { + if (sk->sk_rcv_saddr != udp_cfg->local_ip.s_addr || + sk->sk_daddr != udp_cfg->peer_ip.s_addr) + return false; + else + return true; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) || + ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6)) + return false; + else + return true; +#endif + } + + return false; +} + +static int fou_add_to_port_list(struct net *net, struct fou *fou, + struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { - if (fou->port == fout->port && - fou->family == fout->family) { + if (fou_cfg_cmp(fout, cfg)) { mutex_unlock(&fn->fou_lock); return -EALREADY; } @@ -585,7 +615,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - err = fou_add_to_port_list(net, fou); + err = fou_add_to_port_list(net, fou, cfg); if (err) goto error; @@ -605,14 +635,12 @@ error: static int fou_destroy(struct net *net, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); - __be16 port = cfg->udp_config.local_udp_port; - u8 family = cfg->udp_config.family; int err = -EINVAL; struct fou *fou; mutex_lock(&fn->fou_lock); list_for_each_entry(fou, &fn->fou_list, list) { - if (fou->port == port && fou->family == family) { + if (fou_cfg_cmp(fou, cfg)) { fou_release(fou); err = 0; break; @@ -626,16 +654,27 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) static struct genl_family fou_nl_family; static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { - [FOU_ATTR_PORT] = { .type = NLA_U16, }, - [FOU_ATTR_AF] = { .type = NLA_U8, }, - [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, - [FOU_ATTR_TYPE] = { .type = NLA_U8, }, - [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, + [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_AF] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_TYPE] = { .type = NLA_U8, }, + [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, + [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, + [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, + [FOU_ATTR_LOCAL_V6] = { .type = sizeof(struct in6_addr), }, + [FOU_ATTR_PEER_V6] = { .type = sizeof(struct in6_addr), }, + [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; static int parse_nl_config(struct genl_info *info, struct fou_cfg *cfg) { + bool has_local = false, has_peer = false; + struct nlattr *attr; + int ifindex; + __be16 port; + memset(cfg, 0, sizeof(*cfg)); cfg->udp_config.family = AF_INET; @@ -657,8 +696,7 @@ static int parse_nl_config(struct genl_info *info, } if (info->attrs[FOU_ATTR_PORT]) { - __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); - + port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); cfg->udp_config.local_udp_port = port; } @@ -671,6 +709,52 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL]) cfg->flags |= FOU_F_REMCSUM_NOPARTIAL; + if (cfg->udp_config.family == AF_INET) { + if (info->attrs[FOU_ATTR_LOCAL_V4]) { + attr = info->attrs[FOU_ATTR_LOCAL_V4]; + cfg->udp_config.local_ip.s_addr = nla_get_in_addr(attr); + has_local = true; + } + + if (info->attrs[FOU_ATTR_PEER_V4]) { + attr = info->attrs[FOU_ATTR_PEER_V4]; + cfg->udp_config.peer_ip.s_addr = nla_get_in_addr(attr); + has_peer = true; + } +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (info->attrs[FOU_ATTR_LOCAL_V6]) { + attr = info->attrs[FOU_ATTR_LOCAL_V6]; + cfg->udp_config.local_ip6 = nla_get_in6_addr(attr); + has_local = true; + } + + if (info->attrs[FOU_ATTR_PEER_V6]) { + attr = info->attrs[FOU_ATTR_PEER_V6]; + cfg->udp_config.peer_ip6 = nla_get_in6_addr(attr); + has_peer = true; + } +#endif + } + + if (has_peer) { + if (info->attrs[FOU_ATTR_PEER_PORT]) { + port = nla_get_be16(info->attrs[FOU_ATTR_PEER_PORT]); + cfg->udp_config.peer_udp_port = port; + } else { + return -EINVAL; + } + } + + if (info->attrs[FOU_ATTR_IFINDEX]) { + if (!has_local) + return -EINVAL; + + ifindex = nla_get_s32(info->attrs[FOU_ATTR_IFINDEX]); + + cfg->udp_config.bind_ifindex = ifindex; + } + return 0; } @@ -702,15 +786,37 @@ static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) static int fou_fill_info(struct fou *fou, struct sk_buff *msg) { + struct sock *sk = fou->sock->sk; + if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) || nla_put_be16(msg, FOU_ATTR_PORT, fou->port) || + nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) || nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) || - nla_put_u8(msg, FOU_ATTR_TYPE, fou->type)) + nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) || + nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if)) return -1; if (fou->flags & FOU_F_REMCSUM_NOPARTIAL) if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL)) return -1; + + if (fou->sock->sk->sk_family == AF_INET) { + if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, sk->sk_rcv_saddr)) + return -1; + + if (nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr)) + return -1; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6, + &sk->sk_v6_rcv_saddr)) + return -1; + + if (nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr)) + return -1; +#endif + } + return 0; } @@ -763,7 +869,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) ret = -ESRCH; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { - if (port == fout->port && family == fout->family) { + if (fou_cfg_cmp(fout, &cfg)) { ret = fou_dump_info(fout, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); -- cgit From 4f661542a40217713f2cee0bb6678fbb30d9d367 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Mar 2019 08:34:55 -0700 Subject: tcp: fix zerocopy and notsent_lowat issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My recent patch had at least three problems : 1) TX zerocopy wants notification when skb is acknowledged, thus we need to call skb_zcopy_clear() if the skb is cached into sk->sk_tx_skb_cache 2) Some applications might expect precise EPOLLOUT notifications, so we need to update sk->sk_wmem_queued and call sk_mem_uncharge() from sk_wmem_free_skb() in all cases. The SOCK_QUEUE_SHRUNK flag must also be set. 3) Reuse of saved skb should have used skb_cloned() instead of simply checking if the fast clone has been freed. Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Tested-by: Holger Hoffstätte Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 29b94edf05f9..82bd707c0347 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -865,14 +865,9 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, { struct sk_buff *skb; - skb = sk->sk_tx_skb_cache; - if (skb && !size) { - const struct sk_buff_fclones *fclones; - - fclones = container_of(skb, struct sk_buff_fclones, skb1); - if (refcount_read(&fclones->fclone_ref) == 1) { - sk->sk_wmem_queued -= skb->truesize; - sk_mem_uncharge(sk, skb->truesize); + if (likely(!size)) { + skb = sk->sk_tx_skb_cache; + if (skb && !skb_cloned(skb)) { skb->truesize -= skb->data_len; sk->sk_tx_skb_cache = NULL; pskb_trim(skb, 0); @@ -2543,8 +2538,6 @@ void tcp_write_queue_purge(struct sock *sk) tcp_rtx_queue_purge(sk); skb = sk->sk_tx_skb_cache; if (skb) { - sk->sk_wmem_queued -= skb->truesize; - sk_mem_uncharge(sk, skb->truesize); __kfree_skb(skb); sk->sk_tx_skb_cache = NULL; } -- cgit From df453700e8d81b1bdafdf684365ee2b9431fb702 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Mar 2019 12:40:33 -0700 Subject: inet: switch IP ID generator to siphash According to Amit Klein and Benny Pinkas, IP ID generation is too weak and might be used by attackers. Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix()) having 64bit key and Jenkins hash is risky. It is time to switch to siphash and its 128bit keys. Signed-off-by: Eric Dumazet Reported-by: Amit Klein Reported-by: Benny Pinkas Signed-off-by: David S. Miller --- net/ipv4/route.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 14c7fdacaa72..f2688fce39e1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -500,15 +500,17 @@ EXPORT_SYMBOL(ip_idents_reserve); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { - static u32 ip_idents_hashrnd __read_mostly; u32 hash, id; - net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); + /* Note the following code is not safe, but this is okay. */ + if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) + get_random_bytes(&net->ipv4.ip_id_key, + sizeof(net->ipv4.ip_id_key)); - hash = jhash_3words((__force u32)iph->daddr, + hash = siphash_3u32((__force u32)iph->daddr, (__force u32)iph->saddr, - iph->protocol ^ net_hash_mix(net), - ip_idents_hashrnd); + iph->protocol, + &net->ipv4.ip_id_key); id = ip_idents_reserve(hash, segs); iph->id = htons(id); } -- cgit From 8373c6c84e6748e1dd8b82c43af37572ab040233 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:46 -0700 Subject: ipv4: Define fib_get_nhs when CONFIG_IP_ROUTE_MULTIPATH is disabled Define fib_get_nhs to return EINVAL when CONFIG_IP_ROUTE_MULTIPATH is not enabled and remove the ifdef check for CONFIG_IP_ROUTE_MULTIPATH in fib_create_info. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8e185b5a2bf6..b5dbbdfd1e49 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -601,6 +601,15 @@ static void fib_rebalance(struct fib_info *fi) } #else /* CONFIG_IP_ROUTE_MULTIPATH */ +static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, + int remaining, struct fib_config *cfg, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel"); + + return -EINVAL; +} + #define fib_rebalance(fi) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -1102,7 +1111,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } endfor_nexthops(fi) if (cfg->fc_mp) { -#ifdef CONFIG_IP_ROUTE_MULTIPATH err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); if (err != 0) goto failure; @@ -1122,11 +1130,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, "Nexthop class id does not match RTA_FLOW"); goto err_inval; } -#endif -#else - NL_SET_ERR_MSG(extack, - "Multipath support not enabled in kernel"); - goto err_inval; #endif } else { struct fib_nh *nh = fi->fib_nh; -- cgit From 331c7a402358de6206232f6aab7aa48ec6c1088a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:47 -0700 Subject: ipv4: Move IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN to helper in_dev lookup followed by IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN check is called in several places, some with the rcu lock and others with the rtnl held. Move the check to a helper similar to what IPv6 has. Since the helper can be invoked from either context use rcu_dereference_rtnl to dereference ip_ptr. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 31 +++++++------------------------ net/ipv4/fib_trie.c | 4 +--- 2 files changed, 8 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b5dbbdfd1e49..78631eb255f7 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -558,7 +558,6 @@ static void fib_rebalance(struct fib_info *fi) { int total; int w; - struct in_device *in_dev; if (fi->fib_nhs < 2) return; @@ -568,10 +567,7 @@ static void fib_rebalance(struct fib_info *fi) if (nh->nh_flags & RTNH_F_DEAD) continue; - in_dev = __in_dev_get_rtnl(nh->nh_dev); - - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + if (ip_ignore_linkdown(nh->nh_dev) && nh->nh_flags & RTNH_F_LINKDOWN) continue; @@ -582,12 +578,9 @@ static void fib_rebalance(struct fib_info *fi) change_nexthops(fi) { int upper_bound; - in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev); - if (nexthop_nh->nh_flags & RTNH_F_DEAD) { upper_bound = -1; - } else if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + } else if (ip_ignore_linkdown(nexthop_nh->nh_dev) && nexthop_nh->nh_flags & RTNH_F_LINKDOWN) { upper_bound = -1; } else { @@ -1325,12 +1318,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) goto nla_put_failure; if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { - struct in_device *in_dev; - rcu_read_lock(); - in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev); - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + if (ip_ignore_linkdown(fi->fib_nh->nh_dev)) rtm->rtm_flags |= RTNH_F_DEAD; rcu_read_unlock(); } @@ -1361,12 +1350,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, rtnh->rtnh_flags = nh->nh_flags & 0xFF; if (nh->nh_flags & RTNH_F_LINKDOWN) { - struct in_device *in_dev; - rcu_read_lock(); - in_dev = __in_dev_get_rcu(nh->nh_dev); - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + if (ip_ignore_linkdown(nh->nh_dev)) rtnh->rtnh_flags |= RTNH_F_DEAD; rcu_read_unlock(); } @@ -1433,7 +1418,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) static int call_fib_nh_notifiers(struct fib_nh *fib_nh, enum fib_event_type event_type) { - struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev); + bool ignore_link_down = ip_ignore_linkdown(fib_nh->nh_dev); struct fib_nh_notifier_info info = { .fib_nh = fib_nh, }; @@ -1442,14 +1427,12 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, case FIB_EVENT_NH_ADD: if (fib_nh->nh_flags & RTNH_F_DEAD) break; - if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - fib_nh->nh_flags & RTNH_F_LINKDOWN) + if (ignore_link_down && fib_nh->nh_flags & RTNH_F_LINKDOWN) break; return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: - if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - fib_nh->nh_flags & RTNH_F_LINKDOWN) || + if ((ignore_link_down && fib_nh->nh_flags & RTNH_F_LINKDOWN) || (fib_nh->nh_flags & RTNH_F_DEAD)) return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type, &info.info); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1704f432de1f..656d3d19f112 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1471,12 +1471,10 @@ found: continue; for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { const struct fib_nh *nh = &fi->fib_nh[nhsel]; - struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev); if (nh->nh_flags & RTNH_F_DEAD) continue; - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + if (ip_ignore_linkdown(nh->nh_dev) && nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; -- cgit From e4516ef65490ef29d48a98ad4d7c90dccf39068f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:48 -0700 Subject: ipv4: Create init helper for fib_nh Consolidate the fib_nh initialization which is duplicated between fib_create_info for single path and fib_get_nhs for multipath. Export the helper to allow for use with nexthop objects in the future. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 180 ++++++++++++++++++++++++----------------------- 1 file changed, 91 insertions(+), 89 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 78631eb255f7..cd15746e2b3f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -457,6 +457,54 @@ static int fib_detect_death(struct fib_info *fi, int order, return 1; } +int fib_nh_init(struct net *net, struct fib_nh *nh, + struct fib_config *cfg, int nh_weight, + struct netlink_ext_ack *extack) +{ + int err = -ENOMEM; + + nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); + if (!nh->nh_pcpu_rth_output) + goto err_out; + + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + + err = -EINVAL; + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) { + NL_SET_ERR_MSG(extack, "LWT encap type not specified"); + goto lwt_failure; + } + err = lwtunnel_build_state(cfg->fc_encap_type, + cfg->fc_encap, AF_INET, cfg, + &lwtstate, extack); + if (err) + goto lwt_failure; + + nh->nh_lwtstate = lwtstate_get(lwtstate); + } + + nh->nh_oif = cfg->fc_oif; + nh->nh_gw = cfg->fc_gw; + nh->nh_flags = cfg->fc_flags; + +#ifdef CONFIG_IP_ROUTE_CLASSID + nh->nh_tclassid = cfg->fc_flow; + if (nh->nh_tclassid) + net->ipv4.fib_num_tclassid_users++; +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight = nh_weight; +#endif + return 0; + +lwt_failure: + rt_fibinfo_free_cpus(nh->nh_pcpu_rth_output); + nh->nh_pcpu_rth_output = NULL; +err_out: + return err; +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, @@ -483,11 +531,15 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { + struct net *net = fi->fib_net; + struct fib_config fib_cfg; int ret; change_nexthops(fi) { int attrlen; + memset(&fib_cfg, 0, sizeof(fib_cfg)); + if (!rtnh_ok(rtnh, remaining)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthop"); @@ -500,56 +552,54 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, return -EINVAL; } - nexthop_nh->nh_flags = - (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; - nexthop_nh->nh_oif = rtnh->rtnh_ifindex; - nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; + fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; + fib_cfg.fc_oif = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0; -#ifdef CONFIG_IP_ROUTE_CLASSID + if (nla) + fib_cfg.fc_gw = nla_get_in_addr(nla); + nla = nla_find(attrs, attrlen, RTA_FLOW); - nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; - if (nexthop_nh->nh_tclassid) - fi->fib_net->ipv4.fib_num_tclassid_users++; -#endif - nla = nla_find(attrs, attrlen, RTA_ENCAP); - if (nla) { - struct lwtunnel_state *lwtstate; - struct nlattr *nla_entype; - - nla_entype = nla_find(attrs, attrlen, - RTA_ENCAP_TYPE); - if (!nla_entype) { - NL_SET_BAD_ATTR(extack, nla); - NL_SET_ERR_MSG(extack, - "Encap type is missing"); - goto err_inval; - } + if (nla) + fib_cfg.fc_flow = nla_get_u32(nla); - ret = lwtunnel_build_state(nla_get_u16( - nla_entype), - nla, AF_INET, cfg, - &lwtstate, extack); - if (ret) - goto errout; - nexthop_nh->nh_lwtstate = - lwtstate_get(lwtstate); - } + fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + fib_cfg.fc_encap_type = nla_get_u16(nla); } + ret = fib_nh_init(net, nexthop_nh, &fib_cfg, + rtnh->rtnh_hops + 1, extack); + if (ret) + goto errout; + rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); - return 0; - -err_inval: ret = -EINVAL; - + if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) { + NL_SET_ERR_MSG(extack, + "Nexthop device index does not match RTA_OIF"); + goto errout; + } + if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) { + NL_SET_ERR_MSG(extack, + "Nexthop gateway does not match RTA_GATEWAY"); + goto errout; + } +#ifdef CONFIG_IP_ROUTE_CLASSID + if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { + NL_SET_ERR_MSG(extack, + "Nexthop class id does not match RTA_FLOW"); + goto errout; + } +#endif + ret = 0; errout: return ret; } @@ -1098,63 +1148,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi->fib_nhs = nhs; change_nexthops(fi) { nexthop_nh->nh_parent = fi; - nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); - if (!nexthop_nh->nh_pcpu_rth_output) - goto failure; } endfor_nexthops(fi) - if (cfg->fc_mp) { + if (cfg->fc_mp) err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); - if (err != 0) - goto failure; - if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) { - NL_SET_ERR_MSG(extack, - "Nexthop device index does not match RTA_OIF"); - goto err_inval; - } - if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) { - NL_SET_ERR_MSG(extack, - "Nexthop gateway does not match RTA_GATEWAY"); - goto err_inval; - } -#ifdef CONFIG_IP_ROUTE_CLASSID - if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { - NL_SET_ERR_MSG(extack, - "Nexthop class id does not match RTA_FLOW"); - goto err_inval; - } -#endif - } else { - struct fib_nh *nh = fi->fib_nh; - - if (cfg->fc_encap) { - struct lwtunnel_state *lwtstate; - - if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) { - NL_SET_ERR_MSG(extack, - "LWT encap type not specified"); - goto err_inval; - } - err = lwtunnel_build_state(cfg->fc_encap_type, - cfg->fc_encap, AF_INET, cfg, - &lwtstate, extack); - if (err) - goto failure; + else + err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); - nh->nh_lwtstate = lwtstate_get(lwtstate); - } - nh->nh_oif = cfg->fc_oif; - nh->nh_gw = cfg->fc_gw; - nh->nh_flags = cfg->fc_flags; -#ifdef CONFIG_IP_ROUTE_CLASSID - nh->nh_tclassid = cfg->fc_flow; - if (nh->nh_tclassid) - fi->fib_net->ipv4.fib_num_tclassid_users++; -#endif -#ifdef CONFIG_IP_ROUTE_MULTIPATH - nh->nh_weight = 1; -#endif - } + if (err != 0) + goto failure; if (fib_props[cfg->fc_type].error) { if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) { -- cgit From faa041a40b9fa64913789fcc0161c7c73161f57e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:49 -0700 Subject: ipv4: Create cleanup helper for fib_nh Move the fib_nh cleanup code from free_fib_info_rcu into a new helper, fib_nh_release. Move classid accounting into fib_nh_release which is called per fib_nh to make accounting symmetrical with fib_nh_init. Export the helper to allow for use with nexthop objects in the future. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index cd15746e2b3f..184940a06cb5 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -204,18 +204,28 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) free_percpu(rtp); } +void fib_nh_release(struct net *net, struct fib_nh *fib_nh) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + if (fib_nh->nh_tclassid) + net->ipv4.fib_num_tclassid_users--; +#endif + if (fib_nh->nh_dev) + dev_put(fib_nh->nh_dev); + + lwtstate_put(fib_nh->nh_lwtstate); + free_nh_exceptions(fib_nh); + rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output); + rt_fibinfo_free(&fib_nh->nh_rth_input); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); change_nexthops(fi) { - if (nexthop_nh->nh_dev) - dev_put(nexthop_nh->nh_dev); - lwtstate_put(nexthop_nh->nh_lwtstate); - free_nh_exceptions(nexthop_nh); - rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); - rt_fibinfo_free(&nexthop_nh->nh_rth_input); + fib_nh_release(fi->fib_net, nexthop_nh); } endfor_nexthops(fi); ip_fib_metrics_put(fi->fib_metrics); @@ -230,12 +240,7 @@ void free_fib_info(struct fib_info *fi) return; } fib_info_cnt--; -#ifdef CONFIG_IP_ROUTE_CLASSID - change_nexthops(fi) { - if (nexthop_nh->nh_tclassid) - fi->fib_net->ipv4.fib_num_tclassid_users--; - } endfor_nexthops(fi); -#endif + call_rcu(&fi->rcu, free_fib_info_rcu); } EXPORT_SYMBOL_GPL(free_fib_info); -- cgit From b75ed8b1aa9c3a99702159c3be8b0c1d54972ae5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:55 -0700 Subject: ipv4: Rename fib_nh entries Rename fib_nh entries that will be moved to a fib_nh_common struct. Specifically, the device, oif, gateway, flags, scope, lwtstate, nh_weight and nh_upper_bound are common with all nexthop definitions. In the process shorten fib_nh_lwtstate to fib_nh_lws to avoid really long lines. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 10 +-- net/ipv4/fib_semantics.c | 229 ++++++++++++++++++++++++----------------------- net/ipv4/fib_trie.c | 12 +-- net/ipv4/route.c | 18 ++-- 4 files changed, 135 insertions(+), 134 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ed14ec245584..ffbe24397dbe 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -324,16 +324,16 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) for (ret = 0; ret < fi->fib_nhs; ret++) { struct fib_nh *nh = &fi->fib_nh[ret]; - if (nh->nh_dev == dev) { + if (nh->fib_nh_dev == dev) { dev_match = true; break; - } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + } else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) { dev_match = true; break; } } #else - if (fi->fib_nh[0].nh_dev == dev) + if (fi->fib_nh[0].fib_nh_dev == dev) dev_match = true; #endif @@ -390,7 +390,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dev_match = fib_info_nh_uses_dev(res.fi, dev); if (dev_match) { - ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NH(res).fib_nh_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) @@ -402,7 +402,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) - ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NH(res).fib_nh_scope >= RT_SCOPE_HOST; } return ret; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 184940a06cb5..c1e16b52338b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -210,10 +210,10 @@ void fib_nh_release(struct net *net, struct fib_nh *fib_nh) if (fib_nh->nh_tclassid) net->ipv4.fib_num_tclassid_users--; #endif - if (fib_nh->nh_dev) - dev_put(fib_nh->nh_dev); + if (fib_nh->fib_nh_dev) + dev_put(fib_nh->fib_nh_dev); - lwtstate_put(fib_nh->nh_lwtstate); + lwtstate_put(fib_nh->fib_nh_lws); free_nh_exceptions(fib_nh); rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output); rt_fibinfo_free(&fib_nh->nh_rth_input); @@ -253,7 +253,7 @@ void fib_release_info(struct fib_info *fi) if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); change_nexthops(fi) { - if (!nexthop_nh->nh_dev) + if (!nexthop_nh->fib_nh_dev) continue; hlist_del(&nexthop_nh->nh_hash); } endfor_nexthops(fi) @@ -268,17 +268,17 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) const struct fib_nh *onh = ofi->fib_nh; for_nexthops(fi) { - if (nh->nh_oif != onh->nh_oif || - nh->nh_gw != onh->nh_gw || - nh->nh_scope != onh->nh_scope || + if (nh->fib_nh_oif != onh->fib_nh_oif || + nh->fib_nh_gw4 != onh->fib_nh_gw4 || + nh->fib_nh_scope != onh->fib_nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH - nh->nh_weight != onh->nh_weight || + nh->fib_nh_weight != onh->fib_nh_weight || #endif #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) || - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) + lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) || + ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; } endfor_nexthops(fi); @@ -303,7 +303,7 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi) val ^= (__force u32)fi->fib_prefsrc; val ^= fi->fib_priority; for_nexthops(fi) { - val ^= fib_devindex_hashfn(nh->nh_oif); + val ^= fib_devindex_hashfn(nh->fib_nh_oif); } endfor_nexthops(fi) return (val ^ (val >> 7) ^ (val >> 12)) & mask; @@ -352,9 +352,9 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) hash = fib_devindex_hashfn(dev->ifindex); head = &fib_info_devhash[hash]; hlist_for_each_entry(nh, head, nh_hash) { - if (nh->nh_dev == dev && - nh->nh_gw == gw && - !(nh->nh_flags & RTNH_F_DEAD)) { + if (nh->fib_nh_dev == dev && + nh->fib_nh_gw4 == gw && + !(nh->fib_nh_flags & RTNH_F_DEAD)) { spin_unlock(&fib_info_lock); return 0; } @@ -389,10 +389,10 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) /* grab encap info */ for_nexthops(fi) { - if (nh->nh_lwtstate) { + if (nh->fib_nh_lws) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( - nh->nh_lwtstate); + nh->fib_nh_lws); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } @@ -443,7 +443,7 @@ static int fib_detect_death(struct fib_info *fi, int order, struct neighbour *n; int state = NUD_NONE; - n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); + n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].fib_nh_gw4, fi->fib_dev); if (n) { state = n->nud_state; neigh_release(n); @@ -486,12 +486,12 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, if (err) goto lwt_failure; - nh->nh_lwtstate = lwtstate_get(lwtstate); + nh->fib_nh_lws = lwtstate_get(lwtstate); } - nh->nh_oif = cfg->fc_oif; - nh->nh_gw = cfg->fc_gw; - nh->nh_flags = cfg->fc_flags; + nh->fib_nh_oif = cfg->fc_oif; + nh->fib_nh_gw4 = cfg->fc_gw; + nh->fib_nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; @@ -499,7 +499,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, net->ipv4.fib_num_tclassid_users++; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH - nh->nh_weight = nh_weight; + nh->fib_nh_weight = nh_weight; #endif return 0; @@ -587,12 +587,12 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, } endfor_nexthops(fi); ret = -EINVAL; - if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) { + if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) { NL_SET_ERR_MSG(extack, "Nexthop device index does not match RTA_OIF"); goto errout; } - if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) { + if (cfg->fc_gw && fi->fib_nh->fib_nh_gw4 != cfg->fc_gw) { NL_SET_ERR_MSG(extack, "Nexthop gateway does not match RTA_GATEWAY"); goto errout; @@ -619,32 +619,32 @@ static void fib_rebalance(struct fib_info *fi) total = 0; for_nexthops(fi) { - if (nh->nh_flags & RTNH_F_DEAD) + if (nh->fib_nh_flags & RTNH_F_DEAD) continue; - if (ip_ignore_linkdown(nh->nh_dev) && - nh->nh_flags & RTNH_F_LINKDOWN) + if (ip_ignore_linkdown(nh->fib_nh_dev) && + nh->fib_nh_flags & RTNH_F_LINKDOWN) continue; - total += nh->nh_weight; + total += nh->fib_nh_weight; } endfor_nexthops(fi); w = 0; change_nexthops(fi) { int upper_bound; - if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) { upper_bound = -1; - } else if (ip_ignore_linkdown(nexthop_nh->nh_dev) && - nexthop_nh->nh_flags & RTNH_F_LINKDOWN) { + } else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) && + nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) { upper_bound = -1; } else { - w += nexthop_nh->nh_weight; + w += nexthop_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; } - atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); + atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound); } endfor_nexthops(fi); } #else /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -677,7 +677,7 @@ static int fib_encap_match(u16 encap_type, ret = lwtunnel_build_state(encap_type, encap, AF_INET, cfg, &lwtstate, extack); if (!ret) { - result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws); lwtstate_free(lwtstate); } @@ -706,8 +706,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, cfg->fc_flow != fi->fib_nh->nh_tclassid) return 1; #endif - if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && - (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) + if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->fib_nh_oif) && + (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->fib_nh_gw4)) return 0; return 1; } @@ -725,7 +725,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; - if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif) + if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif) return 1; attrlen = rtnh_attrlen(rtnh); @@ -733,7 +733,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla && nla_get_in_addr(nla) != nh->nh_gw) + if (nla && nla_get_in_addr(nla) != nh->fib_nh_gw4) return 1; #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); @@ -840,10 +840,10 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, struct net_device *dev; net = cfg->fc_nlinfo.nl_net; - if (nh->nh_gw) { + if (nh->fib_nh_gw4) { struct fib_result res; - if (nh->nh_flags & RTNH_F_ONLINK) { + if (nh->fib_nh_flags & RTNH_F_ONLINK) { unsigned int addr_type; if (cfg->fc_scope >= RT_SCOPE_LINK) { @@ -851,7 +851,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, "Nexthop has invalid scope"); return -EINVAL; } - dev = __dev_get_by_index(net, nh->nh_oif); + dev = __dev_get_by_index(net, nh->fib_nh_oif); if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); return -ENODEV; @@ -861,26 +861,27 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, "Nexthop device is not up"); return -ENETDOWN; } - addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw); + addr_type = inet_addr_type_dev_table(net, dev, + nh->fib_nh_gw4); if (addr_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); return -EINVAL; } if (!netif_carrier_ok(dev)) - nh->nh_flags |= RTNH_F_LINKDOWN; - nh->nh_dev = dev; + nh->fib_nh_flags |= RTNH_F_LINKDOWN; + nh->fib_nh_dev = dev; dev_hold(dev); - nh->nh_scope = RT_SCOPE_LINK; + nh->fib_nh_scope = RT_SCOPE_LINK; return 0; } rcu_read_lock(); { struct fib_table *tbl = NULL; struct flowi4 fl4 = { - .daddr = nh->nh_gw, + .daddr = nh->fib_nh_gw4, .flowi4_scope = cfg->fc_scope + 1, - .flowi4_oif = nh->nh_oif, + .flowi4_oif = nh->fib_nh_oif, .flowi4_iif = LOOPBACK_IFINDEX, }; @@ -917,9 +918,9 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } - nh->nh_scope = res.scope; - nh->nh_oif = FIB_RES_OIF(res); - nh->nh_dev = dev = FIB_RES_DEV(res); + nh->fib_nh_scope = res.scope; + nh->fib_nh_oif = FIB_RES_OIF(res); + nh->fib_nh_dev = dev = FIB_RES_DEV(res); if (!dev) { NL_SET_ERR_MSG(extack, "No egress device for nexthop gateway"); @@ -927,19 +928,19 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, } dev_hold(dev); if (!netif_carrier_ok(dev)) - nh->nh_flags |= RTNH_F_LINKDOWN; + nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; } else { struct in_device *in_dev; - if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { + if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { NL_SET_ERR_MSG(extack, "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); return -EINVAL; } rcu_read_lock(); err = -ENODEV; - in_dev = inetdev_by_index(net, nh->nh_oif); + in_dev = inetdev_by_index(net, nh->fib_nh_oif); if (!in_dev) goto out; err = -ENETDOWN; @@ -947,11 +948,11 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); goto out; } - nh->nh_dev = in_dev->dev; - dev_hold(nh->nh_dev); - nh->nh_scope = RT_SCOPE_HOST; - if (!netif_carrier_ok(nh->nh_dev)) - nh->nh_flags |= RTNH_F_LINKDOWN; + nh->fib_nh_dev = in_dev->dev; + dev_hold(nh->fib_nh_dev); + nh->fib_nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->fib_nh_dev)) + nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = 0; } out: @@ -1043,8 +1044,8 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) { - nh->nh_saddr = inet_select_addr(nh->nh_dev, - nh->nh_gw, + nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, + nh->fib_nh_gw4, nh->nh_parent->fib_scope); nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); @@ -1198,15 +1199,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg, "Route with host scope can not have multiple nexthops"); goto err_inval; } - if (nh->nh_gw) { + if (nh->fib_nh_gw4) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); goto err_inval; } - nh->nh_scope = RT_SCOPE_NOWHERE; - nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); + nh->fib_nh_scope = RT_SCOPE_NOWHERE; + nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif); err = -ENODEV; - if (!nh->nh_dev) + if (!nh->fib_nh_dev) goto failure; } else { int linkdown = 0; @@ -1215,7 +1216,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, err = fib_check_nh(cfg, nexthop_nh, extack); if (err != 0) goto failure; - if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) linkdown++; } endfor_nexthops(fi) if (linkdown == fi->fib_nhs) @@ -1257,9 +1258,9 @@ link_it: struct hlist_head *head; unsigned int hash; - if (!nexthop_nh->nh_dev) + if (!nexthop_nh->fib_nh_dev) continue; - hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex); + hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); head = &fib_info_devhash[hash]; hlist_add_head(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) @@ -1318,27 +1319,27 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { - if (fi->fib_nh->nh_gw && - nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) + if (fi->fib_nh->fib_nh_gw4 && + nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->fib_nh_gw4)) goto nla_put_failure; - if (fi->fib_nh->nh_oif && - nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) + if (fi->fib_nh->fib_nh_oif && + nla_put_u32(skb, RTA_OIF, fi->fib_nh->fib_nh_oif)) goto nla_put_failure; - if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { + if (fi->fib_nh->fib_nh_flags & RTNH_F_LINKDOWN) { rcu_read_lock(); - if (ip_ignore_linkdown(fi->fib_nh->nh_dev)) + if (ip_ignore_linkdown(fi->fib_nh->fib_nh_dev)) rtm->rtm_flags |= RTNH_F_DEAD; rcu_read_unlock(); } - if (fi->fib_nh->nh_flags & RTNH_F_OFFLOAD) + if (fi->fib_nh->fib_nh_flags & RTNH_F_OFFLOAD) rtm->rtm_flags |= RTNH_F_OFFLOAD; #ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid && nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) goto nla_put_failure; #endif - if (fi->fib_nh->nh_lwtstate && - lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0) + if (fi->fib_nh->fib_nh_lws && + lwtunnel_fill_encap(skb, fi->fib_nh->fib_nh_lws) < 0) goto nla_put_failure; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1355,26 +1356,26 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (!rtnh) goto nla_put_failure; - rtnh->rtnh_flags = nh->nh_flags & 0xFF; - if (nh->nh_flags & RTNH_F_LINKDOWN) { + rtnh->rtnh_flags = nh->fib_nh_flags & 0xFF; + if (nh->fib_nh_flags & RTNH_F_LINKDOWN) { rcu_read_lock(); - if (ip_ignore_linkdown(nh->nh_dev)) + if (ip_ignore_linkdown(nh->fib_nh_dev)) rtnh->rtnh_flags |= RTNH_F_DEAD; rcu_read_unlock(); } - rtnh->rtnh_hops = nh->nh_weight - 1; - rtnh->rtnh_ifindex = nh->nh_oif; + rtnh->rtnh_hops = nh->fib_nh_weight - 1; + rtnh->rtnh_ifindex = nh->fib_nh_oif; - if (nh->nh_gw && - nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw)) + if (nh->fib_nh_gw4 && + nla_put_in_addr(skb, RTA_GATEWAY, nh->fib_nh_gw4)) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif - if (nh->nh_lwtstate && - lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0) + if (nh->fib_nh_lws && + lwtunnel_fill_encap(skb, nh->fib_nh_lws) < 0) goto nla_put_failure; /* length of rtnetlink header + attributes */ @@ -1422,26 +1423,26 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) return ret; } -static int call_fib_nh_notifiers(struct fib_nh *fib_nh, +static int call_fib_nh_notifiers(struct fib_nh *nh, enum fib_event_type event_type) { - bool ignore_link_down = ip_ignore_linkdown(fib_nh->nh_dev); + bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev); struct fib_nh_notifier_info info = { - .fib_nh = fib_nh, + .fib_nh = nh, }; switch (event_type) { case FIB_EVENT_NH_ADD: - if (fib_nh->nh_flags & RTNH_F_DEAD) + if (nh->fib_nh_flags & RTNH_F_DEAD) break; - if (ignore_link_down && fib_nh->nh_flags & RTNH_F_LINKDOWN) + if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) break; - return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type, + return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: - if ((ignore_link_down && fib_nh->nh_flags & RTNH_F_LINKDOWN) || - (fib_nh->nh_flags & RTNH_F_DEAD)) - return call_fib4_notifiers(dev_net(fib_nh->nh_dev), + if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) || + (nh->fib_nh_flags & RTNH_F_DEAD)) + return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); default: break; @@ -1495,7 +1496,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { - if (nh->nh_dev == dev) + if (nh->fib_nh_dev == dev) nh_update_mtu(nh, dev->mtu, orig_mtu); } } @@ -1523,22 +1524,22 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) int dead; BUG_ON(!fi->fib_nhs); - if (nh->nh_dev != dev || fi == prev_fi) + if (nh->fib_nh_dev != dev || fi == prev_fi) continue; prev_fi = fi; dead = 0; change_nexthops(fi) { - if (nexthop_nh->nh_flags & RTNH_F_DEAD) + if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) dead++; - else if (nexthop_nh->nh_dev == dev && - nexthop_nh->nh_scope != scope) { + else if (nexthop_nh->fib_nh_dev == dev && + nexthop_nh->fib_nh_scope != scope) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: - nexthop_nh->nh_flags |= RTNH_F_DEAD; + nexthop_nh->fib_nh_flags |= RTNH_F_DEAD; /* fall through */ case NETDEV_CHANGE: - nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; + nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN; break; } call_fib_nh_notifiers(nexthop_nh, @@ -1547,7 +1548,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (event == NETDEV_UNREGISTER && - nexthop_nh->nh_dev == dev) { + nexthop_nh->fib_nh_dev == dev) { dead = fi->fib_nhs; break; } @@ -1607,8 +1608,8 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + if (!next_fi->fib_nh[0].fib_nh_gw4 || + next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); @@ -1679,24 +1680,24 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) int alive; BUG_ON(!fi->fib_nhs); - if (nh->nh_dev != dev || fi == prev_fi) + if (nh->fib_nh_dev != dev || fi == prev_fi) continue; prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & nh_flags)) { + if (!(nexthop_nh->fib_nh_flags & nh_flags)) { alive++; continue; } - if (!nexthop_nh->nh_dev || - !(nexthop_nh->nh_dev->flags & IFF_UP)) + if (!nexthop_nh->fib_nh_dev || + !(nexthop_nh->fib_nh_dev->flags & IFF_UP)) continue; - if (nexthop_nh->nh_dev != dev || + if (nexthop_nh->fib_nh_dev != dev || !__in_dev_get_rtnl(dev)) continue; alive++; - nexthop_nh->nh_flags &= ~nh_flags; + nexthop_nh->fib_nh_flags &= ~nh_flags; call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD); } endfor_nexthops(fi) @@ -1716,13 +1717,13 @@ static bool fib_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; - if (nh->nh_scope == RT_SCOPE_LINK) { + if (nh->fib_nh_scope == RT_SCOPE_LINK) { struct neighbour *n; rcu_read_lock_bh(); - n = __ipv4_neigh_lookup_noref(nh->nh_dev, - (__force u32)nh->nh_gw); + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); if (n) state = n->nud_state; @@ -1748,7 +1749,7 @@ void fib_select_multipath(struct fib_result *res, int hash) } } - if (hash > atomic_read(&nh->nh_upper_bound)) + if (hash > atomic_read(&nh->fib_nh_upper_bound)) continue; res->nh_sel = nhsel; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 656d3d19f112..1e3b492690f9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1472,15 +1472,15 @@ found: for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { const struct fib_nh *nh = &fi->fib_nh[nhsel]; - if (nh->nh_flags & RTNH_F_DEAD) + if (nh->fib_nh_flags & RTNH_F_DEAD) continue; - if (ip_ignore_linkdown(nh->nh_dev) && - nh->nh_flags & RTNH_F_LINKDOWN && + if (ip_ignore_linkdown(nh->fib_nh_dev) && + nh->fib_nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && - flp->flowi4_oif != nh->nh_oif) + flp->flowi4_oif != nh->fib_nh_oif) continue; } @@ -2651,7 +2651,7 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; - if (fi && fi->fib_nh->nh_gw) + if (fi && fi->fib_nh->fib_nh_gw4) flags |= RTF_GATEWAY; if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; @@ -2702,7 +2702,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) "%d\t%08X\t%d\t%u\t%u", fi->fib_dev ? fi->fib_dev->name : "*", prefix, - fi->fib_nh->nh_gw, flags, 0, 0, + fi->fib_nh->fib_nh_gw4, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f2688fce39e1..7977514d90f5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -644,7 +644,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, unsigned int i; int depth; - genid = fnhe_genid(dev_net(nh->nh_dev)); + genid = fnhe_genid(dev_net(nh->fib_nh_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); @@ -1356,7 +1356,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { struct fib_info *fi = res->fi; struct fib_nh *nh = &fi->fib_nh[res->nh_sel]; - struct net_device *dev = nh->nh_dev; + struct net_device *dev = nh->fib_nh_dev; u32 mtu = 0; if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || @@ -1374,7 +1374,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); - return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu); + return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, @@ -1531,8 +1531,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (fi) { struct fib_nh *nh = &FIB_RES_NH(*res); - if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { - rt->rt_gateway = nh->nh_gw; + if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) { + rt->rt_gateway = nh->fib_nh_gw4; rt->rt_uses_gateway = 1; } ip_dst_init_metrics(&rt->dst, fi->fib_metrics); @@ -1540,7 +1540,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); + rt->dst.lwtstate = lwtstate_get(nh->fib_nh_lws); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) @@ -2075,7 +2075,7 @@ local_input: if (do_cache) { struct fib_nh *nh = &FIB_RES_NH(*res); - rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); + rth->dst.lwtstate = lwtstate_get(nh->fib_nh_lws); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { WARN_ON(rth->dst.input == lwtunnel_input); rth->dst.lwtstate->orig_input = rth->dst.input; @@ -2264,8 +2264,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && - !(nh->nh_gw && - nh->nh_scope == RT_SCOPE_LINK))) { + !(nh->fib_nh_gw4 && + nh->fib_nh_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } -- cgit From f1741730dd18828fe3ea5fa91c22f41cf001c625 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:57 -0700 Subject: net: Add fib_nh_common and update fib_nh and fib6_nh Add fib_nh_common struct with common nexthop attributes. Convert fib_nh and fib6_nh to use it. Use macros to move existing fib_nh_* references to the new nh_common.nhc_*. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c1e16b52338b..e9992407863e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -468,6 +468,8 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, { int err = -ENOMEM; + nh->fib_nh_family = AF_INET; + nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); if (!nh->nh_pcpu_rth_output) goto err_out; @@ -490,7 +492,10 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, } nh->fib_nh_oif = cfg->fc_oif; - nh->fib_nh_gw4 = cfg->fc_gw; + if (cfg->fc_gw) { + nh->fib_nh_gw4 = cfg->fc_gw; + nh->fib_nh_has_gw = 1; + } nh->fib_nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID -- cgit From 979e276ebebd537782797c439c9cb42b6d3aba27 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:58 -0700 Subject: net: Use common nexthop init and release helpers With fib_nh_common in place, move common initialization and release code into helpers used by both ipv4 and ipv6. For the moment, the init is just the lwt encap and the release is both the netdev reference and the the lwt state reference. More will be added later. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 60 +++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 21 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e9992407863e..df777af7e278 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -204,16 +204,22 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) free_percpu(rtp); } +void fib_nh_common_release(struct fib_nh_common *nhc) +{ + if (nhc->nhc_dev) + dev_put(nhc->nhc_dev); + + lwtstate_put(nhc->nhc_lwtstate); +} +EXPORT_SYMBOL_GPL(fib_nh_common_release); + void fib_nh_release(struct net *net, struct fib_nh *fib_nh) { #ifdef CONFIG_IP_ROUTE_CLASSID if (fib_nh->nh_tclassid) net->ipv4.fib_num_tclassid_users--; #endif - if (fib_nh->fib_nh_dev) - dev_put(fib_nh->fib_nh_dev); - - lwtstate_put(fib_nh->fib_nh_lws); + fib_nh_common_release(&fib_nh->nh_common); free_nh_exceptions(fib_nh); rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output); rt_fibinfo_free(&fib_nh->nh_rth_input); @@ -462,6 +468,30 @@ static int fib_detect_death(struct fib_info *fi, int order, return 1; } +int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap, + u16 encap_type, void *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + if (encap) { + struct lwtunnel_state *lwtstate; + int err; + + if (encap_type == LWTUNNEL_ENCAP_NONE) { + NL_SET_ERR_MSG(extack, "LWT encap type not specified"); + return -EINVAL; + } + err = lwtunnel_build_state(encap_type, encap, nhc->nhc_family, + cfg, &lwtstate, extack); + if (err) + return err; + + nhc->nhc_lwtstate = lwtstate_get(lwtstate); + } + + return 0; +} +EXPORT_SYMBOL_GPL(fib_nh_common_init); + int fib_nh_init(struct net *net, struct fib_nh *nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack) @@ -474,22 +504,10 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, if (!nh->nh_pcpu_rth_output) goto err_out; - if (cfg->fc_encap) { - struct lwtunnel_state *lwtstate; - - err = -EINVAL; - if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) { - NL_SET_ERR_MSG(extack, "LWT encap type not specified"); - goto lwt_failure; - } - err = lwtunnel_build_state(cfg->fc_encap_type, - cfg->fc_encap, AF_INET, cfg, - &lwtstate, extack); - if (err) - goto lwt_failure; - - nh->fib_nh_lws = lwtstate_get(lwtstate); - } + err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap, + cfg->fc_encap_type, cfg, GFP_KERNEL, extack); + if (err) + goto init_failure; nh->fib_nh_oif = cfg->fc_oif; if (cfg->fc_gw) { @@ -508,7 +526,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, #endif return 0; -lwt_failure: +init_failure: rt_fibinfo_free_cpus(nh->nh_pcpu_rth_output); nh->nh_pcpu_rth_output = NULL; err_out: -- cgit From eb70a1ae2339769156f8ecddd7f6cd59ac994888 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2019 12:46:17 -0700 Subject: tcp: cleanup sk_tx_skb_cache before reuse TCP stack relies on the fact that a freshly allocated skb has skb->cb[] and skb_shinfo(skb)->tx_flags cleared. When recycling tx skb, we must ensure these fields are cleared. Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx") Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 82bd707c0347..603e770d59b3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -872,6 +872,8 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, sk->sk_tx_skb_cache = NULL; pskb_trim(skb, 0); INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + skb_shinfo(skb)->tx_flags = 0; + memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb)); return skb; } } -- cgit From 5869b8fadad0be948e310c456f111c4103f5fbfb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 31 Mar 2019 17:03:02 +0800 Subject: net: use rcu_dereference_protected to fetch sk_dst_cache in sk_destruct As Eric noticed, in .sk_destruct, sk->sk_dst_cache update is prevented, and no barrier is needed for this. So change to use rcu_dereference_protected() instead of rcu_dereference_check() to fetch sk_dst_cache in there. v1->v2: - no change, repost after net-next is open. Reported-by: Eric Dumazet Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 7f3a984ad618..08a8430f5647 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -160,7 +160,7 @@ void inet_sock_destruct(struct sock *sk) WARN_ON(sk->sk_forward_alloc); kfree(rcu_dereference_protected(inet->inet_opt, 1)); - dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); + dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(sk->sk_rx_dst); sk_refcnt_debug_dec(sk); } -- cgit From 0af7e7c128eb33f2dc16ed088ced00675785d628 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:54 -0700 Subject: ipv4: Update fib_table_lookup tracepoint to take common nexthop Update fib_table_lookup tracepoint to take a fib_nh_common struct and dump the v6 gateway address if the nexthop uses it. Over the years saddr has not proven useful and the output of the tracepoint produces very long lines. Since saddr is not part of fib_nh_common, drop it. If it needs to be added later, fib_nh which contains saddr can be obtained from a fib_nh_common via container_of. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1e3b492690f9..13b3327206f9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1498,7 +1498,7 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif - trace_fib_table_lookup(tb->tb_id, flp, nh, err); + trace_fib_table_lookup(tb->tb_id, flp, &nh->nh_common, err); return err; } -- cgit From eba618abacade71669eb67c3360eecfee810cc88 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:55 -0700 Subject: ipv4: Add fib_nh_common to fib_result Most of the ipv4 code only needs data from fib_nh_common. Add fib_nh_common selection to fib_result and update users to use it. Right now, fib_nh_common in fib_result will point to a fib_nh struct that is embedded within a fib_info: fib_info --> fib_nh fib_nh ... fib_nh ^ fib_result->nhc ----+ Later, nhc can point to a fib_nh within a nexthop struct: fib_info --> nexthop --> fib_nh ^ fib_result->nhc ---------------+ or for a nexthop group: fib_info --> nexthop --> nexthop --> fib_nh nexthop --> fib_nh ... nexthop --> fib_nh ^ fib_result->nhc ---------------------------+ In all cases nhsel within fib_result will point to which leg in the multipath route is used. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 6 ++--- net/ipv4/fib_lookup.h | 1 + net/ipv4/fib_semantics.c | 25 ++++++++++++++++---- net/ipv4/fib_trie.c | 13 ++++++----- net/ipv4/route.c | 60 ++++++++++++++++++++++++++++++++---------------- 5 files changed, 72 insertions(+), 33 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ffbe24397dbe..15f779bd26b3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -307,7 +307,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) - return FIB_RES_PREFSRC(net, res); + return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } @@ -390,7 +390,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dev_match = fib_info_nh_uses_dev(res.fi, dev); if (dev_match) { - ret = FIB_RES_NH(res).fib_nh_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) @@ -402,7 +402,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) - ret = FIB_RES_NH(res).fib_nh_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index e6ff282bb7f4..7945f0534db7 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -45,6 +45,7 @@ static inline void fib_result_assign(struct fib_result *res, { /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; + res->nhc = fib_info_nhc(fi, 0); } struct fib_prop { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index df777af7e278..42666a409da0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1075,6 +1075,21 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) return nh->nh_saddr; } +__be32 fib_result_prefsrc(struct net *net, struct fib_result *res) +{ + struct fib_nh_common *nhc = res->nhc; + struct fib_nh *nh; + + if (res->fi->fib_prefsrc) + return res->fi->fib_prefsrc; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) + return nh->nh_saddr; + + return fib_info_update_nh_saddr(net, nh); +} + static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || @@ -1762,20 +1777,22 @@ void fib_select_multipath(struct fib_result *res, int hash) struct net *net = fi->fib_net; bool first = false; - for_nexthops(fi) { + change_nexthops(fi) { if (net->ipv4.sysctl_fib_multipath_use_neigh) { - if (!fib_good_nh(nh)) + if (!fib_good_nh(nexthop_nh)) continue; if (!first) { res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; first = true; } } - if (hash > atomic_read(&nh->fib_nh_upper_bound)) + if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) continue; res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; return; } endfor_nexthops(fi); } @@ -1802,5 +1819,5 @@ void fib_select_path(struct net *net, struct fib_result *res, check_saddr: if (!fl4->saddr) - fl4->saddr = FIB_RES_PREFSRC(net, *res); + fl4->saddr = fib_result_prefsrc(net, res); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 13b3327206f9..334f723bdf80 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1470,17 +1470,17 @@ found: if (fi->fib_flags & RTNH_F_DEAD) continue; for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; + struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); - if (nh->fib_nh_flags & RTNH_F_DEAD) + if (nhc->nhc_flags & RTNH_F_DEAD) continue; - if (ip_ignore_linkdown(nh->fib_nh_dev) && - nh->fib_nh_flags & RTNH_F_LINKDOWN && + if (ip_ignore_linkdown(nhc->nhc_dev) && + nhc->nhc_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && - flp->flowi4_oif != nh->fib_nh_oif) + flp->flowi4_oif != nhc->nhc_oif) continue; } @@ -1490,6 +1490,7 @@ found: res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; + res->nhc = nhc; res->type = fa->fa_type; res->scope = fi->fib_scope; res->fi = fi; @@ -1498,7 +1499,7 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif - trace_fib_table_lookup(tb->tb_id, flp, &nh->nh_common, err); + trace_fib_table_lookup(tb->tb_id, flp, nhc, err); return err; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7977514d90f5..f3f2adf630d4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -778,8 +778,10 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { - struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh_common *nhc = FIB_RES_NHC(res); + struct fib_nh *nh; + nh = container_of(nhc, struct fib_nh, nh_common); update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); @@ -1027,8 +1029,10 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) rcu_read_lock(); if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { - struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh_common *nhc = FIB_RES_NHC(res); + struct fib_nh *nh; + nh = container_of(nhc, struct fib_nh, nh_common); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock, jiffies + ip_rt_mtu_expires); } @@ -1235,7 +1239,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) - src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); + src = fib_result_prefsrc(dev_net(rt->dst.dev), &res); else src = inet_select_addr(rt->dst.dev, rt_nexthop(rt, iph->daddr), @@ -1354,9 +1358,9 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { + struct fib_nh_common *nhc = res->nhc; + struct net_device *dev = nhc->nhc_dev; struct fib_info *fi = res->fi; - struct fib_nh *nh = &fi->fib_nh[res->nh_sel]; - struct net_device *dev = nh->fib_nh_dev; u32 mtu = 0; if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || @@ -1364,6 +1368,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) mtu = fi->fib_mtu; if (likely(!mtu)) { + struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); struct fib_nh_exception *fnhe; fnhe = find_exception(nh, daddr); @@ -1374,7 +1379,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); - return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); + return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, @@ -1529,7 +1534,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, bool cached = false; if (fi) { - struct fib_nh *nh = &FIB_RES_NH(*res); + struct fib_nh_common *nhc = FIB_RES_NHC(*res); + struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) { rt->rt_gateway = nh->fib_nh_gw4; @@ -1699,15 +1705,18 @@ static int __mkroute_input(struct sk_buff *skb, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { + struct fib_nh_common *nhc = FIB_RES_NHC(*res); + struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; + struct fib_nh *nh; int err; struct in_device *out_dev; bool do_cache; u32 itag = 0; /* get a working reference to the output device */ - out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); + out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return -EINVAL; @@ -1724,10 +1733,13 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && - skb->protocol == htons(ETH_P_IP) && - (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) - IPCB(skb)->flags |= IPSKB_DOREDIRECT; + skb->protocol == htons(ETH_P_IP)) { + __be32 gw = nhc->nhc_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; + + if (IN_DEV_SHARED_MEDIA(out_dev) || + inet_addr_onlink(out_dev, saddr, gw)) + IPCB(skb)->flags |= IPSKB_DOREDIRECT; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -1744,12 +1756,13 @@ static int __mkroute_input(struct sk_buff *skb, } } - fnhe = find_exception(&FIB_RES_NH(*res), daddr); + nh = container_of(nhc, struct fib_nh, nh_common); + fnhe = find_exception(nh, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + rth = rcu_dereference(nh->nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -2043,7 +2056,11 @@ local_input: do_cache = false; if (res->fi) { if (!itag) { - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + struct fib_nh_common *nhc = FIB_RES_NHC(*res); + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + rth = rcu_dereference(nh->nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -2073,15 +2090,17 @@ local_input: } if (do_cache) { - struct fib_nh *nh = &FIB_RES_NH(*res); + struct fib_nh_common *nhc = FIB_RES_NHC(*res); + struct fib_nh *nh; - rth->dst.lwtstate = lwtstate_get(nh->fib_nh_lws); + rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { WARN_ON(rth->dst.input == lwtunnel_input); rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } + nh = container_of(nhc, struct fib_nh, nh_common); if (unlikely(!rt_cache_route(nh, rth))) rt_add_uncached_list(rth); } @@ -2253,8 +2272,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = NULL; do_cache &= fi != NULL; if (fi) { + struct fib_nh_common *nhc = FIB_RES_NHC(*res); + struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); struct rtable __rcu **prth; - struct fib_nh *nh = &FIB_RES_NH(*res); fnhe = find_exception(nh, fl4->daddr); if (!do_cache) @@ -2264,8 +2284,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && - !(nh->fib_nh_gw4 && - nh->fib_nh_scope == RT_SCOPE_LINK))) { + !(nhc->nhc_has_gw && + nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } -- cgit From b0f60193632e4eab4c9663101bb435dd7bc27ae8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:56 -0700 Subject: ipv4: Refactor nexthop attributes in fib_dump_info Similar to ipv6, move addition of nexthop attributes to dump message into helpers that are called for both single path and multipath routes. Align the new helpers to the IPv6 variant which most notably means computing the flags argument based on settings in nh_flags. The RTA_FLOW argument is unique to IPv4, so it is appended after the new fib_nexthop_info helper. The intent of a later patch is to make both fib_nexthop_info and fib_add_nexthop usable for both IPv4 and IPv6. This patch is stepping stone in that direction. Signed-off-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 166 ++++++++++++++++++++++++++++++----------------- 1 file changed, 107 insertions(+), 59 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 42666a409da0..32fb0123d881 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1317,6 +1317,103 @@ failure: return ERR_PTR(err); } +static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh *nh, + unsigned int *flags, bool skip_oif) +{ + if (nh->fib_nh_flags & RTNH_F_DEAD) + *flags |= RTNH_F_DEAD; + + if (nh->fib_nh_flags & RTNH_F_LINKDOWN) { + *flags |= RTNH_F_LINKDOWN; + + rcu_read_lock(); + if (ip_ignore_linkdown(nh->fib_nh_dev)) + *flags |= RTNH_F_DEAD; + rcu_read_unlock(); + } + + if (nh->fib_nh_gw4 && + nla_put_in_addr(skb, RTA_GATEWAY, nh->fib_nh_gw4)) + goto nla_put_failure; + + *flags |= (nh->fib_nh_flags & RTNH_F_ONLINK); + if (nh->fib_nh_flags & RTNH_F_OFFLOAD) + *flags |= RTNH_F_OFFLOAD; + + if (!skip_oif && nh->fib_nh_dev && + nla_put_u32(skb, RTA_OIF, nh->fib_nh_dev->ifindex)) + goto nla_put_failure; + + if (nh->fib_nh_lws && + lwtunnel_fill_encap(skb, nh->fib_nh_lws) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH +static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh *nh) +{ + const struct net_device *dev = nh->fib_nh_dev; + struct rtnexthop *rtnh; + unsigned int flags = 0; + + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); + if (!rtnh) + goto nla_put_failure; + + rtnh->rtnh_hops = nh->fib_nh_weight - 1; + rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; + + if (fib_nexthop_info(skb, nh, &flags, true) < 0) + goto nla_put_failure; + + rtnh->rtnh_flags = flags; + + /* length of rtnetlink header + attributes */ + rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) +{ + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + for_nexthops(fi) { + if (fib_add_nexthop(skb, nh) < 0) + goto nla_put_failure; +#ifdef CONFIG_IP_ROUTE_CLASSID + if (nh->nh_tclassid && + nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) + goto nla_put_failure; +#endif + } endfor_nexthops(fi); + + nla_nest_end(skb, mp); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} +#else +static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) +{ + return 0; +} +#endif + int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) @@ -1357,72 +1454,23 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { - if (fi->fib_nh->fib_nh_gw4 && - nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->fib_nh_gw4)) - goto nla_put_failure; - if (fi->fib_nh->fib_nh_oif && - nla_put_u32(skb, RTA_OIF, fi->fib_nh->fib_nh_oif)) + struct fib_nh *nh = &fi->fib_nh[0]; + unsigned int flags = 0; + + if (fib_nexthop_info(skb, nh, &flags, false) < 0) goto nla_put_failure; - if (fi->fib_nh->fib_nh_flags & RTNH_F_LINKDOWN) { - rcu_read_lock(); - if (ip_ignore_linkdown(fi->fib_nh->fib_nh_dev)) - rtm->rtm_flags |= RTNH_F_DEAD; - rcu_read_unlock(); - } - if (fi->fib_nh->fib_nh_flags & RTNH_F_OFFLOAD) - rtm->rtm_flags |= RTNH_F_OFFLOAD; + + rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID - if (fi->fib_nh[0].nh_tclassid && - nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) + if (nh->nh_tclassid && + nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif - if (fi->fib_nh->fib_nh_lws && - lwtunnel_fill_encap(skb, fi->fib_nh->fib_nh_lws) < 0) + } else { + if (fib_add_multipath(skb, fi) < 0) goto nla_put_failure; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (fi->fib_nhs > 1) { - struct rtnexthop *rtnh; - struct nlattr *mp; - - mp = nla_nest_start(skb, RTA_MULTIPATH); - if (!mp) - goto nla_put_failure; - for_nexthops(fi) { - rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); - if (!rtnh) - goto nla_put_failure; - - rtnh->rtnh_flags = nh->fib_nh_flags & 0xFF; - if (nh->fib_nh_flags & RTNH_F_LINKDOWN) { - rcu_read_lock(); - if (ip_ignore_linkdown(nh->fib_nh_dev)) - rtnh->rtnh_flags |= RTNH_F_DEAD; - rcu_read_unlock(); - } - rtnh->rtnh_hops = nh->fib_nh_weight - 1; - rtnh->rtnh_ifindex = nh->fib_nh_oif; - - if (nh->fib_nh_gw4 && - nla_put_in_addr(skb, RTA_GATEWAY, nh->fib_nh_gw4)) - goto nla_put_failure; -#ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid && - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; -#endif - if (nh->fib_nh_lws && - lwtunnel_fill_encap(skb, nh->fib_nh_lws) < 0) - goto nla_put_failure; - - /* length of rtnetlink header + attributes */ - rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; - } endfor_nexthops(fi); - - nla_nest_end(skb, mp); - } -#endif nlmsg_end(skb, nlh); return 0; -- cgit From c236419981224d37a5d0a6e7781f73479d4a030e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:57 -0700 Subject: ipv4: Change fib_nexthop_info and fib_add_nexthop to take fib_nh_common With the exception of the nexthop weight, the nexthop attributes used by fib_nexthop_info and fib_add_nexthop come from the fib_nh_common struct. Update both to use it and change fib_nexthop_info to check the family as needed. nexthop weight comes from the common struct for existing use cases, but for nexthop groups the weight is outside of the fib_nh_common to allow the same nexthop definition to be used in multiple groups with different weights. Signed-off-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 52 +++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 20 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 32fb0123d881..33a43965a232 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1317,35 +1317,44 @@ failure: return ERR_PTR(err); } -static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh *nh, +static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, unsigned int *flags, bool skip_oif) { - if (nh->fib_nh_flags & RTNH_F_DEAD) + if (nhc->nhc_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; - if (nh->fib_nh_flags & RTNH_F_LINKDOWN) { + if (nhc->nhc_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; rcu_read_lock(); - if (ip_ignore_linkdown(nh->fib_nh_dev)) - *flags |= RTNH_F_DEAD; + switch (nhc->nhc_family) { + case AF_INET: + if (ip_ignore_linkdown(nhc->nhc_dev)) + *flags |= RTNH_F_DEAD; + break; + } rcu_read_unlock(); } - if (nh->fib_nh_gw4 && - nla_put_in_addr(skb, RTA_GATEWAY, nh->fib_nh_gw4)) - goto nla_put_failure; + if (nhc->nhc_has_gw) { + switch (nhc->nhc_family) { + case AF_INET: + if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) + goto nla_put_failure; + break; + } + } - *flags |= (nh->fib_nh_flags & RTNH_F_ONLINK); - if (nh->fib_nh_flags & RTNH_F_OFFLOAD) + *flags |= (nhc->nhc_flags & RTNH_F_ONLINK); + if (nhc->nhc_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; - if (!skip_oif && nh->fib_nh_dev && - nla_put_u32(skb, RTA_OIF, nh->fib_nh_dev->ifindex)) + if (!skip_oif && nhc->nhc_dev && + nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) goto nla_put_failure; - if (nh->fib_nh_lws && - lwtunnel_fill_encap(skb, nh->fib_nh_lws) < 0) + if (nhc->nhc_lwtstate && + lwtunnel_fill_encap(skb, nhc->nhc_lwtstate) < 0) goto nla_put_failure; return 0; @@ -1355,9 +1364,10 @@ nla_put_failure: } #ifdef CONFIG_IP_ROUTE_MULTIPATH -static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh *nh) +static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, + int nh_weight) { - const struct net_device *dev = nh->fib_nh_dev; + const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; unsigned int flags = 0; @@ -1365,10 +1375,10 @@ static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh *nh) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = nh->fib_nh_weight - 1; + rtnh->rtnh_hops = nh_weight - 1; rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; - if (fib_nexthop_info(skb, nh, &flags, true) < 0) + if (fib_nexthop_info(skb, nhc, &flags, true) < 0) goto nla_put_failure; rtnh->rtnh_flags = flags; @@ -1381,7 +1391,9 @@ static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh *nh) nla_put_failure: return -EMSGSIZE; } +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { struct nlattr *mp; @@ -1391,7 +1403,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) goto nla_put_failure; for_nexthops(fi) { - if (fib_add_nexthop(skb, nh) < 0) + if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid && @@ -1457,7 +1469,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, struct fib_nh *nh = &fi->fib_nh[0]; unsigned int flags = 0; - if (fib_nexthop_info(skb, nh, &flags, false) < 0) + if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; -- cgit From c0a720770c01e67374b15f348f17a52409f6545c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:58 -0700 Subject: ipv6: Flip to fib_nexthop_info Export fib_nexthop_info and fib_add_nexthop for use by IPv6 code. Remove rt6_nexthop_info and rt6_add_nexthop in favor of the IPv4 versions. Update fib_nexthop_info for IPv6 linkdown check and RTA_GATEWAY for AF_INET6. Signed-off-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 33a43965a232..8e0cb1687a74 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "fib_lookup.h" @@ -1317,8 +1318,8 @@ failure: return ERR_PTR(err); } -static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, - unsigned int *flags, bool skip_oif) +int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, + unsigned int *flags, bool skip_oif) { if (nhc->nhc_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; @@ -1332,6 +1333,10 @@ static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc if (ip_ignore_linkdown(nhc->nhc_dev)) *flags |= RTNH_F_DEAD; break; + case AF_INET6: + if (ip6_ignore_linkdown(nhc->nhc_dev)) + *flags |= RTNH_F_DEAD; + break; } rcu_read_unlock(); } @@ -1342,6 +1347,11 @@ static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) goto nla_put_failure; break; + case AF_INET6: + if (nla_put_in6_addr(skb, RTA_GATEWAY, + &nhc->nhc_gw.ipv6) < 0) + goto nla_put_failure; + break; } } @@ -1362,10 +1372,11 @@ static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc nla_put_failure: return -EMSGSIZE; } +EXPORT_SYMBOL_GPL(fib_nexthop_info); -#ifdef CONFIG_IP_ROUTE_MULTIPATH -static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, - int nh_weight) +#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) +int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, + int nh_weight) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; @@ -1391,6 +1402,7 @@ static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, nla_put_failure: return -EMSGSIZE; } +EXPORT_SYMBOL_GPL(fib_add_nexthop); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH -- cgit From 942f146a63cecaa6d7fb1e8d255efab217126c50 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 4 Apr 2019 13:54:20 +0200 Subject: net: use kfree_skb_list() from ip_do_fragment() Just like 46cfd725c377 ("net: use kfree_skb_list() helper in more places"). Signed-off-by: Pablo Neira Ayuso Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c80188875f39..10b35328cfbc 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -693,11 +693,8 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - while (frag) { - skb = frag->next; - kfree_skb(frag); - frag = skb; - } + kfree_skb_list(frag); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; -- cgit From f6fee16dbbe3fe4f942858192b88507c1f2f21ce Mon Sep 17 00:00:00 2001 From: "Tilmans, Olivier (Nokia - BE/Antwerp)" Date: Wed, 3 Apr 2019 13:49:42 +0000 Subject: tcp: Accept ECT on SYN in the presence of RFC8311 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux currently disable ECN for incoming connections when the SYN requests ECN and the IP header has ECT(0)/ECT(1) set, as some networks were reportedly mangling the ToS byte, hence could later trigger false congestion notifications. RFC8311 §4.3 relaxes RFC3168's requirements such that ECT can be set one TCP control packets (including SYNs). The main benefit of this is the decreased probability of losing a SYN in a congested ECN-capable network (i.e., it avoids the initial 1s timeout). Additionally, this allows the development of newer TCP extensions, such as AccECN. This patch relaxes the previous check, by enabling ECN on incoming connections using SYN+ECT if at least one bit of the reserved flags of the TCP header is set. Such bit would indicate that the sender of the SYN is using a newer TCP feature than what the host implements, such as AccECN, and is thus implementing RFC8311. This enables end-hosts not supporting such extensions to still negociate ECN, and to have some of the benefits of using ECN on control packets. Signed-off-by: Olivier Tilmans Suggested-by: Bob Briscoe Cc: Koen De Schepper Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5dfbc333e79a..6660ce2a7333 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6263,6 +6263,11 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) * congestion control: Linux DCTCP asserts ECT on all packets, * including SYN, which is most optimal solution; however, * others, such as FreeBSD do not. + * + * Exception: At least one of the reserved bits of the TCP header (th->res1) is + * set, indicating the use of a future TCP extension (such as AccECN). See + * RFC8311 §4.3 which updates RFC3168 to allow the development of such + * extensions. */ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, @@ -6282,7 +6287,7 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; -- cgit From d1edc085559744fbda7a55e97eeae8bd6135a11b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 4 Apr 2019 15:46:03 +0100 Subject: tcp: remove redundant check on tskb The non-null check on tskb is always false because it is in an else path of a check on tskb and hence tskb is null in this code block. This is check is therefore redundant and can be removed as well as the label coalesc. if (tsbk) { ... } else { ... if (unlikely(!skb)) { if (tskb) /* can never be true, redundant code */ goto coalesc; return; } } Addresses-Coverity: ("Logically dead code") Signed-off-by: Colin Ian King Reviewed-by: Mukesh Ojha Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e265d1aeeb66..32061928b054 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3088,7 +3088,6 @@ void tcp_send_fin(struct sock *sk) tskb = skb_rb_last(&sk->tcp_rtx_queue); if (tskb) { -coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; @@ -3104,11 +3103,9 @@ coalesce: } } else { skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); - if (unlikely(!skb)) { - if (tskb) - goto coalesce; + if (unlikely(!skb)) return; - } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); -- cgit From 8f0db018006a421956965e1149234c4e8db718ee Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 2 Apr 2019 10:07:45 +1100 Subject: rhashtable: use bit_spin_locks to protect hash bucket. This patch changes rhashtables to use a bit_spin_lock on BIT(1) of the bucket pointer to lock the hash chain for that bucket. The benefits of a bit spin_lock are: - no need to allocate a separate array of locks. - no need to have a configuration option to guide the choice of the size of this array - locking cost is often a single test-and-set in a cache line that will have to be loaded anyway. When inserting at, or removing from, the head of the chain, the unlock is free - writing the new address in the bucket head implicitly clears the lock bit. For __rhashtable_insert_fast() we ensure this always happens when adding a new key. - even when lockings costs 2 updates (lock and unlock), they are in a cacheline that needs to be read anyway. The cost of using a bit spin_lock is a little bit of code complexity, which I think is quite manageable. Bit spin_locks are sometimes inappropriate because they are not fair - if multiple CPUs repeatedly contend of the same lock, one CPU can easily be starved. This is not a credible situation with rhashtable. Multiple CPUs may want to repeatedly add or remove objects, but they will typically do so at different buckets, so they will attempt to acquire different locks. As we have more bit-locks than we previously had spinlocks (by at least a factor of two) we can expect slightly less contention to go with the slightly better cache behavior and reduced memory consumption. To enhance type checking, a new struct is introduced to represent the pointer plus lock-bit that is stored in the bucket-table. This is "struct rhash_lock_head" and is empty. A pointer to this needs to be cast to either an unsigned lock, or a "struct rhash_head *" to be useful. Variables of this type are most often called "bkt". Previously "pprev" would sometimes point to a bucket, and sometimes a ->next pointer in an rhash_head. As these are now different types, pprev is NULL when it would have pointed to the bucket. In that case, 'blk' is used, together with correct locking protocol. Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2c931120c494..9a3f13edc98e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -373,7 +373,6 @@ static const struct rhashtable_params ipmr_rht_params = { .key_offset = offsetof(struct mfc_cache, cmparg), .key_len = sizeof(struct mfc_cache_cmp_arg), .nelem_hint = 3, - .locks_mul = 1, .obj_cmpfn = ipmr_hash_cmp, .automatic_shrinking = true, }; -- cgit From b262a69582a4676c7378a73077b7bb186c7c5b2a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:22 +0100 Subject: xfrm: place af number into xfrm_mode struct This will be useful to know if we're supposed to decode ipv4 or ipv6. While at it, make the unregister function return void, all module_exit functions did just BUG(); there is never a point in doing error checks if there is no way to handle such error. Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_mode_beet.c | 8 +++----- net/ipv4/xfrm4_mode_transport.c | 8 +++----- net/ipv4/xfrm4_mode_tunnel.c | 8 +++----- 3 files changed, 9 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 856d2dfdb44b..a2e3b52ae46c 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -134,19 +134,17 @@ static struct xfrm_mode xfrm4_beet_mode = { .owner = THIS_MODULE, .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET, }; static int __init xfrm4_beet_init(void) { - return xfrm_register_mode(&xfrm4_beet_mode, AF_INET); + return xfrm_register_mode(&xfrm4_beet_mode); } static void __exit xfrm4_beet_exit(void) { - int err; - - err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET); - BUG_ON(err); + xfrm_unregister_mode(&xfrm4_beet_mode); } module_init(xfrm4_beet_init); diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 1ad2c2c4e250..7c5443f797cf 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -93,19 +93,17 @@ static struct xfrm_mode xfrm4_transport_mode = { .xmit = xfrm4_transport_xmit, .owner = THIS_MODULE, .encap = XFRM_MODE_TRANSPORT, + .family = AF_INET, }; static int __init xfrm4_transport_init(void) { - return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); + return xfrm_register_mode(&xfrm4_transport_mode); } static void __exit xfrm4_transport_exit(void) { - int err; - - err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); - BUG_ON(err); + xfrm_unregister_mode(&xfrm4_transport_mode); } module_init(xfrm4_transport_init); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 2a9764bd1719..cfc6b6d39755 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -131,19 +131,17 @@ static struct xfrm_mode xfrm4_tunnel_mode = { .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET, }; static int __init xfrm4_mode_tunnel_init(void) { - return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); + return xfrm_register_mode(&xfrm4_tunnel_mode); } static void __exit xfrm4_mode_tunnel_exit(void) { - int err; - - err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); - BUG_ON(err); + xfrm_unregister_mode(&xfrm4_tunnel_mode); } module_init(xfrm4_mode_tunnel_init); -- cgit From b45714b164cac71f503ad73654b3c880cb9f2590 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:23 +0100 Subject: xfrm: prefer family stored in xfrm_mode struct Now that we have the family available directly in the xfrm_mode struct, we can use that and avoid one extra dereference. Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index a8474799fb79..3f3f6d6be318 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -137,7 +137,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) } } - family = inner_mode->afinfo->family; + family = inner_mode->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); -- cgit From c2d305e51038167dd9de8d476c72f667d84cad8b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:24 +0100 Subject: xfrm: remove input indirection from xfrm_mode No need for any indirection or abstraction here, both functions are pretty much the same and quite small, they also have no external dependencies. xfrm_prepare_input can then be made static. With allmodconfig build, size increase of vmlinux is 25 byte: Before: text data bss dec filename 15730207 6936924 4046908 26714039 vmlinux After: 15730208 6936948 4046908 26714064 vmlinux v2: Fix INET_XFRM_MODE_TRANSPORT name in is-enabled test (Sabrina Dubroca) change copied comment to refer to transport and network header, not skb->{h,nh}, which don't exist anymore. (Sabrina) make xfrm_prepare_input static (Eyal Birger) Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_mode_beet.c | 1 - net/ipv4/xfrm4_mode_transport.c | 23 ----------------------- net/ipv4/xfrm4_mode_tunnel.c | 1 - 3 files changed, 25 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index a2e3b52ae46c..264c4c9e2473 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -128,7 +128,6 @@ out: static struct xfrm_mode xfrm4_beet_mode = { .input2 = xfrm4_beet_input, - .input = xfrm_prepare_input, .output2 = xfrm4_beet_output, .output = xfrm4_prepare_output, .owner = THIS_MODULE, diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 7c5443f797cf..c943d710f302 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -35,28 +35,6 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -/* Remove encapsulation header. - * - * The IP header will be moved over the top of the encapsulation header. - * - * On entry, skb->h shall point to where the IP header should be and skb->nh - * shall be set to where the IP header currently is. skb->data shall point - * to the start of the payload. - */ -static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) -{ - int ihl = skb->data - skb_transport_header(skb); - - if (skb->transport_header != skb->network_header) { - memmove(skb_transport_header(skb), - skb_network_header(skb), ihl); - skb->network_header = skb->transport_header; - } - ip_hdr(skb)->tot_len = htons(skb->len + ihl); - skb_reset_transport_header(skb); - return 0; -} - static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) @@ -87,7 +65,6 @@ static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb) } static struct xfrm_mode xfrm4_transport_mode = { - .input = xfrm4_transport_input, .output = xfrm4_transport_output, .gso_segment = xfrm4_transport_gso_segment, .xmit = xfrm4_transport_xmit, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index cfc6b6d39755..678b91754b5e 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -123,7 +123,6 @@ static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) static struct xfrm_mode xfrm4_tunnel_mode = { .input2 = xfrm4_mode_tunnel_input, - .input = xfrm_prepare_input, .output2 = xfrm4_mode_tunnel_output, .output = xfrm4_prepare_output, .gso_segment = xfrm4_mode_tunnel_gso_segment, -- cgit From 0c620e97b3490890facbbe06d5deed9b024de255 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:25 +0100 Subject: xfrm: remove output indirection from xfrm_mode Same is input indirection. Only exception: we need to export xfrm_outer_mode_output for pktgen. Increases size of vmlinux by about 163 byte: Before: text data bss dec filename 15730208 6936948 4046908 26714064 vmlinux After: 15730311 6937008 4046908 26714227 vmlinux xfrm_inner_extract_output has no more external callers, make it static. v2: add IS_ENABLED(IPV6) guard in xfrm6_prepare_output add two missing breaks in xfrm_outer_mode_output (Sabrina Dubroca) add WARN_ON_ONCE for 'call AF_INET6 related output function, but CONFIG_IPV6=n' case. make xfrm_inner_extract_output static Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_mode_beet.c | 1 - net/ipv4/xfrm4_mode_transport.c | 22 ---------------------- net/ipv4/xfrm4_mode_tunnel.c | 1 - net/ipv4/xfrm4_output.c | 15 --------------- 4 files changed, 39 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 264c4c9e2473..f02cc8237d54 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -129,7 +129,6 @@ out: static struct xfrm_mode xfrm4_beet_mode = { .input2 = xfrm4_beet_input, .output2 = xfrm4_beet_output, - .output = xfrm4_prepare_output, .owner = THIS_MODULE, .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index c943d710f302..6f8cf09ff0ef 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -14,27 +14,6 @@ #include #include -/* Add encapsulation header. - * - * The IP header will be moved forward to make space for the encapsulation - * header. - */ -static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - int ihl = iph->ihl * 4; - - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); - - skb_set_network_header(skb, -x->props.header_len); - skb->mac_header = skb->network_header + - offsetof(struct iphdr, protocol); - skb->transport_header = skb->network_header + ihl; - __skb_pull(skb, ihl); - memmove(skb_network_header(skb), iph, ihl); - return 0; -} - static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) @@ -65,7 +44,6 @@ static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb) } static struct xfrm_mode xfrm4_transport_mode = { - .output = xfrm4_transport_output, .gso_segment = xfrm4_transport_gso_segment, .xmit = xfrm4_transport_xmit, .owner = THIS_MODULE, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 678b91754b5e..823bc54b47de 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -124,7 +124,6 @@ static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) static struct xfrm_mode xfrm4_tunnel_mode = { .input2 = xfrm4_mode_tunnel_input, .output2 = xfrm4_mode_tunnel_output, - .output = xfrm4_prepare_output, .gso_segment = xfrm4_mode_tunnel_gso_segment, .xmit = xfrm4_mode_tunnel_xmit, .owner = THIS_MODULE, diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index be980c195fc5..6802d1aee424 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -58,21 +58,6 @@ int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) return xfrm4_extract_header(skb); } -int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - - err = xfrm_inner_extract_output(x, skb); - if (err) - return err; - - IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; - skb->protocol = htons(ETH_P_IP); - - return x->outer_mode->output2(x, skb); -} -EXPORT_SYMBOL(xfrm4_prepare_output); - int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); -- cgit From 303c5fab1272888b22088fbdd08cb770205ccb7a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:26 +0100 Subject: xfrm: remove xmit indirection from xfrm_mode There are only two versions (tunnel and transport). The ip/ipv6 versions are only differ in sizeof(iphdr) vs ipv6hdr. Place this in the core and use x->outer_mode->encap type to call the correct adjustment helper. Before: text data bss dec filename 15730311 6937008 4046908 26714227 vmlinux After: 15730428 6937008 4046908 26714344 vmlinux (about 117 byte increase) v2: use family from x->outer_mode, not inner Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_mode_transport.c | 14 -------------- net/ipv4/xfrm4_mode_tunnel.c | 13 ------------- 2 files changed, 27 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 6f8cf09ff0ef..d4b34bb2de00 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -30,22 +30,8 @@ static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, return segs; } -static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb) -{ - struct xfrm_offload *xo = xfrm_offload(skb); - - skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + sizeof(struct iphdr) + x->props.header_len); - - if (xo->flags & XFRM_GSO_SEGMENT) { - skb_reset_transport_header(skb); - skb->transport_header -= x->props.header_len; - } -} - static struct xfrm_mode xfrm4_transport_mode = { .gso_segment = xfrm4_transport_gso_segment, - .xmit = xfrm4_transport_xmit, .owner = THIS_MODULE, .encap = XFRM_MODE_TRANSPORT, .family = AF_INET, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 823bc54b47de..8bd5112b3ee3 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -109,23 +109,10 @@ static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x, return skb_mac_gso_segment(skb, features); } -static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) -{ - struct xfrm_offload *xo = xfrm_offload(skb); - - if (xo->flags & XFRM_GSO_SEGMENT) - skb->transport_header = skb->network_header + - sizeof(struct iphdr); - - skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + x->props.header_len); -} - static struct xfrm_mode xfrm4_tunnel_mode = { .input2 = xfrm4_mode_tunnel_input, .output2 = xfrm4_mode_tunnel_output, .gso_segment = xfrm4_mode_tunnel_gso_segment, - .xmit = xfrm4_mode_tunnel_xmit, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, -- cgit From 7613b92b1ae37141704948b77e8762c5de896510 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:27 +0100 Subject: xfrm: remove gso_segment indirection from xfrm_mode These functions are small and we only have versions for tunnel and transport mode for ipv4 and ipv6 respectively. Just place the 'transport or tunnel' conditional in the protocol specific function instead of using an indirection. Before: 3226 12 0 3238 net/ipv4/esp4_offload.o 7004 492 0 7496 net/ipv4/ip_vti.o 3339 12 0 3351 net/ipv6/esp6_offload.o 11294 460 0 11754 net/ipv6/ip6_vti.o 1180 72 0 1252 net/ipv4/xfrm4_mode_beet.o 428 48 0 476 net/ipv4/xfrm4_mode_transport.o 1271 48 0 1319 net/ipv4/xfrm4_mode_tunnel.o 1083 60 0 1143 net/ipv6/xfrm6_mode_beet.o 172 48 0 220 net/ipv6/xfrm6_mode_ro.o 429 48 0 477 net/ipv6/xfrm6_mode_transport.o 1164 48 0 1212 net/ipv6/xfrm6_mode_tunnel.o 15730428 6937008 4046908 26714344 vmlinux After: 3461 12 0 3473 net/ipv4/esp4_offload.o 7000 492 0 7492 net/ipv4/ip_vti.o 3574 12 0 3586 net/ipv6/esp6_offload.o 11295 460 0 11755 net/ipv6/ip6_vti.o 1180 64 0 1244 net/ipv4/xfrm4_mode_beet.o 171 40 0 211 net/ipv4/xfrm4_mode_transport.o 1163 40 0 1203 net/ipv4/xfrm4_mode_tunnel.o 1083 52 0 1135 net/ipv6/xfrm6_mode_beet.o 172 40 0 212 net/ipv6/xfrm6_mode_ro.o 172 40 0 212 net/ipv6/xfrm6_mode_transport.o 1056 40 0 1096 net/ipv6/xfrm6_mode_tunnel.o 15730424 6937008 4046908 26714340 vmlinux Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 40 +++++++++++++++++++++++++++++++++++++++- net/ipv4/xfrm4_mode_transport.c | 17 ----------------- net/ipv4/xfrm4_mode_tunnel.c | 9 --------- 3 files changed, 39 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index c6c84f2bc41c..74d59e0177a7 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -107,6 +107,44 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) xo->proto = proto; } +static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + __skb_push(skb, skb->mac_len); + return skb_mac_gso_segment(skb, features); +} + +static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct xfrm_offload *xo = xfrm_offload(skb); + + skb->transport_header += x->props.header_len; + ops = rcu_dereference(inet_offloads[xo->proto]); + if (likely(ops && ops->callbacks.gso_segment)) + segs = ops->callbacks.gso_segment(skb, features); + + return segs; +} + +static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + switch (x->outer_mode->encap) { + case XFRM_MODE_TUNNEL: + return xfrm4_tunnel_gso_segment(x, skb, features); + case XFRM_MODE_TRANSPORT: + return xfrm4_transport_gso_segment(x, skb, features); + } + + return ERR_PTR(-EOPNOTSUPP); +} + static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -147,7 +185,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, xo->flags |= XFRM_GSO_SEGMENT; - return x->outer_mode->gso_segment(x, skb, esp_features); + return xfrm4_outer_mode_gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index d4b34bb2de00..397863ea762b 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -14,24 +14,7 @@ #include #include -static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, - struct sk_buff *skb, - netdev_features_t features) -{ - const struct net_offload *ops; - struct sk_buff *segs = ERR_PTR(-EINVAL); - struct xfrm_offload *xo = xfrm_offload(skb); - - skb->transport_header += x->props.header_len; - ops = rcu_dereference(inet_offloads[xo->proto]); - if (likely(ops && ops->callbacks.gso_segment)) - segs = ops->callbacks.gso_segment(skb, features); - - return segs; -} - static struct xfrm_mode xfrm4_transport_mode = { - .gso_segment = xfrm4_transport_gso_segment, .owner = THIS_MODULE, .encap = XFRM_MODE_TRANSPORT, .family = AF_INET, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 8bd5112b3ee3..b5d4ba41758e 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -101,18 +101,9 @@ out: return err; } -static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x, - struct sk_buff *skb, - netdev_features_t features) -{ - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); -} - static struct xfrm_mode xfrm4_tunnel_mode = { .input2 = xfrm4_mode_tunnel_input, .output2 = xfrm4_mode_tunnel_output, - .gso_segment = xfrm4_mode_tunnel_gso_segment, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, -- cgit From b3284df1c86f7ac078dcb8fb250fe3d6437e740c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:28 +0100 Subject: xfrm: remove input2 indirection from xfrm_mode No external dependencies on any module, place this in the core. Increase is about 1800 byte for xfrm_input.o. The beet helpers get added to internal header, as they can be reused from xfrm_output.c in the next patch (kernel contains several copies of them in the xfrm{4,6}_mode_beet.c files). Before: text data bss dec filename 5578 176 2364 8118 net/xfrm/xfrm_input.o 1180 64 0 1244 net/ipv4/xfrm4_mode_beet.o 171 40 0 211 net/ipv4/xfrm4_mode_transport.o 1163 40 0 1203 net/ipv4/xfrm4_mode_tunnel.o 1083 52 0 1135 net/ipv6/xfrm6_mode_beet.o 172 40 0 212 net/ipv6/xfrm6_mode_ro.o 172 40 0 212 net/ipv6/xfrm6_mode_transport.o 1056 40 0 1096 net/ipv6/xfrm6_mode_tunnel.o After: text data bss dec filename 7373 200 2364 9937 net/xfrm/xfrm_input.o 587 44 0 631 net/ipv4/xfrm4_mode_beet.o 171 32 0 203 net/ipv4/xfrm4_mode_transport.o 649 32 0 681 net/ipv4/xfrm4_mode_tunnel.o 625 44 0 669 net/ipv6/xfrm6_mode_beet.o 172 32 0 204 net/ipv6/xfrm6_mode_ro.o 172 32 0 204 net/ipv6/xfrm6_mode_transport.o 599 32 0 631 net/ipv6/xfrm6_mode_tunnel.o v2: pass inner_mode to xfrm_inner_mode_encap_remove to fix AF_UNSPEC selector breakage (bisected by Benedict Wong) Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_mode_beet.c | 47 -------------------------------------------- net/ipv4/xfrm4_mode_tunnel.c | 39 ------------------------------------ 2 files changed, 86 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index f02cc8237d54..500960172933 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -80,54 +80,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) -{ - struct iphdr *iph; - int optlen = 0; - int err = -EINVAL; - - if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { - struct ip_beet_phdr *ph; - int phlen; - - if (!pskb_may_pull(skb, sizeof(*ph))) - goto out; - - ph = (struct ip_beet_phdr *)skb->data; - - phlen = sizeof(*ph) + ph->padlen; - optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); - if (optlen < 0 || optlen & 3 || optlen > 250) - goto out; - - XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; - - if (!pskb_may_pull(skb, phlen)) - goto out; - __skb_pull(skb, phlen); - } - - skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - skb_mac_header_rebuild(skb); - - xfrm4_beet_make_header(skb); - - iph = ip_hdr(skb); - - iph->ihl += optlen / 4; - iph->tot_len = htons(skb->len); - iph->daddr = x->sel.daddr.a4; - iph->saddr = x->sel.saddr.a4; - iph->check = 0; - iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); - err = 0; -out: - return err; -} - static struct xfrm_mode xfrm4_beet_mode = { - .input2 = xfrm4_beet_input, .output2 = xfrm4_beet_output, .owner = THIS_MODULE, .encap = XFRM_MODE_BEET, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index b5d4ba41758e..31645319aaeb 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -15,14 +15,6 @@ #include #include -static inline void ipip_ecn_decapsulate(struct sk_buff *skb) -{ - struct iphdr *inner_iph = ipip_hdr(skb); - - if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) - IP_ECN_set_ce(inner_iph); -} - /* Add encapsulation header. * * The top IP header will be constructed per RFC 2401. @@ -71,38 +63,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) -{ - int err = -EINVAL; - - if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) - goto out; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - - err = skb_unclone(skb, GFP_ATOMIC); - if (err) - goto out; - - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb)); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip_ecn_decapsulate(skb); - - skb_reset_network_header(skb); - skb_mac_header_rebuild(skb); - if (skb->mac_len) - eth_hdr(skb)->h_proto = skb->protocol; - - err = 0; - -out: - return err; -} - static struct xfrm_mode xfrm4_tunnel_mode = { - .input2 = xfrm4_mode_tunnel_input, .output2 = xfrm4_mode_tunnel_output, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, -- cgit From 1de70830066b72b6a8e259e5363f6c0bc4ba7bbc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:29 +0100 Subject: xfrm: remove output2 indirection from xfrm_mode similar to previous patch: no external module dependencies, so we can avoid the indirection by placing this in the core. This change removes the last indirection from xfrm_mode and the xfrm4|6_mode_{beet,tunnel}.c modules contain (almost) no code anymore. Before: text data bss dec hex filename 3957 136 0 4093 ffd net/xfrm/xfrm_output.o 587 44 0 631 277 net/ipv4/xfrm4_mode_beet.o 649 32 0 681 2a9 net/ipv4/xfrm4_mode_tunnel.o 625 44 0 669 29d net/ipv6/xfrm6_mode_beet.o 599 32 0 631 277 net/ipv6/xfrm6_mode_tunnel.o After: text data bss dec hex filename 5359 184 0 5543 15a7 net/xfrm/xfrm_output.o 171 24 0 195 c3 net/ipv4/xfrm4_mode_beet.o 171 24 0 195 c3 net/ipv4/xfrm4_mode_tunnel.o 172 24 0 196 c4 net/ipv6/xfrm6_mode_beet.o 172 24 0 196 c4 net/ipv6/xfrm6_mode_tunnel.o v2: fold the *encap_add functions into xfrm*_prepare_output preserve (move) output2 comment (Sabrina) use x->outer_mode->encap, not inner fix a build breakage on ppc (kbuild robot) Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_mode_beet.c | 63 -------------------------------------------- net/ipv4/xfrm4_mode_tunnel.c | 49 ---------------------------------- 2 files changed, 112 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 500960172933..ba84b278e627 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -17,71 +17,8 @@ #include #include -static void xfrm4_beet_make_header(struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - - iph->ihl = 5; - iph->version = 4; - - iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; - iph->tos = XFRM_MODE_SKB_CB(skb)->tos; - - iph->id = XFRM_MODE_SKB_CB(skb)->id; - iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off; - iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl; -} - -/* Add encapsulation header. - * - * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. - */ -static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct ip_beet_phdr *ph; - struct iphdr *top_iph; - int hdrlen, optlen; - - hdrlen = 0; - optlen = XFRM_MODE_SKB_CB(skb)->optlen; - if (unlikely(optlen)) - hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); - - skb_set_network_header(skb, -x->props.header_len - - hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); - if (x->sel.family != AF_INET6) - skb->network_header += IPV4_BEET_PHMAXLEN; - skb->mac_header = skb->network_header + - offsetof(struct iphdr, protocol); - skb->transport_header = skb->network_header + sizeof(*top_iph); - - xfrm4_beet_make_header(skb); - - ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen); - - top_iph = ip_hdr(skb); - - if (unlikely(optlen)) { - BUG_ON(optlen < 0); - - ph->padlen = 4 - (optlen & 4); - ph->hdrlen = optlen / 8; - ph->nexthdr = top_iph->protocol; - if (ph->padlen) - memset(ph + 1, IPOPT_NOP, ph->padlen); - - top_iph->protocol = IPPROTO_BEETPH; - top_iph->ihl = sizeof(struct iphdr) / 4; - } - - top_iph->saddr = x->props.saddr.a4; - top_iph->daddr = x->id.daddr.a4; - - return 0; -} static struct xfrm_mode xfrm4_beet_mode = { - .output2 = xfrm4_beet_output, .owner = THIS_MODULE, .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 31645319aaeb..b2b132c800fc 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -15,56 +15,7 @@ #include #include -/* Add encapsulation header. - * - * The top IP header will be constructed per RFC 2401. - */ -static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct iphdr *top_iph; - int flags; - - skb_set_inner_network_header(skb, skb_network_offset(skb)); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); - - skb_set_network_header(skb, -x->props.header_len); - skb->mac_header = skb->network_header + - offsetof(struct iphdr, protocol); - skb->transport_header = skb->network_header + sizeof(*top_iph); - top_iph = ip_hdr(skb); - - top_iph->ihl = 5; - top_iph->version = 4; - - top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family); - - /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */ - if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) - top_iph->tos = 0; - else - top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos; - top_iph->tos = INET_ECN_encapsulate(top_iph->tos, - XFRM_MODE_SKB_CB(skb)->tos); - - flags = x->props.flags; - if (flags & XFRM_STATE_NOECN) - IP_ECN_clear(top_iph); - - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? - 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - - top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); - - top_iph->saddr = x->props.saddr.a4; - top_iph->daddr = x->id.daddr.a4; - ip_select_ident(dev_net(dst->dev), skb, NULL); - - return 0; -} - static struct xfrm_mode xfrm4_tunnel_mode = { - .output2 = xfrm4_mode_tunnel_output, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, -- cgit From 733a5fac2f15b55b9059230d098ed04341d2d884 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:30 +0100 Subject: xfrm: remove afinfo pointer from xfrm_mode Adds an EXPORT_SYMBOL for afinfo_get_rcu, as it will now be called from ipv6 in case of CONFIG_IPV6=m. This change has virtually no effect on vmlinux size, but it reduces afinfo size and allows followup patch to make xfrm modes const. v2: mark if (afinfo) tests as likely (Sabrina) re-fetch afinfo according to inner_mode in xfrm_prepare_input(). Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_output.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 6802d1aee424..7c3df14daef3 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -72,6 +72,8 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; + const struct xfrm_state_afinfo *afinfo; + int ret = -EAFNOSUPPORT; #ifdef CONFIG_NETFILTER if (!x) { @@ -80,7 +82,15 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) } #endif - return x->outer_mode->afinfo->output_finish(sk, skb); + rcu_read_lock(); + afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode->family); + if (likely(afinfo)) + ret = afinfo->output_finish(sk, skb); + else + kfree_skb(skb); + rcu_read_unlock(); + + return ret; } int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) -- cgit From 4c145dce26013763490df88f2473714f5bc7857d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:31 +0100 Subject: xfrm: make xfrm modes builtin after previous changes, xfrm_mode contains no function pointers anymore and all modules defining such struct contain no code except an init/exit functions to register the xfrm_mode struct with the xfrm core. Just place the xfrm modes core and remove the modules, the run-time xfrm_mode register/unregister functionality is removed. Before: text data bss dec filename 7523 200 2364 10087 net/xfrm/xfrm_input.o 40003 628 440 41071 net/xfrm/xfrm_state.o 15730338 6937080 4046908 26714326 vmlinux 7389 200 2364 9953 net/xfrm/xfrm_input.o 40574 656 440 41670 net/xfrm/xfrm_state.o 15730084 6937068 4046908 26714060 vmlinux The xfrm*_mode_{transport,tunnel,beet} modules are gone. v2: replace CONFIG_INET6_XFRM_MODE_* IS_ENABLED guards with CONFIG_IPV6 ones rather than removing them. Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/Kconfig | 29 +---------------------------- net/ipv4/Makefile | 3 --- net/ipv4/ip_vti.c | 2 +- net/ipv4/xfrm4_mode_beet.c | 41 ----------------------------------------- net/ipv4/xfrm4_mode_transport.c | 36 ------------------------------------ net/ipv4/xfrm4_mode_tunnel.c | 38 -------------------------------------- 6 files changed, 2 insertions(+), 147 deletions(-) delete mode 100644 net/ipv4/xfrm4_mode_beet.c delete mode 100644 net/ipv4/xfrm4_mode_transport.c delete mode 100644 net/ipv4/xfrm4_mode_tunnel.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 32cae39cdff6..8108e97d4285 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -304,7 +304,7 @@ config NET_IPVTI tristate "Virtual (secure) IP: tunneling" select INET_TUNNEL select NET_IP_TUNNEL - depends on INET_XFRM_MODE_TUNNEL + select XFRM ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -396,33 +396,6 @@ config INET_TUNNEL tristate default n -config INET_XFRM_MODE_TRANSPORT - tristate "IP: IPsec transport mode" - default y - select XFRM - ---help--- - Support for IPsec transport mode. - - If unsure, say Y. - -config INET_XFRM_MODE_TUNNEL - tristate "IP: IPsec tunnel mode" - default y - select XFRM - ---help--- - Support for IPsec tunnel mode. - - If unsure, say Y. - -config INET_XFRM_MODE_BEET - tristate "IP: IPsec BEET mode" - default y - select XFRM - ---help--- - Support for IPsec BEET mode. - - If unsure, say Y. - config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 58629314eae9..000a61994c8f 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -37,10 +37,7 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o -obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o -obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o -obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ obj-$(CONFIG_INET_DIAG) += inet_diag.o diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 3f3f6d6be318..91926c9a3bc9 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -107,7 +107,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; - struct xfrm_mode *inner_mode; + const struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c deleted file mode 100644 index ba84b278e627..000000000000 --- a/net/ipv4/xfrm4_mode_beet.c +++ /dev/null @@ -1,41 +0,0 @@ -/* - * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4. - * - * Copyright (c) 2006 Diego Beltrami - * Miika Komu - * Herbert Xu - * Abhinav Pathak - * Jeff Ahrenholz - */ - -#include -#include -#include -#include -#include -#include -#include -#include - - -static struct xfrm_mode xfrm4_beet_mode = { - .owner = THIS_MODULE, - .encap = XFRM_MODE_BEET, - .flags = XFRM_MODE_FLAG_TUNNEL, - .family = AF_INET, -}; - -static int __init xfrm4_beet_init(void) -{ - return xfrm_register_mode(&xfrm4_beet_mode); -} - -static void __exit xfrm4_beet_exit(void) -{ - xfrm_unregister_mode(&xfrm4_beet_mode); -} - -module_init(xfrm4_beet_init); -module_exit(xfrm4_beet_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET); diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c deleted file mode 100644 index 397863ea762b..000000000000 --- a/net/ipv4/xfrm4_mode_transport.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. - * - * Copyright (c) 2004-2006 Herbert Xu - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct xfrm_mode xfrm4_transport_mode = { - .owner = THIS_MODULE, - .encap = XFRM_MODE_TRANSPORT, - .family = AF_INET, -}; - -static int __init xfrm4_transport_init(void) -{ - return xfrm_register_mode(&xfrm4_transport_mode); -} - -static void __exit xfrm4_transport_exit(void) -{ - xfrm_unregister_mode(&xfrm4_transport_mode); -} - -module_init(xfrm4_transport_init); -module_exit(xfrm4_transport_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c deleted file mode 100644 index b2b132c800fc..000000000000 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ /dev/null @@ -1,38 +0,0 @@ -/* - * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. - * - * Copyright (c) 2004-2006 Herbert Xu - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct xfrm_mode xfrm4_tunnel_mode = { - .owner = THIS_MODULE, - .encap = XFRM_MODE_TUNNEL, - .flags = XFRM_MODE_FLAG_TUNNEL, - .family = AF_INET, -}; - -static int __init xfrm4_mode_tunnel_init(void) -{ - return xfrm_register_mode(&xfrm4_tunnel_mode); -} - -static void __exit xfrm4_mode_tunnel_exit(void) -{ - xfrm_unregister_mode(&xfrm4_tunnel_mode); -} - -module_init(xfrm4_mode_tunnel_init); -module_exit(xfrm4_mode_tunnel_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); -- cgit From c9500d7b7de8ff6ac88ee3e38b782889f1616593 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:32 +0100 Subject: xfrm: store xfrm_mode directly, not its address This structure is now only 4 bytes, so its more efficient to cache a copy rather than its address. No significant size difference in allmodconfig vmlinux. With non-modular kernel that has all XFRM options enabled, this series reduces vmlinux image size by ~11kb. All xfrm_mode indirections are gone and all modes are built-in. before (ipsec-next master): text data bss dec filename 21071494 7233140 11104324 39408958 vmlinux.master after this series: 21066448 7226772 11104324 39397544 vmlinux.patched With allmodconfig kernel, the size increase is only 362 bytes, even all the xfrm config options removed in this series are modular. before: text data bss dec filename 15731286 6936912 4046908 26715106 vmlinux.master after this series: 15731492 6937068 4046908 26715468 vmlinux Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 2 +- net/ipv4/ip_vti.c | 2 +- net/ipv4/xfrm4_output.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 74d59e0177a7..b61a8ff558f9 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -135,7 +135,7 @@ static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - switch (x->outer_mode->encap) { + switch (x->outer_mode.encap) { case XFRM_MODE_TUNNEL: return xfrm4_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 91926c9a3bc9..cc5d9c0a8a10 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -126,7 +126,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); - inner_mode = x->inner_mode; + inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 7c3df14daef3..9bb8905088c7 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -83,7 +83,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) #endif rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode->family); + afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); if (likely(afinfo)) ret = afinfo->output_finish(sk, skb); else -- cgit From fd69c399c7d6262086b6b820757c6aeaa71feeba Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 8 Apr 2019 10:15:59 +0200 Subject: datagram: remove rendundant 'peeked' argument After commit a297569fe00a ("net/udp: do not touch skb->peeked unless really needed") the 'peeked' argument of __skb_try_recv_datagram() and friends is always equal to !!'flags & MSG_PEEK'. Since such argument is really a boolean info, and the callers have already 'flags & MSG_PEEK' handy, we can remove it and clean-up the code a bit. Signed-off-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 372fdc5381a9..3c58ba02af7d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1631,7 +1631,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) EXPORT_SYMBOL(udp_ioctl); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, - int noblock, int *peeked, int *off, int *err) + int noblock, int *off, int *err) { struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff_head *queue; @@ -1650,13 +1650,11 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, break; error = -EAGAIN; - *peeked = 0; do { spin_lock_bh(&queue->lock); skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_destructor, - peeked, off, err, - &last); + off, err, &last); if (skb) { spin_unlock_bh(&queue->lock); return skb; @@ -1677,8 +1675,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_dtor_locked, - peeked, off, err, - &last); + off, err, &last); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); if (skb) @@ -1713,8 +1710,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; - int peeked, peeking, off; - int err; + int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; @@ -1722,9 +1718,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: - peeking = flags & MSG_PEEK; off = sk_peek_offset(sk, flags); - skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); + skb = __skb_recv_udp(sk, flags, noblock, &off, &err); if (!skb) return err; @@ -1762,7 +1757,7 @@ try_again: } if (unlikely(err)) { - if (!peeked) { + if (!peeking) { atomic_inc(&sk->sk_drops); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); @@ -1771,7 +1766,7 @@ try_again: return err; } - if (!peeked) + if (!peeking) UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); -- cgit From c1deb065cf3b5bcd483e3f03479f930edb151b99 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 27 Mar 2019 09:22:25 +0100 Subject: netfilter: nf_tables: merge route type into core very little code, so it really doesn't make sense to have extra modules or even a kconfig knob for this. Merge them and make functionality available unconditionally. The merge makes inet family route support trivial, so add it as well here. Before: text data bss dec hex filename 835 832 0 1667 683 nft_chain_route_ipv4.ko 870 832 0 1702 6a6 nft_chain_route_ipv6.ko 111568 2556 529 114653 1bfdd nf_tables.ko After: text data bss dec hex filename 113133 2556 529 116218 1c5fa nf_tables.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 8 --- net/ipv4/netfilter/Makefile | 1 - net/ipv4/netfilter/nft_chain_route_ipv4.c | 89 ------------------------------- 3 files changed, 98 deletions(-) delete mode 100644 net/ipv4/netfilter/nft_chain_route_ipv4.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index c98391d49200..ea688832fc4e 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -27,14 +27,6 @@ config NF_TABLES_IPV4 if NF_TABLES_IPV4 -config NFT_CHAIN_ROUTE_IPV4 - tristate "IPv4 nf_tables route chain support" - help - This option enables the "route" chain for IPv4 in nf_tables. This - chain type is used to force packet re-routing after mangling header - fields such as the source, destination, type of service and - the packet mark. - config NFT_REJECT_IPV4 select NF_REJECT_IPV4 default NFT_REJECT diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index e241f5188ebe..2cfdda7b109f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -24,7 +24,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c deleted file mode 100644 index 7d82934c46f4..000000000000 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy - * Copyright (c) 2012 Pablo Neira Ayuso - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned int nf_route_table_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - unsigned int ret; - struct nft_pktinfo pkt; - u32 mark; - __be32 saddr, daddr; - u_int8_t tos; - const struct iphdr *iph; - int err; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); - - mark = skb->mark; - iph = ip_hdr(skb); - saddr = iph->saddr; - daddr = iph->daddr; - tos = iph->tos; - - ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_STOLEN) { - iph = ip_hdr(skb); - - if (iph->saddr != saddr || - iph->daddr != daddr || - skb->mark != mark || - iph->tos != tos) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } - return ret; -} - -static const struct nft_chain_type nft_chain_route_ipv4 = { - .name = "route", - .type = NFT_CHAIN_T_ROUTE, - .family = NFPROTO_IPV4, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_OUT), - .hooks = { - [NF_INET_LOCAL_OUT] = nf_route_table_hook, - }, -}; - -static int __init nft_chain_route_init(void) -{ - nft_register_chain_type(&nft_chain_route_ipv4); - - return 0; -} - -static void __exit nft_chain_route_exit(void) -{ - nft_unregister_chain_type(&nft_chain_route_ipv4); -} - -module_init(nft_chain_route_init); -module_exit(nft_chain_route_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_CHAIN(AF_INET, "route"); -- cgit From bdf004677107e3b847c5db09c9fbf8edefa24996 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:26 -0700 Subject: net: Replace nhc_has_gw with nhc_gw_family Allow the gateway in a fib_nh_common to be from a different address family than the outer fib{6}_nh. To that end, replace nhc_has_gw with nhc_gw_family and update users of nhc_has_gw to check nhc_gw_family. Now nhc_family is used to know if the nh_common is part of a fib_nh or fib6_nh (used for container_of to get to route family specific data), and nhc_gw_family represents the address family for the gateway. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 25 +++++++++++-------------- net/ipv4/route.c | 5 +++-- 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8e0cb1687a74..e11f78c6373f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -513,7 +513,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, nh->fib_nh_oif = cfg->fc_oif; if (cfg->fc_gw) { nh->fib_nh_gw4 = cfg->fc_gw; - nh->fib_nh_has_gw = 1; + nh->fib_nh_gw_family = AF_INET; } nh->fib_nh_flags = cfg->fc_flags; @@ -1238,7 +1238,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, "Route with host scope can not have multiple nexthops"); goto err_inval; } - if (nh->fib_nh_gw4) { + if (nh->fib_nh_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); goto err_inval; @@ -1341,18 +1341,15 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, rcu_read_unlock(); } - if (nhc->nhc_has_gw) { - switch (nhc->nhc_family) { - case AF_INET: - if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) - goto nla_put_failure; - break; - case AF_INET6: - if (nla_put_in6_addr(skb, RTA_GATEWAY, - &nhc->nhc_gw.ipv6) < 0) - goto nla_put_failure; - break; - } + switch (nhc->nhc_gw_family) { + case AF_INET: + if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) + goto nla_put_failure; + break; + case AF_INET6: + if (nla_put_in6_addr(skb, RTA_GATEWAY, &nhc->nhc_gw.ipv6) < 0) + goto nla_put_failure; + break; } *flags |= (nhc->nhc_flags & RTNH_F_ONLINK); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f3f2adf630d4..e7338e421796 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1734,8 +1734,9 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && skb->protocol == htons(ETH_P_IP)) { - __be32 gw = nhc->nhc_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; + __be32 gw; + gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; if (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, gw)) IPCB(skb)->flags |= IPSKB_DOREDIRECT; @@ -2284,7 +2285,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && - !(nhc->nhc_has_gw && + !(nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; -- cgit From 1550c171935d264f522581fd037db5e64a716bb6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:27 -0700 Subject: ipv4: Prepare rtable for IPv6 gateway To allow the gateway to be either an IPv4 or IPv6 address, remove rt_uses_gateway from rtable and replace with rt_gw_family. If rt_gw_family is set it implies rt_uses_gateway. Rename rt_gateway to rt_gw4 to represent the IPv4 version. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/route.c | 51 ++++++++++++++++++++++------------------- net/ipv4/xfrm4_policy.c | 5 ++-- 5 files changed, 34 insertions(+), 30 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6ea523d71947..a175e3e7ae97 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -564,7 +564,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gw_family) goto route_err; rcu_read_unlock(); return &rt->dst; @@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gw_family) goto route_err; return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 00ec819f949b..06f6f280b9ff 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_uses_gateway) + if (opt->is_strictroute && rt->rt_gw_family) goto sr_failed; IPCB(skb)->flags |= IPSKB_FORWARDED; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 10b35328cfbc..a2bd4a6d9e6b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -472,7 +472,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e7338e421796..b77b4950d0c7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -434,14 +434,13 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; const __be32 *pkey = daddr; - const struct rtable *rt; struct neighbour *n; - rt = (const struct rtable *) dst; - if (rt->rt_gateway) - pkey = (const __be32 *) &rt->rt_gateway; + if (rt->rt_gw_family == AF_INET) + pkey = (const __be32 *) &rt->rt_gw4; else if (skb) pkey = &ip_hdr(skb)->daddr; @@ -453,13 +452,12 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; const __be32 *pkey = daddr; - const struct rtable *rt; - rt = (const struct rtable *)dst; - if (rt->rt_gateway) - pkey = (const __be32 *)&rt->rt_gateway; + if (rt->rt_gw_family == AF_INET) + pkey = (const __be32 *)&rt->rt_gw4; else if (!daddr || (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) @@ -629,8 +627,8 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = fnhe->fnhe_gw; - rt->rt_uses_gateway = 1; + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = fnhe->fnhe_gw; } } @@ -747,7 +745,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow return; } - if (rt->rt_gateway != old_gw) + if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); @@ -1282,7 +1280,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_uses_gateway && mtu > 576) + if (rt->rt_gw_family && mtu > 576) mtu = 576; } @@ -1410,8 +1408,10 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, orig = NULL; } fill_route_from_fnhe(rt, fnhe); - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw4 = daddr; + rt->rt_gw_family = AF_INET; + } if (do_cache) { dst_hold(&rt->dst); @@ -1538,8 +1538,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) { - rt->rt_gateway = nh->fib_nh_gw4; - rt->rt_uses_gateway = 1; + rt->rt_gw4 = nh->fib_nh_gw4; + rt->rt_gw_family = AF_INET; } ip_dst_init_metrics(&rt->dst, fi->fib_metrics); @@ -1557,8 +1557,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = daddr; + } rt_add_uncached_list(rt); } } else @@ -1591,8 +1593,8 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; - rt->rt_gateway = 0; - rt->rt_uses_gateway = 0; + rt->rt_gw_family = 0; + rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; @@ -2595,8 +2597,9 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; - rt->rt_gateway = ort->rt_gateway; - rt->rt_uses_gateway = ort->rt_uses_gateway; + rt->rt_gw_family = ort->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + rt->rt_gw4 = ort->rt_gw4; INIT_LIST_HEAD(&rt->rt_uncached); } @@ -2675,8 +2678,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_uses_gateway && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway)) + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) goto nla_put_failure; expires = rt->dst.expires; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index d73a6d6652f6..ee53a91526e5 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -97,8 +97,9 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; - xdst->u.rt.rt_gateway = rt->rt_gateway; - xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; + xdst->u.rt.rt_gw_family = rt->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + xdst->u.rt.rt_gw4 = rt->rt_gw4; xdst->u.rt.rt_pmtu = rt->rt_pmtu; xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); -- cgit From f35b794b3b405e2478654ea875bc0b29fe1a1bc5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:28 -0700 Subject: ipv4: Prepare fib_config for IPv6 gateway Similar to rtable, fib_config needs to allow the gateway to be either an IPv4 or an IPv6 address. To that end, rename fc_gw to fc_gw4 to mean an IPv4 address and add fc_gw_family. Checks on 'is a gateway set' are changed to see if fc_gw_family is set. In the process prepare the code for a fc_gw_family == AF_INET6. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 8 +++++--- net/ipv4/fib_semantics.c | 40 ++++++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 15f779bd26b3..f99a2ec32505 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -558,7 +558,8 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; - cfg->fc_gw = addr; + cfg->fc_gw4 = addr; + cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) @@ -568,7 +569,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (cmd == SIOCDELRT) return 0; - if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw) + if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) @@ -708,7 +709,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: - cfg->fc_gw = nla_get_be32(attr); + cfg->fc_gw_family = AF_INET; + cfg->fc_gw4 = nla_get_be32(attr); break; case RTA_VIA: NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute"); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e11f78c6373f..d3e26e55f2e1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -511,8 +511,8 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, goto init_failure; nh->fib_nh_oif = cfg->fc_oif; - if (cfg->fc_gw) { - nh->fib_nh_gw4 = cfg->fc_gw; + if (cfg->fc_gw_family == AF_INET) { + nh->fib_nh_gw4 = cfg->fc_gw4; nh->fib_nh_gw_family = AF_INET; } nh->fib_nh_flags = cfg->fc_flags; @@ -589,8 +589,10 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla) - fib_cfg.fc_gw = nla_get_in_addr(nla); + if (nla) { + fib_cfg.fc_gw_family = AF_INET; + fib_cfg.fc_gw4 = nla_get_in_addr(nla); + } nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) @@ -616,10 +618,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, "Nexthop device index does not match RTA_OIF"); goto errout; } - if (cfg->fc_gw && fi->fib_nh->fib_nh_gw4 != cfg->fc_gw) { - NL_SET_ERR_MSG(extack, - "Nexthop gateway does not match RTA_GATEWAY"); - goto errout; + if (cfg->fc_gw_family) { + if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family || + (cfg->fc_gw_family == AF_INET && + fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4)) { + NL_SET_ERR_MSG(extack, + "Nexthop gateway does not match RTA_GATEWAY"); + goto errout; + } } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { @@ -719,7 +725,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; - if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_oif || cfg->fc_gw_family) { if (cfg->fc_encap) { if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, fi->fib_nh, cfg, extack)) @@ -730,10 +736,16 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, cfg->fc_flow != fi->fib_nh->nh_tclassid) return 1; #endif - if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->fib_nh_oif) && - (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->fib_nh_gw4)) - return 0; - return 1; + if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) || + (cfg->fc_gw_family && + cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family)) + return 1; + + if (cfg->fc_gw_family == AF_INET && + cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4) + return 1; + + return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1204,7 +1216,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; if (fib_props[cfg->fc_type].error) { - if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) { + if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Gateway, device and multipath can not be specified for this route type"); goto err_inval; -- cgit From 0f5f7d7bf6e6bda4dffe7b42812a16ada6ea9816 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:29 -0700 Subject: ipv4: Add support to rtable for ipv6 gateway Add support for an IPv6 gateway to rtable. Since a gateway is either IPv4 or IPv6, make it a union with rt_gw4 where rt_gw_family decides which address is in use. When dumping the route data, encode an ipv6 nexthop using RTA_VIA. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/route.c | 31 ++++++++++++++++++++++++++----- net/ipv4/xfrm4_policy.c | 2 ++ 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b77b4950d0c7..6e58acf0a87b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1535,14 +1535,20 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); - struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); + struct fib_nh *nh; - if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) { - rt->rt_gw4 = nh->fib_nh_gw4; - rt->rt_gw_family = AF_INET; + if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { + rt->rt_gw_family = nhc->nhc_gw_family; + /* only INET and INET6 are supported */ + if (likely(nhc->nhc_gw_family == AF_INET)) + rt->rt_gw4 = nhc->nhc_gw.ipv4; + else + rt->rt_gw6 = nhc->nhc_gw.ipv6; } + ip_dst_init_metrics(&rt->dst, fi->fib_metrics); + nh = container_of(nhc, struct fib_nh, nh_common); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif @@ -2600,6 +2606,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + rt->rt_gw6 = ort->rt_gw6; INIT_LIST_HEAD(&rt->rt_uncached); } @@ -2679,8 +2687,21 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, goto nla_put_failure; } if (rt->rt_gw_family == AF_INET && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } expires = rt->dst.expires; if (expires) { diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index ee53a91526e5..72d19b1838ed 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -100,6 +100,8 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) xdst->u.rt.rt_gw4 = rt->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + xdst->u.rt.rt_gw6 = rt->rt_gw6; xdst->u.rt.rt_pmtu = rt->rt_pmtu; xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); -- cgit From a4ea5d43c807be28545625c1e0641905022fa0d1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:30 -0700 Subject: ipv4: Add support to fib_config for IPv6 gateway Add support for an IPv6 gateway to fib_config. Since a gateway is either IPv4 or IPv6, make it a union with fc_gw4 where fc_gw_family decides which address is in use. Update current checks on family and gw4 to handle ipv6 as well. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d3e26e55f2e1..680b5a9a911a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -276,7 +276,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) for_nexthops(fi) { if (nh->fib_nh_oif != onh->fib_nh_oif || - nh->fib_nh_gw4 != onh->fib_nh_gw4 || + nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight != onh->fib_nh_weight || @@ -287,6 +287,15 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) || ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK)) return -1; + + if (nh->fib_nh_gw_family == AF_INET && + nh->fib_nh_gw4 != onh->fib_nh_gw4) + return -1; + + if (nh->fib_nh_gw_family == AF_INET6 && + ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) + return -1; + onh++; } endfor_nexthops(fi); return 0; @@ -511,10 +520,12 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, goto init_failure; nh->fib_nh_oif = cfg->fc_oif; - if (cfg->fc_gw_family == AF_INET) { + nh->fib_nh_gw_family = cfg->fc_gw_family; + if (cfg->fc_gw_family == AF_INET) nh->fib_nh_gw4 = cfg->fc_gw4; - nh->fib_nh_gw_family = AF_INET; - } + else if (cfg->fc_gw_family == AF_INET6) + nh->fib_nh_gw6 = cfg->fc_gw6; + nh->fib_nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID @@ -621,9 +632,11 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (cfg->fc_gw_family) { if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family || (cfg->fc_gw_family == AF_INET && - fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4)) { + fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) || + (cfg->fc_gw_family == AF_INET6 && + ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) { NL_SET_ERR_MSG(extack, - "Nexthop gateway does not match RTA_GATEWAY"); + "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); goto errout; } } @@ -745,6 +758,10 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4) return 1; + if (cfg->fc_gw_family == AF_INET6 && + ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6)) + return 1; + return 0; } -- cgit From 448d7248191706cbbd7761e3bc72c2985c4d38a7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:31 -0700 Subject: ipv4: Refactor fib_check_nh fib_check_nh is currently huge covering multiple uses cases - device only, device + gateway, and device + gateway with ONLINK. The next patch adds validation checks for IPv6 which only further complicates it. So, break fib_check_nh into 2 helpers - one for gateway validation and one for device only. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 234 +++++++++++++++++++++++++---------------------- 1 file changed, 125 insertions(+), 109 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 680b5a9a911a..32ce6e6202d2 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -885,134 +885,150 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, - struct netlink_ext_ack *extack) +static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, + u8 scope, struct netlink_ext_ack *extack) { - int err = 0; - struct net *net; struct net_device *dev; + struct fib_result res; + int err; - net = cfg->fc_nlinfo.nl_net; - if (nh->fib_nh_gw4) { - struct fib_result res; - - if (nh->fib_nh_flags & RTNH_F_ONLINK) { - unsigned int addr_type; + if (nh->fib_nh_flags & RTNH_F_ONLINK) { + unsigned int addr_type; - if (cfg->fc_scope >= RT_SCOPE_LINK) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid scope"); - return -EINVAL; - } - dev = __dev_get_by_index(net, nh->fib_nh_oif); - if (!dev) { - NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); - return -ENODEV; - } - if (!(dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, - "Nexthop device is not up"); - return -ENETDOWN; - } - addr_type = inet_addr_type_dev_table(net, dev, - nh->fib_nh_gw4); - if (addr_type != RTN_UNICAST) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway"); - return -EINVAL; - } - if (!netif_carrier_ok(dev)) - nh->fib_nh_flags |= RTNH_F_LINKDOWN; - nh->fib_nh_dev = dev; - dev_hold(dev); - nh->fib_nh_scope = RT_SCOPE_LINK; - return 0; + if (scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid scope"); + return -EINVAL; } - rcu_read_lock(); - { - struct fib_table *tbl = NULL; - struct flowi4 fl4 = { - .daddr = nh->fib_nh_gw4, - .flowi4_scope = cfg->fc_scope + 1, - .flowi4_oif = nh->fib_nh_oif, - .flowi4_iif = LOOPBACK_IFINDEX, - }; - - /* It is not necessary, but requires a bit of thinking */ - if (fl4.flowi4_scope < RT_SCOPE_LINK) - fl4.flowi4_scope = RT_SCOPE_LINK; - - if (cfg->fc_table) - tbl = fib_get_table(net, cfg->fc_table); - - if (tbl) - err = fib_table_lookup(tbl, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE | - FIB_LOOKUP_NOREF); - - /* on error or if no table given do full lookup. This - * is needed for example when nexthops are in the local - * table rather than the given table - */ - if (!tbl || err) { - err = fib_lookup(net, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE); - } - - if (err) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway"); - rcu_read_unlock(); - return err; - } + dev = __dev_get_by_index(net, nh->fib_nh_oif); + if (!dev) { + NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); + return -ENODEV; } - err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { - NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); - goto out; + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + return -ENETDOWN; } - nh->fib_nh_scope = res.scope; - nh->fib_nh_oif = FIB_RES_OIF(res); - nh->fib_nh_dev = dev = FIB_RES_DEV(res); - if (!dev) { - NL_SET_ERR_MSG(extack, - "No egress device for nexthop gateway"); - goto out; + addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4); + if (addr_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + return -EINVAL; } - dev_hold(dev); if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; - err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; - } else { - struct in_device *in_dev; - - if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { - NL_SET_ERR_MSG(extack, - "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); - return -EINVAL; + nh->fib_nh_dev = dev; + dev_hold(dev); + nh->fib_nh_scope = RT_SCOPE_LINK; + return 0; + } + rcu_read_lock(); + { + struct fib_table *tbl = NULL; + struct flowi4 fl4 = { + .daddr = nh->fib_nh_gw4, + .flowi4_scope = scope + 1, + .flowi4_oif = nh->fib_nh_oif, + .flowi4_iif = LOOPBACK_IFINDEX, + }; + + /* It is not necessary, but requires a bit of thinking */ + if (fl4.flowi4_scope < RT_SCOPE_LINK) + fl4.flowi4_scope = RT_SCOPE_LINK; + + if (table) + tbl = fib_get_table(net, table); + + if (tbl) + err = fib_table_lookup(tbl, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE | + FIB_LOOKUP_NOREF); + + /* on error or if no table given do full lookup. This + * is needed for example when nexthops are in the local + * table rather than the given table + */ + if (!tbl || err) { + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); } - rcu_read_lock(); - err = -ENODEV; - in_dev = inetdev_by_index(net, nh->fib_nh_oif); - if (!in_dev) - goto out; - err = -ENETDOWN; - if (!(in_dev->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); + + if (err) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } - nh->fib_nh_dev = in_dev->dev; - dev_hold(nh->fib_nh_dev); - nh->fib_nh_scope = RT_SCOPE_HOST; - if (!netif_carrier_ok(nh->fib_nh_dev)) - nh->fib_nh_flags |= RTNH_F_LINKDOWN; - err = 0; } + + err = -EINVAL; + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + goto out; + } + nh->fib_nh_scope = res.scope; + nh->fib_nh_oif = FIB_RES_OIF(res); + nh->fib_nh_dev = dev = FIB_RES_DEV(res); + if (!dev) { + NL_SET_ERR_MSG(extack, + "No egress device for nexthop gateway"); + goto out; + } + dev_hold(dev); + if (!netif_carrier_ok(dev)) + nh->fib_nh_flags |= RTNH_F_LINKDOWN; + err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; out: rcu_read_unlock(); return err; } +static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, + struct netlink_ext_ack *extack) +{ + struct in_device *in_dev; + int err; + + if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); + return -EINVAL; + } + + rcu_read_lock(); + + err = -ENODEV; + in_dev = inetdev_by_index(net, nh->fib_nh_oif); + if (!in_dev) + goto out; + err = -ENETDOWN; + if (!(in_dev->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); + goto out; + } + + nh->fib_nh_dev = in_dev->dev; + dev_hold(nh->fib_nh_dev); + nh->fib_nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->fib_nh_dev)) + nh->fib_nh_flags |= RTNH_F_LINKDOWN; + err = 0; +out: + rcu_read_unlock(); + return err; +} + +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + u32 table = cfg->fc_table; + int err; + + if (nh->fib_nh_gw_family == AF_INET) + err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack); + else + err = fib_check_nh_nongw(net, nh, extack); + + return err; +} + static inline unsigned int fib_laddr_hashfn(__be32 val) { unsigned int mask = (fib_info_hash_size - 1); -- cgit From 717a8f5b2923c44da9157e145c294c4343a5f6de Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:32 -0700 Subject: ipv4: Add fib_check_nh_v6_gw Add helper to use fib6_nh_init to validate a nexthop spec with an IPv6 gateway. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 32ce6e6202d2..dd95725c318e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -841,6 +842,30 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) return true; } +static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh, + u32 table, struct netlink_ext_ack *extack) +{ + struct fib6_config cfg = { + .fc_table = table, + .fc_flags = nh->fib_nh_flags | RTF_GATEWAY, + .fc_ifindex = nh->fib_nh_oif, + .fc_gateway = nh->fib_nh_gw6, + }; + struct fib6_nh fib6_nh = {}; + int err; + + err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack); + if (!err) { + nh->fib_nh_dev = fib6_nh.fib_nh_dev; + dev_hold(nh->fib_nh_dev); + nh->fib_nh_oif = nh->fib_nh_dev->ifindex; + nh->fib_nh_scope = RT_SCOPE_LINK; + + ipv6_stub->fib6_nh_release(&fib6_nh); + } + + return err; +} /* * Picture @@ -1023,6 +1048,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, if (nh->fib_nh_gw_family == AF_INET) err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack); + else if (nh->fib_nh_gw_family == AF_INET6) + err = fib_check_nh_v6_gw(net, nh, table, extack); else err = fib_check_nh_nongw(net, nh, extack); -- cgit From 0353f28231c79416191326810e7fe656b69c63b7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:33 -0700 Subject: neighbor: Add skip_cache argument to neigh_output A later patch allows an IPv6 gateway with an IPv4 route. The neighbor entry will exist in the v6 ndisc table and the cached header will contain the ipv6 protocol which is wrong for an IPv4 packet. For an IPv4 packet to use the v6 neighbor entry, neigh_output needs to skip the cached header and just use the output callback for the neigh entry. A future patchset can look at expanding the hh_cache to handle 2 protocols. For now, IPv6 gateways with an IPv4 route will take the extra overhead of generating the header. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a2bd4a6d9e6b..cca4892b8cb2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -226,7 +226,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s int res; sock_confirm_neigh(skb, neigh); - res = neigh_output(neigh, skb); + res = neigh_output(neigh, skb, false); rcu_read_unlock_bh(); return res; -- cgit From 5c9f7c1dfc2e0776551ef1ceb335187c6698d1ff Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:34 -0700 Subject: ipv4: Add helpers for neigh lookup for nexthop A common theme in the output path is looking up a neigh entry for a nexthop, either the gateway in an rtable or a fallback to the daddr in the skb: nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); To allow the nexthop to be an IPv6 address we need to consider the family of the nexthop and then call __ipv{4,6}_neigh_lookup_noref based on it. To make this simpler, add a ip_neigh_gw4 helper similar to ip_neigh_gw6 added in an earlier patch which handles: neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); And then add a second one, ip_neigh_for_gw, that calls either ip_neigh_gw4 or ip_neigh_gw6 based on the address family of the gateway. Update the output paths in the VRF driver and core v4 code to use ip_neigh_for_gw simplifying the family based lookup and making both ready for a v6 nexthop. ipv4_neigh_lookup has a different need - the potential to resolve a passed in address in addition to any gateway in the rtable or skb. Since this is a one-off, add ip_neigh_gw4 and ip_neigh_gw6 diectly. The difference between __neigh_create used by the helpers and neigh_create called by ipv4_neigh_lookup is taking a refcount, so add rcu_read_lock_bh and bump the refcnt on the neigh entry. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 11 ++++------- net/ipv4/route.c | 29 +++++++++++++++++++---------- 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cca4892b8cb2..4e42c1974ba2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -188,7 +188,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; - u32 nexthop; + bool is_v6gw = false; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); @@ -218,16 +218,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s } rcu_read_lock_bh(); - nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); - neigh = __ipv4_neigh_lookup_noref(dev, nexthop); - if (unlikely(!neigh)) - neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int res; sock_confirm_neigh(skb, neigh); - res = neigh_output(neigh, skb, false); - + /* if crossing protocols, can not use the cached header */ + res = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock_bh(); return res; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6e58acf0a87b..32ecb4c1c7e3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -436,18 +436,27 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; - const __be32 *pkey = daddr; struct neighbour *n; - if (rt->rt_gw_family == AF_INET) - pkey = (const __be32 *) &rt->rt_gw4; - else if (skb) - pkey = &ip_hdr(skb)->daddr; - - n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); - if (n) - return n; - return neigh_create(&arp_tbl, pkey, dev); + rcu_read_lock_bh(); + + if (likely(rt->rt_gw_family == AF_INET)) { + n = ip_neigh_gw4(dev, rt->rt_gw4); + } else if (rt->rt_gw_family == AF_INET6) { + n = ip_neigh_gw6(dev, &rt->rt_gw6); + } else { + __be32 pkey; + + pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr); + n = ip_neigh_gw4(dev, pkey); + } + + if (n && !refcount_inc_not_zero(&n->refcnt)) + n = NULL; + + rcu_read_unlock_bh(); + + return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) -- cgit From 6de9c0557e4fc7e1b2f8ed6178aad32f64e1d7da Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:36 -0700 Subject: ipv4: Handle ipv6 gateway in ipv4_confirm_neigh Update ipv4_confirm_neigh to handle an ipv6 gateway. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 32ecb4c1c7e3..efa6a36cbfff 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -465,13 +465,15 @@ static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) struct net_device *dev = dst->dev; const __be32 *pkey = daddr; - if (rt->rt_gw_family == AF_INET) + if (rt->rt_gw_family == AF_INET) { pkey = (const __be32 *)&rt->rt_gw4; - else if (!daddr || + } else if (rt->rt_gw_family == AF_INET6) { + return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); + } else if (!daddr || (rt->rt_flags & - (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) + (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; - + } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } -- cgit From 619d1826269b4be5c992bec0029bbfcc823d663d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:37 -0700 Subject: ipv4: Handle ipv6 gateway in fib_detect_death Update fib_detect_death to handle an ipv6 gateway. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index dd95725c318e..e5a6d431bfab 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -457,10 +457,18 @@ static int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, int *last_idx, int dflt) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); struct neighbour *n; int state = NUD_NONE; - n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].fib_nh_gw4, fi->fib_dev); + if (likely(nhc->nhc_gw_family == AF_INET)) + n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev); + else if (nhc->nhc_gw_family == AF_INET6) + n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6, + nhc->nhc_dev); + else + n = NULL; + if (n) { state = n->nud_state; neigh_release(n); -- cgit From 1a38c43d319e745cf12055a266a1f459e2ba9ec3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:38 -0700 Subject: ipv4: Handle ipv6 gateway in fib_good_nh Update fib_good_nh to handle an ipv6 gateway. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e5a6d431bfab..c1ea138335a2 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1909,8 +1909,14 @@ static bool fib_good_nh(const struct fib_nh *nh) rcu_read_lock_bh(); - n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, - (__force u32)nh->fib_nh_gw4); + if (likely(nh->fib_nh_gw_family == AF_INET)) + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); + else if (nh->fib_nh_gw_family == AF_INET6) + n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, + &nh->fib_nh_gw6); + else + n = NULL; if (n) state = n->nud_state; -- cgit From 19a9d136f198cd7c4e26ea6897a0cf067d3f7ecb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:39 -0700 Subject: ipv4: Flag fib_info with a fib_nh using IPv6 gateway Until support is added to the offload drivers, they need to be able to reject routes with an IPv6 gateway. To that end add a flag to fib_info that indicates if any fib_nh has a v6 gateway. The flag allows the drivers to efficiently know the use of a v6 gateway without walking all fib_nh tied to a fib_info each time a route is added. Update mlxsw and rocker to reject the routes with extack message as to why. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c1ea138335a2..4a968e24507b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1349,6 +1349,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + if (nexthop_nh->fib_nh_gw_family == AF_INET6) + fi->fib_nh_is_v6 = true; } endfor_nexthops(fi) fib_rebalance(fi); -- cgit From d15662682db232da77136cd348f4c9df312ca6f9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:40 -0700 Subject: ipv4: Allow ipv6 gateway with ipv4 routes Add support for RTA_VIA and allow an IPv6 nexthop for v4 routes: $ ip ro add 172.16.1.0/24 via inet6 2001:db8::1 dev eth0 $ ip ro ls ... 172.16.1.0/24 via inet6 2001:db8::1 dev eth0 For convenience and simplicity, userspace can use RTA_VIA to specify AF_INET or AF_INET6 gateway. The common fib_nexthop_info dump function compares the gateway address family to the nh_common family to know if the gateway should be encoded as RTA_VIA or RTA_GATEWAY. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 60 ++++++++++++++++++++++++++++++++++++++--- net/ipv4/fib_semantics.c | 69 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 121 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f99a2ec32505..310060e67790 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -665,10 +665,55 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_DPORT] = { .type = NLA_U16 }, }; +int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + struct rtvia *via; + int alen; + + if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { + NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); + return -EINVAL; + } + + via = nla_data(nla); + alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); + + switch (via->rtvia_family) { + case AF_INET: + if (alen != sizeof(__be32)) { + NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); + return -EINVAL; + } + cfg->fc_gw_family = AF_INET; + cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); + break; + case AF_INET6: +#ifdef CONFIG_IPV6 + if (alen != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); + return -EINVAL; + } + cfg->fc_gw_family = AF_INET6; + cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); +#else + NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); + return -EINVAL; +#endif + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); + return -EINVAL; + } + + return 0; +} + static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { + bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; @@ -709,13 +754,16 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: + has_gw = true; cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = nla_get_be32(attr); break; case RTA_VIA: - NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute"); - err = -EINVAL; - goto errout; + has_via = true; + err = fib_gw_from_via(cfg, attr, extack); + if (err) + goto errout; + break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; @@ -754,6 +802,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (has_gw && has_via) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + goto errout; + } + return 0; errout: return err; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4a968e24507b..017273885eee 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -606,12 +606,22 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { - struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); + nlav = nla_find(attrs, attrlen, RTA_VIA); + if (nla && nlav) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + return -EINVAL; + } if (nla) { fib_cfg.fc_gw_family = AF_INET; fib_cfg.fc_gw4 = nla_get_in_addr(nla); + } else if (nlav) { + ret = fib_gw_from_via(&fib_cfg, nlav, extack); + if (ret) + goto errout; } nla = nla_find(attrs, attrlen, RTA_FLOW); @@ -792,11 +802,43 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { - struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla && nla_get_in_addr(nla) != nh->fib_nh_gw4) - return 1; + nlav = nla_find(attrs, attrlen, RTA_VIA); + if (nla && nlav) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + return -EINVAL; + } + + if (nla) { + if (nh->fib_nh_gw_family != AF_INET || + nla_get_in_addr(nla) != nh->fib_nh_gw4) + return 1; + } else if (nlav) { + struct fib_config cfg2; + int err; + + err = fib_gw_from_via(&cfg2, nlav, extack); + if (err) + return err; + + switch (nh->fib_nh_gw_family) { + case AF_INET: + if (cfg2.fc_gw_family != AF_INET || + cfg2.fc_gw4 != nh->fib_nh_gw4) + return 1; + break; + case AF_INET6: + if (cfg2.fc_gw_family != AF_INET6 || + ipv6_addr_cmp(&cfg2.fc_gw6, + &nh->fib_nh_gw6)) + return 1; + break; + } + } + #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) @@ -1429,8 +1471,25 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, goto nla_put_failure; break; case AF_INET6: - if (nla_put_in6_addr(skb, RTA_GATEWAY, &nhc->nhc_gw.ipv6) < 0) + /* if gateway family does not match nexthop family + * gateway is encoded as RTA_VIA + */ + if (nhc->nhc_gw_family != nhc->nhc_family) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen); + } else if (nla_put_in6_addr(skb, RTA_GATEWAY, + &nhc->nhc_gw.ipv6) < 0) { goto nla_put_failure; + } break; } -- cgit From d73f80f921fd323af8f35644fb9f3b129f465f66 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 10 Apr 2019 10:05:51 -0700 Subject: ipv4: Handle RTA_GATEWAY set to 0 Govindarajulu reported a regression with Network Manager which sends an RTA_GATEWAY attribute with the address set to 0. Fixup the handling of RTA_GATEWAY to only set fc_gw_family if the gateway address is actually set. Fixes: f35b794b3b405 ("ipv4: Prepare fib_config for IPv6 gateway") Reported-by: Govindarajulu Varadarajan Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 3 ++- net/ipv4/fib_semantics.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 310060e67790..d4b63f94f7be 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -755,8 +755,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, break; case RTA_GATEWAY: has_gw = true; - cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = nla_get_be32(attr); + if (cfg->fc_gw4) + cfg->fc_gw_family = AF_INET; break; case RTA_VIA: has_via = true; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 017273885eee..779d2be2b135 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -616,8 +616,9 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, return -EINVAL; } if (nla) { - fib_cfg.fc_gw_family = AF_INET; fib_cfg.fc_gw4 = nla_get_in_addr(nla); + if (fib_cfg.fc_gw4) + fib_cfg.fc_gw_family = AF_INET; } else if (nlav) { ret = fib_gw_from_via(&fib_cfg, nlav, extack); if (ret) -- cgit From c9d52f216922425b56b002100b75de34b62b11a0 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 9 Apr 2019 09:59:07 +0200 Subject: fou: correct spelling of encapsulation Correct spelling of encapsulation. Found by inspection. Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/ipv4/fou.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 100e63f57ea6..d2a2f3258e4b 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -136,7 +136,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) break; case 1: { - /* Direct encasulation of IPv4 or IPv6 */ + /* Direct encapsulation of IPv4 or IPv6 */ int prot; @@ -1137,7 +1137,7 @@ static int gue_err(struct sk_buff *skb, u32 info) case 0: /* Full GUE header present */ break; case 1: { - /* Direct encasulation of IPv4 or IPv6 */ + /* Direct encapsulation of IPv4 or IPv6 */ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); switch (((struct iphdr *)guehdr)->version) { -- cgit From 526bb57a6ad6b0ed6de34b3c5eabf394b248618f Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 9 Apr 2019 12:03:07 +0200 Subject: net: fou: remove redundant code in gue_udp_recv Remove not useful protocol version check in gue_udp_recv since just gue version 0 can hit that code. Moreover remove duplicated hdrlen computation Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv4/fou.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index d2a2f3258e4b..b038f563baa4 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -170,9 +170,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) /* guehdr may change after pull */ guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - hdrlen = sizeof(struct guehdr) + optlen; - - if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) + if (validate_gue_flags(guehdr, optlen)) goto drop; hdrlen = sizeof(struct guehdr) + optlen; -- cgit From bf8981a2aa082d9d64771b47c8a1c9c388d8cd40 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Apr 2019 10:44:06 +0200 Subject: netfilter: nf_nat: merge ip/ip6 masquerade headers Both are now implemented by nf_nat_masquerade.c, so no need to keep different headers. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index fd3f9e8a74da..0a2bffb6a0ad 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); -- cgit From adf82accc5f526f1e812f1a8df7292fef7dad19a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Apr 2019 10:44:07 +0200 Subject: netfilter: x_tables: merge ip and ipv6 masquerade modules No need to have separate modules for this. before: text data bss dec filename 2038 1168 0 3206 net/ipv4/netfilter/ipt_MASQUERADE.ko 1526 1024 0 2550 net/ipv6/netfilter/ip6t_MASQUERADE.ko after: text data bss dec filename 2521 1296 0 3817 net/netfilter/xt_MASQUERADE.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 12 ++--- net/ipv4/netfilter/Makefile | 1 - net/ipv4/netfilter/ipt_MASQUERADE.c | 101 ------------------------------------ 3 files changed, 3 insertions(+), 111 deletions(-) delete mode 100644 net/ipv4/netfilter/ipt_MASQUERADE.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ea688832fc4e..1412b029f37f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -224,16 +224,10 @@ if IP_NF_NAT config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" - select NF_NAT_MASQUERADE - default m if NETFILTER_ADVANCED=n + select NETFILTER_XT_TARGET_MASQUERADE help - Masquerading is a special case of NAT: all outgoing connections are - changed to seem to come from a particular interface's address, and - if the interface goes down, those connections are lost. This is - only useful for dialup accounts with dynamic IP address (ie. your IP - address will be different on next dialup). - - To compile it as a module, choose M here. If unsure, say N. + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE. config IP_NF_TARGET_NETMAP tristate "NETMAP target support" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 2cfdda7b109f..c50e0ec095d2 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o # targets obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o -obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c deleted file mode 100644 index 0a2bffb6a0ad..000000000000 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ /dev/null @@ -1,101 +0,0 @@ -/* Masquerade. Simple mapping which alters range to a local IP address - (depending on route). */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Netfilter Core Team "); -MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); - -/* FIXME: Multiple targets. --RR */ -static int masquerade_tg_check(const struct xt_tgchk_param *par) -{ - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - - if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { - pr_debug("bad MAP_IPS.\n"); - return -EINVAL; - } - if (mr->rangesize != 1) { - pr_debug("bad rangesize %u\n", mr->rangesize); - return -EINVAL; - } - return nf_ct_netns_get(par->net, par->family); -} - -static unsigned int -masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct nf_nat_range2 range; - const struct nf_nat_ipv4_multi_range_compat *mr; - - mr = par->targinfo; - range.flags = mr->range[0].flags; - range.min_proto = mr->range[0].min; - range.max_proto = mr->range[0].max; - - return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range, - xt_out(par)); -} - -static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) -{ - nf_ct_netns_put(par->net, par->family); -} - -static struct xt_target masquerade_tg_reg __read_mostly = { - .name = "MASQUERADE", - .family = NFPROTO_IPV4, - .target = masquerade_tg, - .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), - .table = "nat", - .hooks = 1 << NF_INET_POST_ROUTING, - .checkentry = masquerade_tg_check, - .destroy = masquerade_tg_destroy, - .me = THIS_MODULE, -}; - -static int __init masquerade_tg_init(void) -{ - int ret; - - ret = xt_register_target(&masquerade_tg_reg); - if (ret) - return ret; - - ret = nf_nat_masquerade_ipv4_register_notifier(); - if (ret) - xt_unregister_target(&masquerade_tg_reg); - - return ret; -} - -static void __exit masquerade_tg_exit(void) -{ - xt_unregister_target(&masquerade_tg_reg); - nf_nat_masquerade_ipv4_unregister_notifier(); -} - -module_init(masquerade_tg_init); -module_exit(masquerade_tg_exit); -- cgit From c7cbdbf29f488a19982cd9f4a109887f18028bbb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Apr 2019 22:51:48 +0200 Subject: net: rework SIOCGSTAMP ioctl handling The SIOCGSTAMP/SIOCGSTAMPNS ioctl commands are implemented by many socket protocol handlers, and all of those end up calling the same sock_get_timestamp()/sock_get_timestampns() helper functions, which results in a lot of duplicate code. With the introduction of 64-bit time_t on 32-bit architectures, this gets worse, as we then need four different ioctl commands in each socket protocol implementation. To simplify that, let's add a new .gettstamp() operation in struct proto_ops, and move ioctl implementation into the common sock_ioctl()/compat_sock_ioctl_trans() functions that these all go through. We can reuse the sock_get_timestamp() implementation, but generalize it so it can deal with both native and compat mode, as well as timeval and timespec structures. Acked-by: Stefan Schmidt Acked-by: Neil Horman Acked-by: Marc Kleine-Budde Link: https://lore.kernel.org/lkml/CAK8P3a038aDQQotzua_QtKGhq8O9n+rdiz2=WDCp82ys8eUT+A@mail.gmail.com/ Signed-off-by: Arnd Bergmann Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 08a8430f5647..5183a2daba64 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -915,12 +915,6 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct rtentry rt; switch (cmd) { - case SIOCGSTAMP: - err = sock_get_timestamp(sk, (struct timeval __user *)arg); - break; - case SIOCGSTAMPNS: - err = sock_get_timestampns(sk, (struct timespec __user *)arg); - break; case SIOCADDRT: case SIOCDELRT: if (copy_from_user(&rt, p, sizeof(struct rtentry))) @@ -992,6 +986,7 @@ const struct proto_ops inet_stream_ops = { .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, @@ -1027,6 +1022,7 @@ const struct proto_ops inet_dgram_ops = { .getname = inet_getname, .poll = udp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, @@ -1059,6 +1055,7 @@ static const struct proto_ops inet_sockraw_ops = { .getname = inet_getname, .poll = datagram_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, -- cgit From d7cc399e1227e74e44f78847d9732a228b46cc91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Apr 2019 16:02:03 -0700 Subject: tcp: properly reset skb->truesize for tx recycling tcp sendmsg() and sendpage() normally advance skb->data_len and skb->truesize by the payload added to an skb. But sendmsg(fd, ..., MSG_ZEROCOPY) has to account for whole pages, even if a single byte of payload is used in the page. This means that we can not assume skb->truesize can be adjusted by skb->data_len. We must instead overwrite its value. Otherwise skb->truesize is too big and can hit socket sndbuf limit, especially if the skb is recycled multiple times :/ Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx") Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 603e770d59b3..f7567a3698eb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -868,7 +868,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, if (likely(!size)) { skb = sk->sk_tx_skb_cache; if (skb && !skb_cloned(skb)) { - skb->truesize -= skb->data_len; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); sk->sk_tx_skb_cache = NULL; pskb_trim(skb, 0); INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); -- cgit From 3c618c1dbb8859625c643121ac80af9a6723533f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 20 Apr 2019 09:28:20 -0700 Subject: net: Rename net/nexthop.h net/rtnh.h The header contains rtnh_ macros so rename the file accordingly. Allows a later patch to use the nexthop.h name for the new nexthop code. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 +- net/ipv4/ipmr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 779d2be2b135..b5230c4a1c16 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9a3f13edc98e..a8eb97777c0a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -66,7 +66,7 @@ #include #include #include -#include +#include #include -- cgit From 3557b3fdeefacdd111469f90db1a0602902c9698 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 20 Apr 2019 23:29:47 -0400 Subject: net: bpfilter: dont use module_init in non-modular code The Kconfig controlling this code is: bpfilter/Kconfig:menuconfig BPFILTER bpfilter/Kconfig: bool "BPF based packet filtering framework (BPFILTER)" Since it isn't a module, we shouldn't use module_init(). Instead we use device_initcall() - which is exactly what module_init() defaults to for non-modular code/builds. We don't remove from the includes since this file does a request_module() and hence is a valid user of that header file, even though it is not modular itself. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/ipv4/bpfilter/sockopt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 1e976bb93d99..15427163a041 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -77,5 +77,4 @@ static int __init bpfilter_sockopt_init(void) return 0; } - -module_init(bpfilter_sockopt_init); +device_initcall(bpfilter_sockopt_init); -- cgit From f24ea52873c726bf7b54318f00ec45050222b367 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Apr 2019 16:44:37 +0200 Subject: xfrm: remove tos indirection from afinfo_policy Only used by ipv4, we can read the fl4 tos value directly instead. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_policy.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index d73a6d6652f6..244d26baa3af 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -69,11 +69,6 @@ static int xfrm4_get_saddr(struct net *net, int oif, return 0; } -static int xfrm4_get_tos(const struct flowi *fl) -{ - return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */ -} - static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { @@ -272,7 +267,6 @@ static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, .decode_session = _decode_session4, - .get_tos = xfrm4_get_tos, .init_path = xfrm4_init_path, .fill_dst = xfrm4_fill_dst, .blackhole_route = ipv4_blackhole_route, -- cgit From 2e8b4aa816eaaf480fe68b1086614259caf1bf3c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Apr 2019 16:44:38 +0200 Subject: xfrm: remove init_path indirection from afinfo_policy handle this directly, its only used by ipv6. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_policy.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 244d26baa3af..6e89378668ae 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -69,12 +69,6 @@ static int xfrm4_get_saddr(struct net *net, int oif, return 0; } -static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, - int nfheader_len) -{ - return 0; -} - static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { @@ -267,7 +261,6 @@ static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, .decode_session = _decode_session4, - .init_path = xfrm4_init_path, .fill_dst = xfrm4_fill_dst, .blackhole_route = ipv4_blackhole_route, }; -- cgit From c53ac41e3720926301c623d6682bb87ce992a3b3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Apr 2019 16:44:39 +0200 Subject: xfrm: remove decode_session indirection from afinfo_policy No external dependencies, might as well handle this directly. xfrm_afinfo_policy is now 40 bytes on x86_64. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_policy.c | 114 ------------------------------------------------ 1 file changed, 114 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 6e89378668ae..414ab0420604 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -96,118 +95,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, return 0; } -static void -_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) -{ - const struct iphdr *iph = ip_hdr(skb); - u8 *xprth = skb_network_header(skb) + iph->ihl * 4; - struct flowi4 *fl4 = &fl->u.ip4; - int oif = 0; - - if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; - - memset(fl4, 0, sizeof(struct flowi4)); - fl4->flowi4_mark = skb->mark; - fl4->flowi4_oif = reverse ? skb->skb_iif : oif; - - if (!ip_is_fragment(iph)) { - switch (iph->protocol) { - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_TCP: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ports; - - xprth = skb_network_header(skb) + iph->ihl * 4; - ports = (__be16 *)xprth; - - fl4->fl4_sport = ports[!!reverse]; - fl4->fl4_dport = ports[!reverse]; - } - break; - - case IPPROTO_ICMP: - if (xprth + 2 < skb->data || - pskb_may_pull(skb, xprth + 2 - skb->data)) { - u8 *icmp; - - xprth = skb_network_header(skb) + iph->ihl * 4; - icmp = xprth; - - fl4->fl4_icmp_type = icmp[0]; - fl4->fl4_icmp_code = icmp[1]; - } - break; - - case IPPROTO_ESP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be32 *ehdr; - - xprth = skb_network_header(skb) + iph->ihl * 4; - ehdr = (__be32 *)xprth; - - fl4->fl4_ipsec_spi = ehdr[0]; - } - break; - - case IPPROTO_AH: - if (xprth + 8 < skb->data || - pskb_may_pull(skb, xprth + 8 - skb->data)) { - __be32 *ah_hdr; - - xprth = skb_network_header(skb) + iph->ihl * 4; - ah_hdr = (__be32 *)xprth; - - fl4->fl4_ipsec_spi = ah_hdr[1]; - } - break; - - case IPPROTO_COMP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ipcomp_hdr; - - xprth = skb_network_header(skb) + iph->ihl * 4; - ipcomp_hdr = (__be16 *)xprth; - - fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); - } - break; - - case IPPROTO_GRE: - if (xprth + 12 < skb->data || - pskb_may_pull(skb, xprth + 12 - skb->data)) { - __be16 *greflags; - __be32 *gre_hdr; - - xprth = skb_network_header(skb) + iph->ihl * 4; - greflags = (__be16 *)xprth; - gre_hdr = (__be32 *)xprth; - - if (greflags[0] & GRE_KEY) { - if (greflags[0] & GRE_CSUM) - gre_hdr++; - fl4->fl4_gre_key = gre_hdr[1]; - } - } - break; - - default: - fl4->fl4_ipsec_spi = 0; - break; - } - } - fl4->flowi4_proto = iph->protocol; - fl4->daddr = reverse ? iph->saddr : iph->daddr; - fl4->saddr = reverse ? iph->daddr : iph->saddr; - fl4->flowi4_tos = iph->tos; -} - static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { @@ -260,7 +147,6 @@ static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_ops = &xfrm4_dst_ops_template, .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, - .decode_session = _decode_session4, .fill_dst = xfrm4_fill_dst, .blackhole_route = ipv4_blackhole_route, }; -- cgit From bb9cd077e216b886438c5698e1cd75f762ecd3c9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 17 Apr 2019 11:45:13 +0200 Subject: xfrm: remove unneeded export_symbols None of them have any external callers, make them static. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_protocol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 35c54865dc42..bcab48944c15 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -46,7 +46,7 @@ static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) handler != NULL; \ handler = rcu_dereference(handler->next)) \ -int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +static int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm4_protocol *handler; @@ -61,7 +61,6 @@ int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) return 0; } -EXPORT_SYMBOL(xfrm4_rcv_cb); int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) -- cgit From ffa8ce54be3aaf6b15abae3bbd08282b867d3a4f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Apr 2019 08:23:41 -0700 Subject: lwtunnel: Pass encap and encap type attributes to lwtunnel_fill_encap Currently, lwtunnel_fill_encap hardcodes the encap and encap type attributes as RTA_ENCAP and RTA_ENCAP_TYPE, respectively. The nexthop objects want to re-use this code but the encap attributes passed to userspace as NHA_ENCAP and NHA_ENCAP_TYPE. Since that is the only difference, change lwtunnel_fill_encap to take the attribute type as an input. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b5230c4a1c16..c695e629fac2 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1503,7 +1503,8 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, goto nla_put_failure; if (nhc->nhc_lwtstate && - lwtunnel_fill_encap(skb, nhc->nhc_lwtstate) < 0) + lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, + RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; return 0; -- cgit From ecc5663cce8c7d7e4eba32af4e1e3cab296c64b9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Apr 2019 08:48:09 -0700 Subject: net: Change nhc_flags to unsigned char nhc_flags holds the RTNH_F flags for a given nexthop (fib{6}_nh). All of the RTNH_F_ flags fit in an unsigned char, and since the API to userspace (rtnh_flags and lower byte of rtm_flags) is 1 byte it can not grow. Make nhc_flags in fib_nh_common an unsigned char and shrink the size of the struct by 8, from 56 to 48 bytes. Update the flags arguments for up netdevice events and fib_nexthop_info which determines the RTNH_F flags to return on a dump/event. The RTNH_F flags are passed in the lower byte of rtm_flags which is an unsigned int so use a temp variable for the flags to fib_nexthop_info and combine with rtm_flags in the caller. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c695e629fac2..4336f1ec8ab0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1444,7 +1444,7 @@ failure: } int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, - unsigned int *flags, bool skip_oif) + unsigned char *flags, bool skip_oif) { if (nhc->nhc_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; @@ -1520,7 +1520,7 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; - unsigned int flags = 0; + unsigned char flags = 0; rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) @@ -1619,7 +1619,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (fi->fib_nhs == 1) { struct fib_nh *nh = &fi->fib_nh[0]; - unsigned int flags = 0; + unsigned char flags = 0; if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0) goto nla_put_failure; @@ -1902,7 +1902,7 @@ out: * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev, unsigned int nh_flags) +int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { struct fib_info *prev_fi; unsigned int hash; -- cgit From ae0be8de9a53cda3505865c11826d8ff0640237c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 26 Apr 2019 11:13:06 +0200 Subject: netlink: make nla_nest_start() add NLA_F_NESTED flag Even if the NLA_F_NESTED flag was introduced more than 11 years ago, most netlink based interfaces (including recently added ones) are still not setting it in kernel generated messages. Without the flag, message parsers not aware of attribute semantics (e.g. wireshark dissector or libmnl's mnl_nlmsg_fprintf()) cannot recognize nested attributes and won't display the structure of their contents. Unfortunately we cannot just add the flag everywhere as there may be userspace applications which check nlattr::nla_type directly rather than through a helper masking out the flags. Therefore the patch renames nla_nest_start() to nla_nest_start_noflag() and introduces nla_nest_start() as a wrapper adding NLA_F_NESTED. The calls which add NLA_F_NESTED manually are rewritten to use nla_nest_start(). Except for changes in include/net/netlink.h, the patch was generated using this semantic patch: @@ expression E1, E2; @@ -nla_nest_start(E1, E2) +nla_nest_start_noflag(E1, E2) @@ expression E1, E2; @@ -nla_nest_start_noflag(E1, E2 | NLA_F_NESTED) +nla_nest_start(E1, E2) Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 +- net/ipv4/ipmr.c | 6 +++--- net/ipv4/ipmr_base.c | 2 +- net/ipv4/tcp_metrics.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4336f1ec8ab0..71c2165a2ce3 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1550,7 +1550,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { struct nlattr *mp; - mp = nla_nest_start(skb, RTA_MULTIPATH); + mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a8eb97777c0a..1322573b8228 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2783,7 +2783,7 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) return true; vif = &mrt->vif_table[vifid]; - vif_nest = nla_nest_start(skb, IPMRA_VIF); + vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF); if (!vif_nest) return false; if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) || @@ -2867,7 +2867,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) memset(hdr, 0, sizeof(*hdr)); hdr->ifi_family = RTNL_FAMILY_IPMR; - af = nla_nest_start(skb, IFLA_AF_SPEC); + af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) { nlmsg_cancel(skb, nlh); goto out; @@ -2878,7 +2878,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) goto out; } - vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS); + vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS); if (!vifs) { nla_nest_end(skb, af); nlmsg_end(skb, nlh); diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 3e614cc824f7..278834d4babc 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -228,7 +228,7 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (c->mfc_flags & MFC_OFFLOAD) rtm->rtm_flags |= RTNH_F_OFFLOAD; - mp_attr = nla_nest_start(skb, RTA_MULTIPATH); + mp_attr = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp_attr) return -EMSGSIZE; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4ccec4c705f7..9a08bfb0672c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -658,7 +658,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, { int n = 0; - nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); + nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS); if (!nest) goto nla_put_failure; for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { -- cgit From 8cb081746c031fb164089322e2336a0bf5b3070c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:28 +0200 Subject: netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 27 +++++++++++++++------------ net/ipv4/fib_frontend.c | 8 ++++---- net/ipv4/ip_tunnel_core.c | 8 ++++---- net/ipv4/ipmr.c | 12 ++++++------ net/ipv4/route.c | 8 ++++---- 5 files changed, 33 insertions(+), 30 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index eb514f312e6f..701c5d113a34 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -621,8 +621,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv4_policy, extack); if (err < 0) goto errout; @@ -793,8 +793,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, struct in_device *in_dev; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv4_policy, extack); if (err < 0) goto errout; @@ -1689,8 +1689,8 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, fillargs->flags |= NLM_F_DUMP_FILTERED; } - err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, - ifa_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv4_policy, extack); if (err < 0) return err; @@ -1906,7 +1906,8 @@ static int inet_validate_link_af(const struct net_device *dev, if (dev && !__in_dev_get_rcu(dev)) return -EAFNOSUPPORT; - err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL); + err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, + inet_af_policy, NULL); if (err < 0) return err; @@ -1934,7 +1935,7 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) if (!in_dev) return -EAFNOSUPPORT; - if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) + if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) BUG(); if (tb[IFLA_INET_CONF]) { @@ -2076,11 +2077,13 @@ static int inet_netconf_valid_get_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb, - NETCONFA_MAX, devconf_ipv4_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), + tb, NETCONFA_MAX, + devconf_ipv4_policy, extack); - err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb, - NETCONFA_MAX, devconf_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), + tb, NETCONFA_MAX, + devconf_ipv4_policy, extack); if (err) return err; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d4b63f94f7be..b298255f6fdb 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -718,8 +718,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, int err, remaining; struct rtmsg *rtm; - err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, - extack); + err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, + rtm_ipv4_policy, extack); if (err < 0) goto errout; @@ -896,8 +896,8 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); if (err < 0) return err; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c3f3d28d1087..30c1c264bdfc 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -239,8 +239,8 @@ static int ip_tun_build_state(struct nlattr *attr, struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, - extack); + err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, + ip_tun_policy, extack); if (err < 0) return err; @@ -356,8 +356,8 @@ static int ip6_tun_build_state(struct nlattr *attr, struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, - extack); + err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, + ip6_tun_policy, extack); if (err < 0) return err; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1322573b8228..2c61e10a60e3 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2498,8 +2498,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || @@ -2510,8 +2510,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); if (err) return err; @@ -2674,8 +2674,8 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, struct rtmsg *rtm; int ret, rem; - ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, - extack); + ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, + rtm_ipmr_policy, extack); if (ret < 0) goto out; rtm = nlmsg_data(nlh); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4950adeb05c0..795aed6e4720 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2877,8 +2877,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || @@ -2896,8 +2896,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); if (err) return err; -- cgit From ef6243acb4782df587a4d7d6c310fa5b5d82684b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:31 +0200 Subject: genetlink: optionally validate strictly/dumps Add options to strictly validate messages and dump messages, sometimes perhaps validating dump messages non-strictly may be required, so add an option for that as well. Since none of this can really be applied to existing commands, set the options everwhere using the following spatch: @@ identifier ops; expression X; @@ struct genl_ops ops[] = { ..., { .cmd = X, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ... }, ... }; For new commands one should just not copy the .validate 'opt-out' flags and thus get strict validation. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/ipv4/fou.c | 3 +++ net/ipv4/tcp_metrics.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1ca1586a7e46..ca95051317ed 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -913,16 +913,19 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) static const struct genl_ops fou_nl_ops[] = { { .cmd = FOU_CMD_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = fou_nl_cmd_add_port, .flags = GENL_ADMIN_PERM, }, { .cmd = FOU_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = fou_nl_cmd_rm_port, .flags = GENL_ADMIN_PERM, }, { .cmd = FOU_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = fou_nl_cmd_get_port, .dumpit = fou_nl_dump, }, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9a08bfb0672c..f262f2cace29 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -951,11 +951,13 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) static const struct genl_ops tcp_metrics_nl_ops[] = { { .cmd = TCP_METRICS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_get, .dumpit = tcp_metrics_nl_dump, }, { .cmd = TCP_METRICS_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_del, .flags = GENL_ADMIN_PERM, }, -- cgit From e1f172e162c0a11721f1188f12e5b4c3f9f80de6 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 17 Apr 2019 11:46:14 -0300 Subject: netfilter: use macros to create module aliases. Each NAT helper creates a module alias which follows a pattern. Use macros for consistency. Signed-off-by: Flavio Leitner Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_h323.c | 2 +- net/ipv4/netfilter/nf_nat_pptp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 4e6b53ab6c33..7875c98072eb 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -631,4 +631,4 @@ module_exit(fini); MODULE_AUTHOR("Jing Min Zhao "); MODULE_DESCRIPTION("H.323 NAT helper"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_h323"); +MODULE_ALIAS_NF_NAT_HELPER("h323"); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 68b4d450391b..e17b4ee7604c 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -37,7 +37,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP"); -MODULE_ALIAS("ip_nat_pptp"); +MODULE_ALIAS_NF_NAT_HELPER("pptp"); static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) -- cgit From bc9f38c8328e10c22cb2016d6131ea36141c8d11 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:13 -0700 Subject: tcp: avoid unconditional congestion window undo on SYN retransmit Previously if an active TCP open has SYN timeout, it always undo the cwnd upon receiving the SYNACK. This is because tcp_clean_rtx_queue would reset tp->retrans_stamp when SYN is acked, which fools then tcp_try_undo_loss and tcp_packet_delayed. Addressing this issue is required to properly support undo for spurious SYN timeout. Fixing this is tricky -- for active TCP open tp->retrans_stamp records the time when the handshake starts, not the first retransmission time as the name may suggest. The simplest fix is for tcp_packet_delayed to ensure it is valid before comparing with other timestamp. One side effect of this change is active TCP Fast Open that incurred SYN timeout. Upon receiving a SYN-ACK that only acknowledged the SYN, it would immediately retransmit unacknowledged data in tcp_ack() because the data is marked lost after SYN timeout. But the retransmission would have an incorrect ack sequence number since rcv_nxt has not been updated yet tcp_rcv_synsent_state_process(), the retransmission needs to properly handed by tcp_rcv_fastopen_synack() like before. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 97671bff597a..e2cbfc3ffa3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2252,7 +2252,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { - return !tp->retrans_stamp || + return tp->retrans_stamp && tcp_tsopt_ecr_before(tp, tp->retrans_stamp); } @@ -3521,7 +3521,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) { struct tcp_sock *tp = tcp_sk(sk); - if (rexmit == REXMIT_NONE) + if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) return; if (unlikely(rexmit == 2)) { -- cgit From 7c1f08154c4e34d10be41156375ce2b8ab591b0f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:14 -0700 Subject: tcp: undo initial congestion window on false SYN timeout Linux implements RFC6298 and use an initial congestion window of 1 upon establishing the connection if the SYN packet is retransmitted 2 or more times. In cellular networks SYN timeouts are often spurious if the wireless radio was dormant or idle. Also some network path is longer than the default SYN timeout. Having a minimal cwnd on both cases are detrimental to TCP startup performance. This patch extends TCP undo feature (RFC3522 aka TCP Eifel) to detect spurious SYN timeout via TCP timestamps. Since tp->retrans_stamp records the initial SYN timestamp instead of first retransmission, we have to implement a different undo code additionally. The detection also must happen before tcp_ack() as retrans_stamp is reset when SYN is acknowledged. Note this patch covers both active regular and fast open. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 16 ++++++++++++++++ net/ipv4/tcp_metrics.c | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e2cbfc3ffa3f..695f840acc14 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5748,6 +5748,21 @@ static void smc_check_reset_syn(struct tcp_sock *tp) #endif } +static void tcp_try_undo_spurious_syn(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 syn_stamp; + + /* undo_marker is set when SYN or SYNACK times out. The timeout is + * spurious if the ACK's timestamp option echo value matches the + * original SYN timestamp. + */ + syn_stamp = tp->retrans_stamp; + if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && + syn_stamp == tp->rx_opt.rcv_tsecr) + tp->undo_marker = 0; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5815,6 +5830,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + tcp_try_undo_spurious_syn(sk); tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f262f2cace29..d4d687330e2b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -517,7 +517,7 @@ reset: * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK * retransmission has occurred. */ - if (tp->total_retrans > 1) + if (tp->total_retrans > 1 && tp->undo_marker) tp->snd_cwnd = 1; else tp->snd_cwnd = tcp_init_cwnd(tp, dst); -- cgit From 9e450c1ecb027417c99eba651413d2a6ba6ffc1f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:15 -0700 Subject: tcp: better SYNACK sent timestamp Detecting spurious SYNACK timeout using timestamp option requires recording the exact SYNACK skb timestamp. Previously the SYNACK sent timestamp was stamped slightly earlier before the skb was transmitted. This patch uses the SYNACK skb transmission timestamp directly. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 695f840acc14..30c6a42b1f5b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6319,7 +6319,7 @@ static void tcp_openreq_init(struct request_sock *req, req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tcp_rsk(req)->snt_synack = tcp_clock_us(); + tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 32061928b054..0c4ed66dc1bf 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3247,7 +3247,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif + { skb->skb_mstamp_ns = tcp_clock_ns(); + if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ + tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); + } #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); -- cgit From 336c39a0315139103712d04b9bfaf0215df23b8e Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:16 -0700 Subject: tcp: undo init congestion window on false SYNACK timeout Linux implements RFC6298 and use an initial congestion window of 1 upon establishing the connection if the SYNACK packet is retransmitted 2 or more times. In cellular networks SYNACK timeouts are often spurious if the wireless radio was dormant or idle. Also some network path is longer than the default SYNACK timeout. In both cases falsely starting with a minimal cwnd are detrimental to performance. This patch avoids doing so when the final ACK's TCP timestamp indicates the original SYNACK was delivered. It remembers the original SYNACK timestamp when SYNACK timeout has occurred and re-uses the function to detect spurious SYN timeout conveniently. Note that a server may receives multiple SYNs from and immediately retransmits SYNACKs without any SYNACK timeout. This often happens on when the client SYNs have timed out due to wireless delay above. In this case since the server will still use the default initial congestion (e.g. 10) because tp->undo_marker is reset in tcp_init_metrics(). This is an intentional design because packets are not lost but delayed. This patch only covers regular TCP passive open. Fast Open is supported in the next patch. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 ++ net/ipv4/tcp_minisocks.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 30c6a42b1f5b..53b4c5a3113b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6101,6 +6101,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) */ tcp_rearm_rto(sk); } else { + tcp_try_undo_spurious_syn(sk); + tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 79900f783e0d..9c2a0d36fb20 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -522,6 +522,11 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } + if (req->num_timeout) { + newtp->undo_marker = treq->snt_isn; + newtp->retrans_stamp = div_u64(treq->snt_synack, + USEC_PER_SEC / TCP_TS_HZ); + } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ -- cgit From 8c3cfe19feac41065bb88bc14b36c318b26847a9 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:17 -0700 Subject: tcp: lower congestion window on Fast Open SYNACK timeout TCP sender would use congestion window of 1 packet on the second SYN and SYNACK timeout except passive TCP Fast Open. This makes passive TFO too aggressive and unfair during congestion at handshake. This patch fixes this issue so TCP (fast open or not, passive or active) always conforms to the RFC6298. Note that tcp_enter_loss() is called only once during recurring timeouts. This is because during handshake, high_seq and snd_una are the same so tcp_enter_loss() would incorrect set the undo state variables multiple times. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f0c86398e6a7..2ac23da42dd2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -393,6 +393,9 @@ static void tcp_fastopen_synack_timer(struct sock *sk) tcp_write_err(sk); return; } + /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */ + if (icsk->icsk_retransmits == 1) + tcp_enter_loss(sk); /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error * returned from rtx_syn_ack() to make it more persistent like * regular retransmit because if the child socket has been accepted -- cgit From 794200d66273cbfa32cab2dbcd59a5db6b57a5d1 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:18 -0700 Subject: tcp: undo cwnd on Fast Open spurious SYNACK retransmit This patch makes passive Fast Open reverts the cwnd to default initial cwnd (10 packets) if the SYNACK timeout is spurious. Passive Fast Open uses a full socket during handshake so it can use the existing undo logic to detect spurious retransmission by recording the first SYNACK timeout in key state variable retrans_stamp. Upon receiving the ACK of the SYNACK, if the socket has sent some data before the timeout, the spurious timeout is detected by tcp_try_undo_recovery() in tcp_process_loss() in tcp_ack(). But if the socket has not send any data yet, tcp_ack() does not execute the undo code since no data is acknowledged. The fix is to check such case explicitly after tcp_ack() during the ACK processing in SYN_RECV state. In addition this is checked in FIN_WAIT_1 state in case the server closes the socket before handshake completes. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53b4c5a3113b..3a40584cb473 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6089,6 +6089,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * so release it. */ if (req) { + tcp_try_undo_loss(sk, false); inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); /* Re-arm the timer because data may have been sent out. @@ -6143,6 +6144,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * our SYNACK so stop the SYNACK timer. */ if (req) { + tcp_try_undo_loss(sk, false); + inet_csk(sk)->icsk_retransmits = 0; /* We no longer need the request sock. */ reqsk_fastopen_remove(sk, req, false); tcp_rearm_rto(sk); -- cgit From 6b94b1c88b660a786fdb1c22d8a0d3529fe40f8c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:19 -0700 Subject: tcp: refactor to consolidate TFO passive open code Use a helper to consolidate two identical code block for passive TFO. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 52 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3a40584cb473..706a99ec73f6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5989,6 +5989,27 @@ reset_and_undo: return 1; } +static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) +{ + tcp_try_undo_loss(sk, false); + inet_csk(sk)->icsk_retransmits = 0; + + /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, + * we no longer need req so release it. + */ + reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false); + + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); +} + /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. @@ -6085,22 +6106,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); - /* Once we leave TCP_SYN_RECV, we no longer need req - * so release it. - */ if (req) { - tcp_try_undo_loss(sk, false); - inet_csk(sk)->icsk_retransmits = 0; - reqsk_fastopen_remove(sk, req, false); - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. - */ - tcp_rearm_rto(sk); + tcp_rcv_synrecv_state_fastopen(sk); } else { tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; @@ -6138,18 +6145,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_FIN_WAIT1: { int tmo; - /* If we enter the TCP_FIN_WAIT1 state and we are a - * Fast Open socket and this is the first acceptable - * ACK we have received, this would have acknowledged - * our SYNACK so stop the SYNACK timer. - */ - if (req) { - tcp_try_undo_loss(sk, false); - inet_csk(sk)->icsk_retransmits = 0; - /* We no longer need the request sock. */ - reqsk_fastopen_remove(sk, req, false); - tcp_rearm_rto(sk); - } + if (req) + tcp_rcv_synrecv_state_fastopen(sk); + if (tp->snd_una != tp->write_seq) break; -- cgit From 98fa6271cfcb1de873b3fe0caf48d9daa1bcc0ac Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:20 -0700 Subject: tcp: refactor setting the initial congestion window Relocate the congestion window initialization from tcp_init_metrics() to tcp_init_transfer() to improve code readability. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 12 ------------ net/ipv4/tcp_input.c | 26 ++++++++++++++++++++++++++ net/ipv4/tcp_metrics.c | 10 ---------- 3 files changed, 26 insertions(+), 22 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f7567a3698eb..1fa15beb8380 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,18 +457,6 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -void tcp_init_transfer(struct sock *sk, int bpf_op) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - tcp_mtup_init(sk); - icsk->icsk_af_ops->rebuild_header(sk); - tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op, 0, NULL); - tcp_init_congestion_control(sk); - tcp_init_buffer_space(sk); -} - static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { struct sk_buff *skb = tcp_write_queue_tail(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 706a99ec73f6..077d9abdfcf5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5647,6 +5647,32 @@ discard: } EXPORT_SYMBOL(tcp_rcv_established); +void tcp_init_transfer(struct sock *sk, int bpf_op) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + + /* Initialize the congestion window to start the transfer. + * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been + * retransmitted. In light of RFC6298 more aggressive 1sec + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK + * retransmission has occurred. + */ + if (tp->total_retrans > 1 && tp->undo_marker) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); + tp->snd_cwnd_stamp = tcp_jiffies32; + + tcp_call_bpf(sk, bpf_op, 0, NULL); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d4d687330e2b..c4848e7a0aad 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -512,16 +512,6 @@ reset: inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been - * retransmitted. In light of RFC6298 more aggressive 1sec - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK - * retransmission has occurred. - */ - if (tp->total_retrans > 1 && tp->undo_marker) - tp->snd_cwnd = 1; - else - tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_jiffies32; } bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) -- cgit From 7fcd1e033dacedd520abebc943c960dcf5add3ae Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 2 May 2019 15:14:15 -0700 Subject: ipmr_base: Do not reset index in mr_table_dump e is the counter used to save the location of a dump when an skb is filled. Once the walk of the table is complete, mr_table_dump needs to return without resetting that index to 0. Dump of a specific table is looping because of the reset because there is no way to indicate the walk of the table is done. Move the reset to the caller so the dump of each table starts at 0, but the loop counter is maintained if a dump fills an skb. Fixes: e1cedae1ba6b0 ("ipmr: Refactor mr_rtm_dumproute") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ipmr_base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 3e614cc824f7..3a1af50bd0a5 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -335,8 +335,6 @@ next_entry2: } spin_unlock_bh(lock); err = 0; - e = 0; - out: cb->args[1] = e; return err; @@ -374,6 +372,7 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, err = mr_table_dump(mrt, skb, cb, fill, lock, filter); if (err < 0) break; + cb->args[1] = 0; next_table: t++; } -- cgit From 0f457a36626fa94026e483836fbf29e451434567 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 30 Apr 2019 07:45:48 -0700 Subject: ipv4: Move cached routes to fib_nh_common While the cached routes, nh_pcpu_rth_output and nh_rth_input, are IPv4 specific, a later patch wants to make them accessible for IPv6 nexthops with IPv4 routes using a fib6_nh. Move the cached routes from fib_nh to fib_nh_common and update references. Initialization of the cached entries is moved to fib_nh_common_init, and free is moved to fib_nh_common_release. Change in location only, from fib_nh up to fib_nh_common; no functional change intended. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 36 +++++++++++++++++++----------------- net/ipv4/route.c | 18 +++++++++--------- 2 files changed, 28 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 71c2165a2ce3..4402ec6dc426 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -212,6 +212,8 @@ void fib_nh_common_release(struct fib_nh_common *nhc) dev_put(nhc->nhc_dev); lwtstate_put(nhc->nhc_lwtstate); + rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); + rt_fibinfo_free(&nhc->nhc_rth_input); } EXPORT_SYMBOL_GPL(fib_nh_common_release); @@ -223,8 +225,6 @@ void fib_nh_release(struct net *net, struct fib_nh *fib_nh) #endif fib_nh_common_release(&fib_nh->nh_common); free_nh_exceptions(fib_nh); - rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output); - rt_fibinfo_free(&fib_nh->nh_rth_input); } /* Release a nexthop info record */ @@ -491,23 +491,35 @@ int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap, u16 encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { + int err; + + nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, + gfp_flags); + if (!nhc->nhc_pcpu_rth_output) + return -ENOMEM; + if (encap) { struct lwtunnel_state *lwtstate; - int err; if (encap_type == LWTUNNEL_ENCAP_NONE) { NL_SET_ERR_MSG(extack, "LWT encap type not specified"); - return -EINVAL; + err = -EINVAL; + goto lwt_failure; } err = lwtunnel_build_state(encap_type, encap, nhc->nhc_family, cfg, &lwtstate, extack); if (err) - return err; + goto lwt_failure; nhc->nhc_lwtstate = lwtstate_get(lwtstate); } return 0; + +lwt_failure: + rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); + nhc->nhc_pcpu_rth_output = NULL; + return err; } EXPORT_SYMBOL_GPL(fib_nh_common_init); @@ -515,18 +527,14 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack) { - int err = -ENOMEM; + int err; nh->fib_nh_family = AF_INET; - nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); - if (!nh->nh_pcpu_rth_output) - goto err_out; - err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, GFP_KERNEL, extack); if (err) - goto init_failure; + return err; nh->fib_nh_oif = cfg->fc_oif; nh->fib_nh_gw_family = cfg->fc_gw_family; @@ -546,12 +554,6 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, nh->fib_nh_weight = nh_weight; #endif return 0; - -init_failure: - rt_fibinfo_free_cpus(nh->nh_pcpu_rth_output); - nh->nh_pcpu_rth_output = NULL; -err_out: - return err; } #ifdef CONFIG_IP_ROUTE_MULTIPATH diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 795aed6e4720..662ac9bd956e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -646,6 +646,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, u32 pmtu, bool lock, unsigned long expires) { + struct fib_nh_common *nhc = &nh->nh_common; struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; @@ -715,13 +716,13 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, * stale, so anyone caching it rechecks if this exception * applies to them. */ - rt = rcu_dereference(nh->nh_rth_input); + rt = rcu_dereference(nhc->nhc_rth_input); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; for_each_possible_cpu(i) { struct rtable __rcu **prt; - prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); + prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1471,13 +1472,14 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) { + struct fib_nh_common *nhc = &nh->nh_common; struct rtable *orig, *prev, **p; bool ret = true; if (rt_is_input_route(rt)) { - p = (struct rtable **)&nh->nh_rth_input; + p = (struct rtable **)&nhc->nhc_rth_input; } else { - p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output); + p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } orig = *p; @@ -1810,7 +1812,7 @@ static int __mkroute_input(struct sk_buff *skb, if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else - rth = rcu_dereference(nh->nh_rth_input); + rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -2105,10 +2107,8 @@ local_input: if (res->fi) { if (!itag) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); - struct fib_nh *nh; - nh = container_of(nhc, struct fib_nh, nh_common); - rth = rcu_dereference(nh->nh_rth_input); + rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -2337,7 +2337,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, do_cache = false; goto add; } - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); + prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) -- cgit From 87063a1fa66740302f08add95ad3d4d316376bef Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 30 Apr 2019 07:45:49 -0700 Subject: ipv4: Pass fib_nh_common to rt_cache_route Now that the cached routes are in fib_nh_common, pass it to rt_cache_route and simplify its callers. For rt_set_nexthop, the tclassid becomes the last user of fib_nh so move the container_of under the #ifdef CONFIG_IP_ROUTE_CLASSID. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/route.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 662ac9bd956e..9b50d0440940 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1470,9 +1470,8 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, return ret; } -static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) +static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) { - struct fib_nh_common *nhc = &nh->nh_common; struct rtable *orig, *prev, **p; bool ret = true; @@ -1576,7 +1575,6 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); - struct fib_nh *nh; if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { rt->rt_gw_family = nhc->nhc_gw_family; @@ -1589,15 +1587,19 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, ip_dst_init_metrics(&rt->dst, fi->fib_metrics); - nh = container_of(nhc, struct fib_nh, nh_common); #ifdef CONFIG_IP_ROUTE_CLASSID - rt->dst.tclassid = nh->nh_tclassid; + { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + rt->dst.tclassid = nh->nh_tclassid; + } #endif - rt->dst.lwtstate = lwtstate_get(nh->fib_nh_lws); + rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) - cached = rt_cache_route(nh, rt); + cached = rt_cache_route(nhc, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or * FIB nexthop have the DST_NOCACHE bit clear. @@ -2139,7 +2141,6 @@ local_input: if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); - struct fib_nh *nh; rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { @@ -2148,8 +2149,7 @@ local_input: rth->dst.input = lwtunnel_input; } - nh = container_of(nhc, struct fib_nh, nh_common); - if (unlikely(!rt_cache_route(nh, rth))) + if (unlikely(!rt_cache_route(nhc, rth))) rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); -- cgit From a5995e7107eb3d9c44744d3cf47d49fabfef01f5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 30 Apr 2019 07:45:50 -0700 Subject: ipv4: Move exception bucket to nh_common Similar to the cached routes, make IPv4 exceptions accessible when using an IPv6 nexthop struct with IPv4 routes. Simplify the exception functions by passing in fib_nh_common since that is all it needs, and then cleanup the call sites that have extraneous fib_nh conversions. As with the cached routes this is a change in location only, from fib_nh up to fib_nh_common; no functional change intended. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 12 ++++++------ net/ipv4/route.c | 41 +++++++++++++++++------------------------ 2 files changed, 23 insertions(+), 30 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4402ec6dc426..d3da6a10f86f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -159,12 +159,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) dst_release_immediate(&rt->dst); } -static void free_nh_exceptions(struct fib_nh *nh) +static void free_nh_exceptions(struct fib_nh_common *nhc) { struct fnhe_hash_bucket *hash; int i; - hash = rcu_dereference_protected(nh->nh_exceptions, 1); + hash = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!hash) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { @@ -214,6 +214,7 @@ void fib_nh_common_release(struct fib_nh_common *nhc) lwtstate_put(nhc->nhc_lwtstate); rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); rt_fibinfo_free(&nhc->nhc_rth_input); + free_nh_exceptions(nhc); } EXPORT_SYMBOL_GPL(fib_nh_common_release); @@ -224,7 +225,6 @@ void fib_nh_release(struct net *net, struct fib_nh *fib_nh) net->ipv4.fib_num_tclassid_users--; #endif fib_nh_common_release(&fib_nh->nh_common); - free_nh_exceptions(fib_nh); } /* Release a nexthop info record */ @@ -1713,12 +1713,12 @@ static int call_fib_nh_notifiers(struct fib_nh *nh, * - if the new MTU is greater than the PMTU, don't make any change * - otherwise, unlock and set PMTU */ -static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig) +static void nh_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) { struct fnhe_hash_bucket *bucket; int i; - bucket = rcu_dereference_protected(nh->nh_exceptions, 1); + bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!bucket) return; @@ -1749,7 +1749,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) hlist_for_each_entry(nh, head, nh_hash) { if (nh->fib_nh_dev == dev) - nh_update_mtu(nh, dev->mtu, orig_mtu); + nh_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9b50d0440940..11ddc276776e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -643,10 +643,10 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh } } -static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, - u32 pmtu, bool lock, unsigned long expires) +static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, + __be32 gw, u32 pmtu, bool lock, + unsigned long expires) { - struct fib_nh_common *nhc = &nh->nh_common; struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; @@ -654,17 +654,17 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, unsigned int i; int depth; - genid = fnhe_genid(dev_net(nh->fib_nh_dev)); + genid = fnhe_genid(dev_net(nhc->nhc_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); - hash = rcu_dereference(nh->nh_exceptions); + hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; - rcu_assign_pointer(nh->nh_exceptions, hash); + rcu_assign_pointer(nhc->nhc_exceptions, hash); } hash += hval; @@ -789,10 +789,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow } else { if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc = FIB_RES_NHC(res); - struct fib_nh *nh; - nh = container_of(nhc, struct fib_nh, nh_common); - update_or_create_fnhe(nh, fl4->daddr, new_gw, + update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); } @@ -1040,10 +1038,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) rcu_read_lock(); if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { struct fib_nh_common *nhc = FIB_RES_NHC(res); - struct fib_nh *nh; - nh = container_of(nhc, struct fib_nh, nh_common); - update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock, + update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + ip_rt_mtu_expires); } rcu_read_unlock(); @@ -1329,7 +1325,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } -static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) +static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe, __rcu **fnhe_p; @@ -1337,7 +1333,7 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_lock_bh(&fnhe_lock); - hash = rcu_dereference_protected(nh->nh_exceptions, + hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock)); hash += hval; @@ -1363,9 +1359,10 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_unlock_bh(&fnhe_lock); } -static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) +static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, + __be32 daddr) { - struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions); + struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; @@ -1379,7 +1376,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { - ip_del_fnhe(nh, daddr); + ip_del_fnhe(nhc, daddr); break; } return fnhe; @@ -1406,10 +1403,9 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) mtu = fi->fib_mtu; if (likely(!mtu)) { - struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); struct fib_nh_exception *fnhe; - fnhe = find_exception(nh, daddr); + fnhe = find_exception(nhc, daddr); if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) mtu = fnhe->fnhe_pmtu; } @@ -1760,7 +1756,6 @@ static int __mkroute_input(struct sk_buff *skb, struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; - struct fib_nh *nh; int err; struct in_device *out_dev; bool do_cache; @@ -1808,8 +1803,7 @@ static int __mkroute_input(struct sk_buff *skb, } } - nh = container_of(nhc, struct fib_nh, nh_common); - fnhe = find_exception(nh, daddr); + fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); @@ -2321,10 +2315,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, do_cache &= fi != NULL; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); - struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); struct rtable __rcu **prth; - fnhe = find_exception(nh, fl4->daddr); + fnhe = find_exception(nhc, fl4->daddr); if (!do_cache) goto add; if (fnhe) { -- cgit From 0e219ae48c3bbf382ef96adf3825457315728c03 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 3 May 2019 17:01:37 +0200 Subject: net: use indirect calls helpers for L3 handler hooks So that we avoid another indirect call per RX packet in the common case. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1132d6d1796a..8d78de4b0304 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -130,6 +130,7 @@ #include #include #include +#include #include #include @@ -188,6 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } +INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; @@ -205,7 +208,8 @@ resubmit: } nf_reset(skb); } - ret = ipprot->handler(skb); + ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, + skb); if (ret < 0) { protocol = -ret; goto resubmit; -- cgit From 97ff7ffb11fe7a859a490771e7ce23f1f335176b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 3 May 2019 17:01:38 +0200 Subject: net: use indirect calls helpers at early demux stage So that we avoid another indirect call per RX packet, if early demux is enabled. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 8d78de4b0304..ed97724c5e33 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -309,6 +309,8 @@ drop: return true; } +INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); static int ip_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *dev) { @@ -326,7 +328,8 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - err = edemux(skb); + err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux, + udp_v4_early_demux, skb); if (unlikely(err)) goto drop_error; /* must reload iph, skb->head might have changed */ -- cgit