From 9403cf2302588022d06f1878b072d3f6933021f0 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Tue, 19 Mar 2019 16:05:44 +0100
Subject: tcp: free request sock directly upon TFO or syncookies error

Since the request socket is created locally, it'd make more sense to
use reqsk_free() instead of reqsk_put() in TFO and syncookies' error
path.

However, tcp_get_cookie_sock() may set ->rsk_refcnt before freeing the
socket; tcp_conn_request() may also have non-null ->rsk_refcnt because
of tcp_try_fastopen(). In both cases 'req' hasn't been exposed
to the outside world and is safe to free immediately, but that'd
trigger the WARN_ON_ONCE in reqsk_free().

Define __reqsk_free() for these situations where we know nobody's
referencing the socket, even though ->rsk_refcnt might be non-null.
Now we can consolidate the error path of tcp_get_cookie_sock() and
tcp_conn_request().

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/syncookies.c | 17 ++++++++---------
 net/ipv4/tcp_input.c  |  5 ++---
 2 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e531344611a0..008545f63667 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -216,16 +216,15 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 		refcount_set(&req->rsk_refcnt, 1);
 		tcp_sk(child)->tsoffset = tsoff;
 		sock_rps_save_rxhash(child, skb);
-		if (!inet_csk_reqsk_queue_add(sk, req, child)) {
-			bh_unlock_sock(child);
-			sock_put(child);
-			child = NULL;
-			reqsk_put(req);
-		}
-	} else {
-		reqsk_free(req);
+		if (inet_csk_reqsk_queue_add(sk, req, child))
+			return child;
+
+		bh_unlock_sock(child);
+		sock_put(child);
 	}
-	return child;
+	__reqsk_free(req);
+
+	return NULL;
 }
 EXPORT_SYMBOL(tcp_get_cookie_sock);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5def3c48870e..5dfbc333e79a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6502,8 +6502,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			reqsk_fastopen_remove(fastopen_sk, req, false);
 			bh_unlock_sock(fastopen_sk);
 			sock_put(fastopen_sk);
-			reqsk_put(req);
-			goto drop;
+			goto drop_and_free;
 		}
 		sk->sk_data_ready(sk);
 		bh_unlock_sock(fastopen_sk);
@@ -6527,7 +6526,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 drop_and_release:
 	dst_release(dst);
 drop_and_free:
-	reqsk_free(req);
+	__reqsk_free(req);
 drop:
 	tcp_listendrop(sk);
 	return 0;
-- 
cgit 


From 9ab948a91b2c2abc8e82845c0e61f4b1683e3a4f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 20 Mar 2019 09:18:59 -0700
Subject: ipv4: Allow amount of dirty memory from fib resizing to be
 controllable

fib_trie implementation calls synchronize_rcu when a certain amount of
pages are dirty from freed entries. The number of pages was determined
experimentally in 2009 (commit c3059477fce2d).

At the current setting, synchronize_rcu is called often -- 51 times in a
second in one test with an average of an 8 msec delay adding a fib entry.
The total impact is a lot of slow down modifying the fib. This is seen
in the output of 'time' - the difference between real time and sys+user.
For example, using 720,022 single path routes and 'ip -batch'[1]:

    $ time ./ip -batch ipv4/routes-1-hops
    real    0m14.214s
    user    0m2.513s
    sys     0m6.783s

So roughly 35% of the actual time to install the routes is from the ip
command getting scheduled out, most notably due to synchronize_rcu (this
is observed using 'perf sched timehist').

This patch makes the amount of dirty memory configurable between 64k where
the synchronize_rcu is called often (small, low end systems that are memory
sensitive) to 64M where synchronize_rcu is called rarely during a large
FIB change (for high end systems with lots of memory). The default is 512kB
which corresponds to the current setting of 128 pages with a 4kB page size.

As an example, at 16MB the worst interval shows 4 calls to synchronize_rcu
in a second blocking for up to 30 msec in a single instance, and a total
of almost 100 msec across the 4 calls in the second. The trade off is
allowing FIB entries to consume more memory in a given time window but
but with much better fib insertion rates (~30% increase in prefixes/sec).
With this patch and net.ipv4.fib_sync_mem set to 16MB, the same batch
file runs in:

    $ time ./ip -batch ipv4/routes-1-hops
    real    0m9.692s
    user    0m2.491s
    sys     0m6.769s

So the dead time is reduced to about 1/2 second or <5% of the real time.

[1] 'ip' modified to not request ACK messages which improves route
    insertion times by about 20%

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c        | 14 ++++++++------
 net/ipv4/sysctl_net_ipv4.c |  9 +++++++++
 2 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index a573e37e0615..1704f432de1f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -183,14 +183,16 @@ struct trie {
 };
 
 static struct key_vector *resize(struct trie *t, struct key_vector *tn);
-static size_t tnode_free_size;
+static unsigned int tnode_free_size;
 
 /*
- * synchronize_rcu after call_rcu for that many pages; it should be especially
- * useful before resizing the root node with PREEMPT_NONE configs; the value was
- * obtained experimentally, aiming to avoid visible slowdown.
+ * synchronize_rcu after call_rcu for outstanding dirty memory; it should be
+ * especially useful before resizing the root node with PREEMPT_NONE configs;
+ * the value was obtained experimentally, aiming to avoid visible slowdown.
  */
-static const int sync_pages = 128;
+unsigned int sysctl_fib_sync_mem = 512 * 1024;
+unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
+unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
 
 static struct kmem_cache *fn_alias_kmem __ro_after_init;
 static struct kmem_cache *trie_leaf_kmem __ro_after_init;
@@ -504,7 +506,7 @@ static void tnode_free(struct key_vector *tn)
 		tn = container_of(head, struct tnode, rcu)->kv;
 	}
 
-	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+	if (tnode_free_size >= sysctl_fib_sync_mem) {
 		tnode_free_size = 0;
 		synchronize_rcu();
 	}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ba0fc4b18465..2316c08e9591 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -549,6 +549,15 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+	{
+		.procname	= "fib_sync_mem",
+		.data		= &sysctl_fib_sync_mem,
+		.maxlen		= sizeof(sysctl_fib_sync_mem),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= &sysctl_fib_sync_mem_min,
+		.extra2		= &sysctl_fib_sync_mem_max,
+	},
 	{ }
 };
 
-- 
cgit 


From 02afc7ad45bd6cfc9fd51fdbc132455371b63469 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Wed, 20 Mar 2019 20:02:56 +0100
Subject: net: dst: remove gc leftovers

Get rid of some obsolete gc-related documentation and macros that were
missed in commit 5b7c9a8ff828 ("net: remove dst gc related code").

CC: Wei Wang <weiwan@google.com>
Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Acked-by: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a5da63e5faa2..14c7fdacaa72 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1176,7 +1176,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 	 *
 	 * When a PMTU/redirect information update invalidates a route,
 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
-	 * DST_OBSOLETE_DEAD by dst_free().
+	 * DST_OBSOLETE_DEAD.
 	 */
 	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
 		return NULL;
-- 
cgit 


From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 21 Mar 2019 22:51:02 +0100
Subject: genetlink: make policy common to family

Since maxattr is common, the policy can't really differ sanely,
so make it common as well.

The only user that did in fact manage to make a non-common policy
is taskstats, which has to be really careful about it (since it's
still using a common maxattr!). This is no longer supported, but
we can fake it using pre_doit.

This reduces the size of e.g. nl80211.o (which has lots of commands):

   text	   data	    bss	    dec	    hex	filename
 398745	  14323	   2240	 415308	  6564c	net/wireless/nl80211.o (before)
 397913	  14331	   2240	 414484	  65314	net/wireless/nl80211.o (after)
--------------------------------
   -832      +8       0    -824

Which is obviously just 8 bytes for each command, and an added 8
bytes for the new policy pointer. I'm not sure why the ops list is
counted as .text though.

Most of the code transformations were done using the following spatch:
    @ops@
    identifier OPS;
    expression POLICY;
    @@
    struct genl_ops OPS[] = {
    ...,
     {
    -	.policy = POLICY,
     },
    ...
    };

    @@
    identifier ops.OPS;
    expression ops.POLICY;
    identifier fam;
    expression M;
    @@
    struct genl_family fam = {
            .ops = OPS,
            .maxattr = M,
    +       .policy = POLICY,
            ...
    };

This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing
the cb->data as ops, which we want to change in a later genl patch.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c         | 4 +---
 net/ipv4/tcp_metrics.c | 3 +--
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 79e98e21cdd7..a23fbb52d265 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -808,20 +808,17 @@ static const struct genl_ops fou_nl_ops[] = {
 	{
 		.cmd = FOU_CMD_ADD,
 		.doit = fou_nl_cmd_add_port,
-		.policy = fou_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
 	{
 		.cmd = FOU_CMD_DEL,
 		.doit = fou_nl_cmd_rm_port,
-		.policy = fou_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
 	{
 		.cmd = FOU_CMD_GET,
 		.doit = fou_nl_cmd_get_port,
 		.dumpit = fou_nl_dump,
-		.policy = fou_nl_policy,
 	},
 };
 
@@ -830,6 +827,7 @@ static struct genl_family fou_nl_family __ro_after_init = {
 	.name		= FOU_GENL_NAME,
 	.version	= FOU_GENL_VERSION,
 	.maxattr	= FOU_ATTR_MAX,
+	.policy = fou_nl_policy,
 	.netnsok	= true,
 	.module		= THIS_MODULE,
 	.ops		= fou_nl_ops,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index b467a7cabf40..4ccec4c705f7 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -953,12 +953,10 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
 		.cmd = TCP_METRICS_CMD_GET,
 		.doit = tcp_metrics_nl_cmd_get,
 		.dumpit = tcp_metrics_nl_dump,
-		.policy = tcp_metrics_nl_policy,
 	},
 	{
 		.cmd = TCP_METRICS_CMD_DEL,
 		.doit = tcp_metrics_nl_cmd_del,
-		.policy = tcp_metrics_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
 };
@@ -968,6 +966,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = {
 	.name		= TCP_METRICS_GENL_NAME,
 	.version	= TCP_METRICS_GENL_VERSION,
 	.maxattr	= TCP_METRICS_ATTR_MAX,
+	.policy = tcp_metrics_nl_policy,
 	.netnsok	= true,
 	.module		= THIS_MODULE,
 	.ops		= tcp_metrics_nl_ops,
-- 
cgit 


From e6d1407013a91722ffc89e980d715eb9ce7b57f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Mar 2019 06:26:29 -0700
Subject: tcp: remove conditional branches from tcp_mstamp_refresh()

tcp_clock_ns() (aka ktime_get_ns()) is using monotonic clock,
so the checks we had in tcp_mstamp_refresh() are no longer
relevant.

This patch removes cpu stall (when the cache line is not hot)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4522579aaca2..e265d1aeeb66 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -52,12 +52,8 @@ void tcp_mstamp_refresh(struct tcp_sock *tp)
 {
 	u64 val = tcp_clock_ns();
 
-	if (val > tp->tcp_clock_cache)
-		tp->tcp_clock_cache = val;
-
-	val = div_u64(val, NSEC_PER_USEC);
-	if (val > tp->tcp_mstamp)
-		tp->tcp_mstamp = val;
+	tp->tcp_clock_cache = val;
+	tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
 }
 
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
-- 
cgit 


From 472c2e07eef045145bc1493cc94a01c87140780a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Mar 2019 08:56:39 -0700
Subject: tcp: add one skb cache for tx

On hosts with a lot of cores, RPC workloads suffer from heavy contention on slab spinlocks.

    20.69%  [kernel]       [k] queued_spin_lock_slowpath
     5.64%  [kernel]       [k] _raw_spin_lock
     3.83%  [kernel]       [k] syscall_return_via_sysret
     3.48%  [kernel]       [k] __entry_text_start
     1.76%  [kernel]       [k] __netif_receive_skb_core
     1.64%  [kernel]       [k] __fget

For each sendmsg(), we allocate one skb, and free it at the time ACK packet comes.

In many cases, ACK packets are handled by another cpus, and this unfortunately
incurs heavy costs for slab layer.

This patch uses an extra pointer in socket structure, so that we try to reuse
the same skb and avoid these expensive costs.

We cache at most one skb per socket so this should be safe as far as
memory pressure is concerned.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 50 +++++++++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6baa6dc1b13b..f0b5a5999145 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -865,6 +865,21 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 {
 	struct sk_buff *skb;
 
+	skb = sk->sk_tx_skb_cache;
+	if (skb && !size) {
+		const struct sk_buff_fclones *fclones;
+
+		fclones = container_of(skb, struct sk_buff_fclones, skb1);
+		if (refcount_read(&fclones->fclone_ref) == 1) {
+			sk->sk_wmem_queued -= skb->truesize;
+			sk_mem_uncharge(sk, skb->truesize);
+			skb->truesize -= skb->data_len;
+			sk->sk_tx_skb_cache = NULL;
+			pskb_trim(skb, 0);
+			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+			return skb;
+		}
+	}
 	/* The TCP header must be at least 32-bit aligned.  */
 	size = ALIGN(size, 4);
 
@@ -1098,30 +1113,6 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 }
 EXPORT_SYMBOL(tcp_sendpage);
 
-/* Do not bother using a page frag for very small frames.
- * But use this heuristic only for the first skb in write queue.
- *
- * Having no payload in skb->head allows better SACK shifting
- * in tcp_shift_skb_data(), reducing sack/rack overhead, because
- * write queue has less skbs.
- * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
- * This also speeds up tso_fragment(), since it wont fallback
- * to tcp_fragment().
- */
-static int linear_payload_sz(bool first_skb)
-{
-	if (first_skb)
-		return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
-	return 0;
-}
-
-static int select_size(bool first_skb, bool zc)
-{
-	if (zc)
-		return 0;
-	return linear_payload_sz(first_skb);
-}
-
 void tcp_free_fastopen_req(struct tcp_sock *tp)
 {
 	if (tp->fastopen_req) {
@@ -1272,7 +1263,6 @@ restart:
 
 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 			bool first_skb;
-			int linear;
 
 new_segment:
 			if (!sk_stream_memory_free(sk))
@@ -1283,8 +1273,7 @@ new_segment:
 				goto restart;
 			}
 			first_skb = tcp_rtx_and_write_queues_empty(sk);
-			linear = select_size(first_skb, zc);
-			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
+			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
 				goto wait_for_memory;
@@ -2552,6 +2541,13 @@ void tcp_write_queue_purge(struct sock *sk)
 		sk_wmem_free_skb(sk, skb);
 	}
 	tcp_rtx_queue_purge(sk);
+	skb = sk->sk_tx_skb_cache;
+	if (skb) {
+		sk->sk_wmem_queued -= skb->truesize;
+		sk_mem_uncharge(sk, skb->truesize);
+		__kfree_skb(skb);
+		sk->sk_tx_skb_cache = NULL;
+	}
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
-- 
cgit 


From 8b27dae5a2e89a61c46c6dbc76c040c0e6d0ed4c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Mar 2019 08:56:40 -0700
Subject: tcp: add one skb cache for rx

Often times, recvmsg() system calls and BH handling for a particular
TCP socket are done on different cpus.

This means the incoming skb had to be allocated on a cpu,
but freed on another.

This incurs a high spinlock contention in slab layer for small rpc,
but also a high number of cache line ping pongs for larger packets.

A full size GRO packet might use 45 page fragments, meaning
that up to 45 put_page() can be involved.

More over performing the __kfree_skb() in the recvmsg() context
adds a latency for user applications, and increase probability
of trapping them in backlog processing, since the BH handler
might found the socket owned by the user.

This patch, combined with the prior one increases the rpc
performance by about 10 % on servers with large number of cores.

(tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps
 instead of 8 Mpps)

This also increases single bulk flow performance on 40Gbit+ links,
since in this case there are often two cpus working in tandem :

 - CPU handling the NIC rx interrupts, feeding the receive queue,
  and (after this patch) freeing the skbs that were consumed.

 - CPU in recvmsg() system call, essentially 100 % busy copying out
  data to user space.

Having at most one skb in a per-socket cache has very little risk
of memory exhaustion, and since it is protected by socket lock,
its management is essentially free.

Note that if rps/rfs is used, we do not enable this feature, because
there is high chance that the same cpu is handling both the recvmsg()
system call and the TCP rx path, but that another cpu did the skb
allocations in the device driver right before the RPS/RFS logic.

To properly handle this case, it seems we would need to record
on which cpu skb was allocated, and use a different channel
to give skbs back to this cpu.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c  |  4 ++++
 net/ipv4/tcp.c      |  4 ++++
 net/ipv4/tcp_ipv4.c | 11 +++++++++--
 3 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eab3ebde981e..7f3a984ad618 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -136,6 +136,10 @@ void inet_sock_destruct(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	__skb_queue_purge(&sk->sk_error_queue);
 
 	sk_mem_reclaim(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f0b5a5999145..29b94edf05f9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2583,6 +2583,10 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	tp->copied_seq = tp->rcv_nxt;
 	tp->urg_data = 0;
 	tcp_write_queue_purge(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 277d71239d75..3979939804b7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1774,6 +1774,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 int tcp_v4_rcv(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
+	struct sk_buff *skb_to_free;
 	int sdif = inet_sdif(skb);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
@@ -1905,11 +1906,17 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v4_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 
 put_and_return:
 	if (refcounted)
-- 
cgit 


From 65fd2c2afac31a4b46a80150347a1748fa9101cb Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Thu, 21 Mar 2019 16:41:37 +0200
Subject: xfrm: gso partial offload support

This patch introduces support for gso partial ESP offload.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Raed Salem <raeds@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4_offload.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 8756e0e790d2..c6c84f2bc41c 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -138,9 +138,11 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
 
 	skb->encap_hdr_csum = 1;
 
-	if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
+	if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) &&
+	     !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev)
 		esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
-	else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
+	else if (!(features & NETIF_F_HW_ESP_TX_CSUM) &&
+		 !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM))
 		esp_features = features & ~NETIF_F_CSUM_MASK;
 
 	xo->flags |= XFRM_GSO_SEGMENT;
@@ -181,7 +183,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features_
 	if (!xo)
 		return -EINVAL;
 
-	if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
+	if ((!(features & NETIF_F_HW_ESP) &&
+	     !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) ||
+	    x->xso.dev != skb->dev) {
 		xo->flags |= CRYPTO_FALLBACK;
 		hw_offload = false;
 	}
-- 
cgit 


From f981c57ffd2d7cf2dd4b6d6f8fcb3965df42f54c Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Sat, 23 Mar 2019 14:43:02 +0000
Subject: vti4: eliminated some duplicate code.

The ipip tunnel introduced in commit dd9ee3444014 ("vti4: Fix a ipip
packet processing bug in 'IPCOMP' virtual tunnel") largely duplicated
the existing vti_input and vti_recv functions.  Refactored to
deduplicate the common code.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_vti.c | 60 ++++++++++++++++++++-----------------------------------
 1 file changed, 22 insertions(+), 38 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 68a21bf75dd0..a8474799fb79 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -50,7 +50,7 @@ static unsigned int vti_net_id __read_mostly;
 static int vti_tunnel_init(struct net_device *dev);
 
 static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
-		     int encap_type)
+		     int encap_type, bool update_skb_dev)
 {
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
@@ -65,6 +65,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
 
 		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
 
+		if (update_skb_dev)
+			skb->dev = tunnel->dev;
+
 		return xfrm_input(skb, nexthdr, spi, encap_type);
 	}
 
@@ -74,47 +77,28 @@ drop:
 	return 0;
 }
 
-static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi,
-		     int encap_type)
+static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
+			   int encap_type)
 {
-	struct ip_tunnel *tunnel;
-	const struct iphdr *iph = ip_hdr(skb);
-	struct net *net = dev_net(skb->dev);
-	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
-
-	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
-				  iph->saddr, iph->daddr, 0);
-	if (tunnel) {
-		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
-			goto drop;
-
-		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
-
-		skb->dev = tunnel->dev;
-
-		return xfrm_input(skb, nexthdr, spi, encap_type);
-	}
-
-	return -EINVAL;
-drop:
-	kfree_skb(skb);
-	return 0;
+	return vti_input(skb, nexthdr, spi, encap_type, false);
 }
 
-static int vti_rcv(struct sk_buff *skb)
+static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev)
 {
 	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
 	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
 
-	return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
+	return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev);
 }
 
-static int vti_rcv_ipip(struct sk_buff *skb)
+static int vti_rcv_proto(struct sk_buff *skb)
 {
-	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
-	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+	return vti_rcv(skb, 0, false);
+}
 
-	return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0);
+static int vti_rcv_tunnel(struct sk_buff *skb)
+{
+	return vti_rcv(skb, ip_hdr(skb)->saddr, true);
 }
 
 static int vti_rcv_cb(struct sk_buff *skb, int err)
@@ -447,31 +431,31 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)
 }
 
 static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
-	.handler	=	vti_rcv,
-	.input_handler	=	vti_input,
+	.handler	=	vti_rcv_proto,
+	.input_handler	=	vti_input_proto,
 	.cb_handler	=	vti_rcv_cb,
 	.err_handler	=	vti4_err,
 	.priority	=	100,
 };
 
 static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
-	.handler	=	vti_rcv,
-	.input_handler	=	vti_input,
+	.handler	=	vti_rcv_proto,
+	.input_handler	=	vti_input_proto,
 	.cb_handler	=	vti_rcv_cb,
 	.err_handler	=	vti4_err,
 	.priority	=	100,
 };
 
 static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
-	.handler	=	vti_rcv,
-	.input_handler	=	vti_input,
+	.handler	=	vti_rcv_proto,
+	.input_handler	=	vti_input_proto,
 	.cb_handler	=	vti_rcv_cb,
 	.err_handler	=	vti4_err,
 	.priority	=	100,
 };
 
 static struct xfrm_tunnel ipip_handler __read_mostly = {
-	.handler	=	vti_rcv_ipip,
+	.handler	=	vti_rcv_tunnel,
 	.err_handler	=	vti4_err,
 	.priority	=	0,
 };
-- 
cgit 


From 1713cb37bf671e5d98919536941a8b56337874fd Mon Sep 17 00:00:00 2001
From: Kristian Evensen <kristian.evensen@gmail.com>
Date: Wed, 27 Mar 2019 11:16:03 +0100
Subject: fou: Support binding FoU socket
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An FoU socket is currently bound to the wildcard-address. While this
works fine, there are several use-cases where the use of the
wildcard-address is not desirable. For example, I use FoU on some
multi-homed servers and would like to use FoU on only one of the
interfaces.

This commit adds support for binding FoU sockets to a given source
address/interface, as well as connecting the socket to a given
destination address/port. udp_tunnel already provides the required
infrastructure, so most of the code added is for exposing and setting
the different attributes (local address, peer address, etc.).

The lookups performed when we add, delete or get an FoU-socket has also
been updated to compare all the attributes a user can set. Since the
comparison now involves several elements, I have added a separate
comparison-function instead of open-coding.

In order to test the code and ensure that the new comparison code works
correctly, I started by creating a wildcard socket bound to port 1234 on
my machine. I then tried to create a non-wildcarded socket bound to the
same port, as well as fetching and deleting the socket (including source
address, peer address or interface index in the netlink request).  Both
the create, fetch and delete request failed. Deleting/fetching the
socket was only successful when my netlink request attributes matched
those used to create the socket.

I then repeated the tests, but with a socket bound to a local ip
address, a socket bound to a local address + interface, and a bound
socket that was also «connected» to a peer. Add only worked when no
socket with the matching source address/interface (or wildcard) existed,
while fetch/delete was only successful when all attributes matched.

In addition to testing that the new code work, I also checked that the
current behavior is kept. If none of the new attributes are provided,
then an FoU-socket is configured as before (i.e., wildcarded).  If any
of the new attributes are provided, the FoU-socket is configured as
expected.

v1->v2:
* Fixed building with IPv6 disabled (kbuild).
* Fixed a return type warning and make the ugly comparison function more
readable (kbuild).
* Describe more in detail what has been tested (thanks David Miller).
* Make peer port required if peer address is specified.

Signed-off-by: Kristian Evensen <kristian.evensen@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 122 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index a23fbb52d265..100e63f57ea6 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -499,15 +499,45 @@ out_unlock:
 	return err;
 }
 
-static int fou_add_to_port_list(struct net *net, struct fou *fou)
+static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg)
+{
+	struct sock *sk = fou->sock->sk;
+	struct udp_port_cfg *udp_cfg = &cfg->udp_config;
+
+	if (fou->family != udp_cfg->family ||
+	    fou->port != udp_cfg->local_udp_port ||
+	    sk->sk_dport != udp_cfg->peer_udp_port ||
+	    sk->sk_bound_dev_if != udp_cfg->bind_ifindex)
+		return false;
+
+	if (fou->family == AF_INET) {
+		if (sk->sk_rcv_saddr != udp_cfg->local_ip.s_addr ||
+		    sk->sk_daddr != udp_cfg->peer_ip.s_addr)
+			return false;
+		else
+			return true;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		if (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) ||
+		    ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6))
+			return false;
+		else
+			return true;
+#endif
+	}
+
+	return false;
+}
+
+static int fou_add_to_port_list(struct net *net, struct fou *fou,
+				struct fou_cfg *cfg)
 {
 	struct fou_net *fn = net_generic(net, fou_net_id);
 	struct fou *fout;
 
 	mutex_lock(&fn->fou_lock);
 	list_for_each_entry(fout, &fn->fou_list, list) {
-		if (fou->port == fout->port &&
-		    fou->family == fout->family) {
+		if (fou_cfg_cmp(fout, cfg)) {
 			mutex_unlock(&fn->fou_lock);
 			return -EALREADY;
 		}
@@ -585,7 +615,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 
 	sk->sk_allocation = GFP_ATOMIC;
 
-	err = fou_add_to_port_list(net, fou);
+	err = fou_add_to_port_list(net, fou, cfg);
 	if (err)
 		goto error;
 
@@ -605,14 +635,12 @@ error:
 static int fou_destroy(struct net *net, struct fou_cfg *cfg)
 {
 	struct fou_net *fn = net_generic(net, fou_net_id);
-	__be16 port = cfg->udp_config.local_udp_port;
-	u8 family = cfg->udp_config.family;
 	int err = -EINVAL;
 	struct fou *fou;
 
 	mutex_lock(&fn->fou_lock);
 	list_for_each_entry(fou, &fn->fou_list, list) {
-		if (fou->port == port && fou->family == family) {
+		if (fou_cfg_cmp(fou, cfg)) {
 			fou_release(fou);
 			err = 0;
 			break;
@@ -626,16 +654,27 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
 static struct genl_family fou_nl_family;
 
 static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
-	[FOU_ATTR_PORT] = { .type = NLA_U16, },
-	[FOU_ATTR_AF] = { .type = NLA_U8, },
-	[FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
-	[FOU_ATTR_TYPE] = { .type = NLA_U8, },
-	[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
+	[FOU_ATTR_PORT]			= { .type = NLA_U16, },
+	[FOU_ATTR_AF]			= { .type = NLA_U8, },
+	[FOU_ATTR_IPPROTO]		= { .type = NLA_U8, },
+	[FOU_ATTR_TYPE]			= { .type = NLA_U8, },
+	[FOU_ATTR_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG, },
+	[FOU_ATTR_LOCAL_V4]		= { .type = NLA_U32, },
+	[FOU_ATTR_PEER_V4]		= { .type = NLA_U32, },
+	[FOU_ATTR_LOCAL_V6]		= { .type = sizeof(struct in6_addr), },
+	[FOU_ATTR_PEER_V6]		= { .type = sizeof(struct in6_addr), },
+	[FOU_ATTR_PEER_PORT]		= { .type = NLA_U16, },
+	[FOU_ATTR_IFINDEX]		= { .type = NLA_S32, },
 };
 
 static int parse_nl_config(struct genl_info *info,
 			   struct fou_cfg *cfg)
 {
+	bool has_local = false, has_peer = false;
+	struct nlattr *attr;
+	int ifindex;
+	__be16 port;
+
 	memset(cfg, 0, sizeof(*cfg));
 
 	cfg->udp_config.family = AF_INET;
@@ -657,8 +696,7 @@ static int parse_nl_config(struct genl_info *info,
 	}
 
 	if (info->attrs[FOU_ATTR_PORT]) {
-		__be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
-
+		port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
 		cfg->udp_config.local_udp_port = port;
 	}
 
@@ -671,6 +709,52 @@ static int parse_nl_config(struct genl_info *info,
 	if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL])
 		cfg->flags |= FOU_F_REMCSUM_NOPARTIAL;
 
+	if (cfg->udp_config.family == AF_INET) {
+		if (info->attrs[FOU_ATTR_LOCAL_V4]) {
+			attr = info->attrs[FOU_ATTR_LOCAL_V4];
+			cfg->udp_config.local_ip.s_addr = nla_get_in_addr(attr);
+			has_local = true;
+		}
+
+		if (info->attrs[FOU_ATTR_PEER_V4]) {
+			attr = info->attrs[FOU_ATTR_PEER_V4];
+			cfg->udp_config.peer_ip.s_addr = nla_get_in_addr(attr);
+			has_peer = true;
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		if (info->attrs[FOU_ATTR_LOCAL_V6]) {
+			attr = info->attrs[FOU_ATTR_LOCAL_V6];
+			cfg->udp_config.local_ip6 = nla_get_in6_addr(attr);
+			has_local = true;
+		}
+
+		if (info->attrs[FOU_ATTR_PEER_V6]) {
+			attr = info->attrs[FOU_ATTR_PEER_V6];
+			cfg->udp_config.peer_ip6 = nla_get_in6_addr(attr);
+			has_peer = true;
+		}
+#endif
+	}
+
+	if (has_peer) {
+		if (info->attrs[FOU_ATTR_PEER_PORT]) {
+			port = nla_get_be16(info->attrs[FOU_ATTR_PEER_PORT]);
+			cfg->udp_config.peer_udp_port = port;
+		} else {
+			return -EINVAL;
+		}
+	}
+
+	if (info->attrs[FOU_ATTR_IFINDEX]) {
+		if (!has_local)
+			return -EINVAL;
+
+		ifindex = nla_get_s32(info->attrs[FOU_ATTR_IFINDEX]);
+
+		cfg->udp_config.bind_ifindex = ifindex;
+	}
+
 	return 0;
 }
 
@@ -702,15 +786,37 @@ static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
 
 static int fou_fill_info(struct fou *fou, struct sk_buff *msg)
 {
+	struct sock *sk = fou->sock->sk;
+
 	if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) ||
 	    nla_put_be16(msg, FOU_ATTR_PORT, fou->port) ||
+	    nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) ||
 	    nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) ||
-	    nla_put_u8(msg, FOU_ATTR_TYPE, fou->type))
+	    nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) ||
+	    nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if))
 		return -1;
 
 	if (fou->flags & FOU_F_REMCSUM_NOPARTIAL)
 		if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL))
 			return -1;
+
+	if (fou->sock->sk->sk_family == AF_INET) {
+		if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, sk->sk_rcv_saddr))
+			return -1;
+
+		if (nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr))
+			return -1;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6,
+				     &sk->sk_v6_rcv_saddr))
+			return -1;
+
+		if (nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr))
+			return -1;
+#endif
+	}
+
 	return 0;
 }
 
@@ -763,7 +869,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
 	ret = -ESRCH;
 	mutex_lock(&fn->fou_lock);
 	list_for_each_entry(fout, &fn->fou_list, list) {
-		if (port == fout->port && family == fout->family) {
+		if (fou_cfg_cmp(fout, &cfg)) {
 			ret = fou_dump_info(fout, info->snd_portid,
 					    info->snd_seq, 0, msg,
 					    info->genlhdr->cmd);
-- 
cgit 


From 4f661542a40217713f2cee0bb6678fbb30d9d367 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Mar 2019 08:34:55 -0700
Subject: tcp: fix zerocopy and notsent_lowat issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

My recent patch had at least three problems :

1) TX zerocopy wants notification when skb is acknowledged,
   thus we need to call skb_zcopy_clear() if the skb is
   cached into sk->sk_tx_skb_cache

2) Some applications might expect precise EPOLLOUT
   notifications, so we need to update sk->sk_wmem_queued
   and call sk_mem_uncharge() from sk_wmem_free_skb()
   in all cases. The SOCK_QUEUE_SHRUNK flag must also be set.

3) Reuse of saved skb should have used skb_cloned() instead
  of simply checking if the fast clone has been freed.

Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 29b94edf05f9..82bd707c0347 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -865,14 +865,9 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 {
 	struct sk_buff *skb;
 
-	skb = sk->sk_tx_skb_cache;
-	if (skb && !size) {
-		const struct sk_buff_fclones *fclones;
-
-		fclones = container_of(skb, struct sk_buff_fclones, skb1);
-		if (refcount_read(&fclones->fclone_ref) == 1) {
-			sk->sk_wmem_queued -= skb->truesize;
-			sk_mem_uncharge(sk, skb->truesize);
+	if (likely(!size)) {
+		skb = sk->sk_tx_skb_cache;
+		if (skb && !skb_cloned(skb)) {
 			skb->truesize -= skb->data_len;
 			sk->sk_tx_skb_cache = NULL;
 			pskb_trim(skb, 0);
@@ -2543,8 +2538,6 @@ void tcp_write_queue_purge(struct sock *sk)
 	tcp_rtx_queue_purge(sk);
 	skb = sk->sk_tx_skb_cache;
 	if (skb) {
-		sk->sk_wmem_queued -= skb->truesize;
-		sk_mem_uncharge(sk, skb->truesize);
 		__kfree_skb(skb);
 		sk->sk_tx_skb_cache = NULL;
 	}
-- 
cgit 


From df453700e8d81b1bdafdf684365ee2b9431fb702 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 Mar 2019 12:40:33 -0700
Subject: inet: switch IP ID generator to siphash

According to Amit Klein and Benny Pinkas, IP ID generation is too weak
and might be used by attackers.

Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix())
having 64bit key and Jenkins hash is risky.

It is time to switch to siphash and its 128bit keys.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Amit Klein <aksecurity@gmail.com>
Reported-by: Benny Pinkas <benny@pinkas.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 14c7fdacaa72..f2688fce39e1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -500,15 +500,17 @@ EXPORT_SYMBOL(ip_idents_reserve);
 
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 {
-	static u32 ip_idents_hashrnd __read_mostly;
 	u32 hash, id;
 
-	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+	/* Note the following code is not safe, but this is okay. */
+	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
+		get_random_bytes(&net->ipv4.ip_id_key,
+				 sizeof(net->ipv4.ip_id_key));
 
-	hash = jhash_3words((__force u32)iph->daddr,
+	hash = siphash_3u32((__force u32)iph->daddr,
 			    (__force u32)iph->saddr,
-			    iph->protocol ^ net_hash_mix(net),
-			    ip_idents_hashrnd);
+			    iph->protocol,
+			    &net->ipv4.ip_id_key);
 	id = ip_idents_reserve(hash, segs);
 	iph->id = htons(id);
 }
-- 
cgit 


From 8373c6c84e6748e1dd8b82c43af37572ab040233 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:46 -0700
Subject: ipv4: Define fib_get_nhs when CONFIG_IP_ROUTE_MULTIPATH is disabled

Define fib_get_nhs to return EINVAL when CONFIG_IP_ROUTE_MULTIPATH is
not enabled and remove the ifdef check for CONFIG_IP_ROUTE_MULTIPATH
in fib_create_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8e185b5a2bf6..b5dbbdfd1e49 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -601,6 +601,15 @@ static void fib_rebalance(struct fib_info *fi)
 }
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
 
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+		       int remaining, struct fib_config *cfg,
+		       struct netlink_ext_ack *extack)
+{
+	NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");
+
+	return -EINVAL;
+}
+
 #define fib_rebalance(fi) do { } while (0)
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -1102,7 +1111,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	} endfor_nexthops(fi)
 
 	if (cfg->fc_mp) {
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
 		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
 		if (err != 0)
 			goto failure;
@@ -1122,11 +1130,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 				       "Nexthop class id does not match RTA_FLOW");
 			goto err_inval;
 		}
-#endif
-#else
-		NL_SET_ERR_MSG(extack,
-			       "Multipath support not enabled in kernel");
-		goto err_inval;
 #endif
 	} else {
 		struct fib_nh *nh = fi->fib_nh;
-- 
cgit 


From 331c7a402358de6206232f6aab7aa48ec6c1088a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:47 -0700
Subject: ipv4: Move IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN to helper

in_dev lookup followed by IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN check
is called in several places, some with the rcu lock and others with the
rtnl held.

Move the check to a helper similar to what IPv6 has. Since the helper
can be invoked from either context use rcu_dereference_rtnl to
dereference ip_ptr.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 31 +++++++------------------------
 net/ipv4/fib_trie.c      |  4 +---
 2 files changed, 8 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b5dbbdfd1e49..78631eb255f7 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -558,7 +558,6 @@ static void fib_rebalance(struct fib_info *fi)
 {
 	int total;
 	int w;
-	struct in_device *in_dev;
 
 	if (fi->fib_nhs < 2)
 		return;
@@ -568,10 +567,7 @@ static void fib_rebalance(struct fib_info *fi)
 		if (nh->nh_flags & RTNH_F_DEAD)
 			continue;
 
-		in_dev = __in_dev_get_rtnl(nh->nh_dev);
-
-		if (in_dev &&
-		    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+		if (ip_ignore_linkdown(nh->nh_dev) &&
 		    nh->nh_flags & RTNH_F_LINKDOWN)
 			continue;
 
@@ -582,12 +578,9 @@ static void fib_rebalance(struct fib_info *fi)
 	change_nexthops(fi) {
 		int upper_bound;
 
-		in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
-
 		if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
 			upper_bound = -1;
-		} else if (in_dev &&
-			   IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+		} else if (ip_ignore_linkdown(nexthop_nh->nh_dev) &&
 			   nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
 			upper_bound = -1;
 		} else {
@@ -1325,12 +1318,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
 			goto nla_put_failure;
 		if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
-			struct in_device *in_dev;
-
 			rcu_read_lock();
-			in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev);
-			if (in_dev &&
-			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+			if (ip_ignore_linkdown(fi->fib_nh->nh_dev))
 				rtm->rtm_flags |= RTNH_F_DEAD;
 			rcu_read_unlock();
 		}
@@ -1361,12 +1350,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 
 			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
 			if (nh->nh_flags & RTNH_F_LINKDOWN) {
-				struct in_device *in_dev;
-
 				rcu_read_lock();
-				in_dev = __in_dev_get_rcu(nh->nh_dev);
-				if (in_dev &&
-				    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+				if (ip_ignore_linkdown(nh->nh_dev))
 					rtnh->rtnh_flags |= RTNH_F_DEAD;
 				rcu_read_unlock();
 			}
@@ -1433,7 +1418,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
 static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
 				 enum fib_event_type event_type)
 {
-	struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
+	bool ignore_link_down = ip_ignore_linkdown(fib_nh->nh_dev);
 	struct fib_nh_notifier_info info = {
 		.fib_nh = fib_nh,
 	};
@@ -1442,14 +1427,12 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
 	case FIB_EVENT_NH_ADD:
 		if (fib_nh->nh_flags & RTNH_F_DEAD)
 			break;
-		if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
-		    fib_nh->nh_flags & RTNH_F_LINKDOWN)
+		if (ignore_link_down && fib_nh->nh_flags & RTNH_F_LINKDOWN)
 			break;
 		return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type,
 					   &info.info);
 	case FIB_EVENT_NH_DEL:
-		if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
-		     fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
+		if ((ignore_link_down && fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
 		    (fib_nh->nh_flags & RTNH_F_DEAD))
 			return call_fib4_notifiers(dev_net(fib_nh->nh_dev),
 						   event_type, &info.info);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1704f432de1f..656d3d19f112 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1471,12 +1471,10 @@ found:
 			continue;
 		for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
 			const struct fib_nh *nh = &fi->fib_nh[nhsel];
-			struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);
 
 			if (nh->nh_flags & RTNH_F_DEAD)
 				continue;
-			if (in_dev &&
-			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+			if (ip_ignore_linkdown(nh->nh_dev) &&
 			    nh->nh_flags & RTNH_F_LINKDOWN &&
 			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
 				continue;
-- 
cgit 


From e4516ef65490ef29d48a98ad4d7c90dccf39068f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:48 -0700
Subject: ipv4: Create init helper for fib_nh

Consolidate the fib_nh initialization which is duplicated between
fib_create_info for single path and fib_get_nhs for multipath.
Export the helper to allow for use with nexthop objects in the
future.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 180 ++++++++++++++++++++++++-----------------------
 1 file changed, 91 insertions(+), 89 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 78631eb255f7..cd15746e2b3f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -457,6 +457,54 @@ static int fib_detect_death(struct fib_info *fi, int order,
 	return 1;
 }
 
+int fib_nh_init(struct net *net, struct fib_nh *nh,
+		struct fib_config *cfg, int nh_weight,
+		struct netlink_ext_ack *extack)
+{
+	int err = -ENOMEM;
+
+	nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
+	if (!nh->nh_pcpu_rth_output)
+		goto err_out;
+
+	if (cfg->fc_encap) {
+		struct lwtunnel_state *lwtstate;
+
+		err = -EINVAL;
+		if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) {
+			NL_SET_ERR_MSG(extack, "LWT encap type not specified");
+			goto lwt_failure;
+		}
+		err = lwtunnel_build_state(cfg->fc_encap_type,
+					   cfg->fc_encap, AF_INET, cfg,
+					   &lwtstate, extack);
+		if (err)
+			goto lwt_failure;
+
+		nh->nh_lwtstate = lwtstate_get(lwtstate);
+	}
+
+	nh->nh_oif = cfg->fc_oif;
+	nh->nh_gw = cfg->fc_gw;
+	nh->nh_flags = cfg->fc_flags;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	nh->nh_tclassid = cfg->fc_flow;
+	if (nh->nh_tclassid)
+		net->ipv4.fib_num_tclassid_users++;
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	nh->nh_weight = nh_weight;
+#endif
+	return 0;
+
+lwt_failure:
+	rt_fibinfo_free_cpus(nh->nh_pcpu_rth_output);
+	nh->nh_pcpu_rth_output = NULL;
+err_out:
+	return err;
+}
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
@@ -483,11 +531,15 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 		       int remaining, struct fib_config *cfg,
 		       struct netlink_ext_ack *extack)
 {
+	struct net *net = fi->fib_net;
+	struct fib_config fib_cfg;
 	int ret;
 
 	change_nexthops(fi) {
 		int attrlen;
 
+		memset(&fib_cfg, 0, sizeof(fib_cfg));
+
 		if (!rtnh_ok(rtnh, remaining)) {
 			NL_SET_ERR_MSG(extack,
 				       "Invalid nexthop configuration - extra data after nexthop");
@@ -500,56 +552,54 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			return -EINVAL;
 		}
 
-		nexthop_nh->nh_flags =
-			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
-		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
-		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
+		fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+		fib_cfg.fc_oif = rtnh->rtnh_ifindex;
 
 		attrlen = rtnh_attrlen(rtnh);
 		if (attrlen > 0) {
 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
-			nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
-#ifdef CONFIG_IP_ROUTE_CLASSID
+			if (nla)
+				fib_cfg.fc_gw = nla_get_in_addr(nla);
+
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
-			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
-			if (nexthop_nh->nh_tclassid)
-				fi->fib_net->ipv4.fib_num_tclassid_users++;
-#endif
-			nla = nla_find(attrs, attrlen, RTA_ENCAP);
-			if (nla) {
-				struct lwtunnel_state *lwtstate;
-				struct nlattr *nla_entype;
-
-				nla_entype = nla_find(attrs, attrlen,
-						      RTA_ENCAP_TYPE);
-				if (!nla_entype) {
-					NL_SET_BAD_ATTR(extack, nla);
-					NL_SET_ERR_MSG(extack,
-						       "Encap type is missing");
-					goto err_inval;
-				}
+			if (nla)
+				fib_cfg.fc_flow = nla_get_u32(nla);
 
-				ret = lwtunnel_build_state(nla_get_u16(
-							   nla_entype),
-							   nla,  AF_INET, cfg,
-							   &lwtstate, extack);
-				if (ret)
-					goto errout;
-				nexthop_nh->nh_lwtstate =
-					lwtstate_get(lwtstate);
-			}
+			fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+			if (nla)
+				fib_cfg.fc_encap_type = nla_get_u16(nla);
 		}
 
+		ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
+				  rtnh->rtnh_hops + 1, extack);
+		if (ret)
+			goto errout;
+
 		rtnh = rtnh_next(rtnh, &remaining);
 	} endfor_nexthops(fi);
 
-	return 0;
-
-err_inval:
 	ret = -EINVAL;
-
+	if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop device index does not match RTA_OIF");
+		goto errout;
+	}
+	if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop gateway does not match RTA_GATEWAY");
+		goto errout;
+	}
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop class id does not match RTA_FLOW");
+		goto errout;
+	}
+#endif
+	ret = 0;
 errout:
 	return ret;
 }
@@ -1098,63 +1148,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	fi->fib_nhs = nhs;
 	change_nexthops(fi) {
 		nexthop_nh->nh_parent = fi;
-		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
-		if (!nexthop_nh->nh_pcpu_rth_output)
-			goto failure;
 	} endfor_nexthops(fi)
 
-	if (cfg->fc_mp) {
+	if (cfg->fc_mp)
 		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
-		if (err != 0)
-			goto failure;
-		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) {
-			NL_SET_ERR_MSG(extack,
-				       "Nexthop device index does not match RTA_OIF");
-			goto err_inval;
-		}
-		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) {
-			NL_SET_ERR_MSG(extack,
-				       "Nexthop gateway does not match RTA_GATEWAY");
-			goto err_inval;
-		}
-#ifdef CONFIG_IP_ROUTE_CLASSID
-		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
-			NL_SET_ERR_MSG(extack,
-				       "Nexthop class id does not match RTA_FLOW");
-			goto err_inval;
-		}
-#endif
-	} else {
-		struct fib_nh *nh = fi->fib_nh;
-
-		if (cfg->fc_encap) {
-			struct lwtunnel_state *lwtstate;
-
-			if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) {
-				NL_SET_ERR_MSG(extack,
-					       "LWT encap type not specified");
-				goto err_inval;
-			}
-			err = lwtunnel_build_state(cfg->fc_encap_type,
-						   cfg->fc_encap, AF_INET, cfg,
-						   &lwtstate, extack);
-			if (err)
-				goto failure;
+	else
+		err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
 
-			nh->nh_lwtstate = lwtstate_get(lwtstate);
-		}
-		nh->nh_oif = cfg->fc_oif;
-		nh->nh_gw = cfg->fc_gw;
-		nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_IP_ROUTE_CLASSID
-		nh->nh_tclassid = cfg->fc_flow;
-		if (nh->nh_tclassid)
-			fi->fib_net->ipv4.fib_num_tclassid_users++;
-#endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-		nh->nh_weight = 1;
-#endif
-	}
+	if (err != 0)
+		goto failure;
 
 	if (fib_props[cfg->fc_type].error) {
 		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) {
-- 
cgit 


From faa041a40b9fa64913789fcc0161c7c73161f57e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:49 -0700
Subject: ipv4: Create cleanup helper for fib_nh

Move the fib_nh cleanup code from free_fib_info_rcu into a new helper,
fib_nh_release. Move classid accounting into fib_nh_release which is
called per fib_nh to make accounting symmetrical with fib_nh_init.
Export the helper to allow for use with nexthop objects in the
future.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index cd15746e2b3f..184940a06cb5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -204,18 +204,28 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
 	free_percpu(rtp);
 }
 
+void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (fib_nh->nh_tclassid)
+		net->ipv4.fib_num_tclassid_users--;
+#endif
+	if (fib_nh->nh_dev)
+		dev_put(fib_nh->nh_dev);
+
+	lwtstate_put(fib_nh->nh_lwtstate);
+	free_nh_exceptions(fib_nh);
+	rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output);
+	rt_fibinfo_free(&fib_nh->nh_rth_input);
+}
+
 /* Release a nexthop info record */
 static void free_fib_info_rcu(struct rcu_head *head)
 {
 	struct fib_info *fi = container_of(head, struct fib_info, rcu);
 
 	change_nexthops(fi) {
-		if (nexthop_nh->nh_dev)
-			dev_put(nexthop_nh->nh_dev);
-		lwtstate_put(nexthop_nh->nh_lwtstate);
-		free_nh_exceptions(nexthop_nh);
-		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
-		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
+		fib_nh_release(fi->fib_net, nexthop_nh);
 	} endfor_nexthops(fi);
 
 	ip_fib_metrics_put(fi->fib_metrics);
@@ -230,12 +240,7 @@ void free_fib_info(struct fib_info *fi)
 		return;
 	}
 	fib_info_cnt--;
-#ifdef CONFIG_IP_ROUTE_CLASSID
-	change_nexthops(fi) {
-		if (nexthop_nh->nh_tclassid)
-			fi->fib_net->ipv4.fib_num_tclassid_users--;
-	} endfor_nexthops(fi);
-#endif
+
 	call_rcu(&fi->rcu, free_fib_info_rcu);
 }
 EXPORT_SYMBOL_GPL(free_fib_info);
-- 
cgit 


From b75ed8b1aa9c3a99702159c3be8b0c1d54972ae5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:55 -0700
Subject: ipv4: Rename fib_nh entries

Rename fib_nh entries that will be moved to a fib_nh_common struct.
Specifically, the device, oif, gateway, flags, scope, lwtstate,
nh_weight and nh_upper_bound are common with all nexthop definitions.
In the process shorten fib_nh_lwtstate to fib_nh_lws to avoid really
long lines.

Rename only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c  |  10 +--
 net/ipv4/fib_semantics.c | 229 ++++++++++++++++++++++++-----------------------
 net/ipv4/fib_trie.c      |  12 +--
 net/ipv4/route.c         |  18 ++--
 4 files changed, 135 insertions(+), 134 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ed14ec245584..ffbe24397dbe 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -324,16 +324,16 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 	for (ret = 0; ret < fi->fib_nhs; ret++) {
 		struct fib_nh *nh = &fi->fib_nh[ret];
 
-		if (nh->nh_dev == dev) {
+		if (nh->fib_nh_dev == dev) {
 			dev_match = true;
 			break;
-		} else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
+		} else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) {
 			dev_match = true;
 			break;
 		}
 	}
 #else
-	if (fi->fib_nh[0].nh_dev == dev)
+	if (fi->fib_nh[0].fib_nh_dev == dev)
 		dev_match = true;
 #endif
 
@@ -390,7 +390,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 
 	dev_match = fib_info_nh_uses_dev(res.fi, dev);
 	if (dev_match) {
-		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+		ret = FIB_RES_NH(res).fib_nh_scope >= RT_SCOPE_HOST;
 		return ret;
 	}
 	if (no_addr)
@@ -402,7 +402,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	ret = 0;
 	if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
 		if (res.type == RTN_UNICAST)
-			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+			ret = FIB_RES_NH(res).fib_nh_scope >= RT_SCOPE_HOST;
 	}
 	return ret;
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 184940a06cb5..c1e16b52338b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -210,10 +210,10 @@ void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
 	if (fib_nh->nh_tclassid)
 		net->ipv4.fib_num_tclassid_users--;
 #endif
-	if (fib_nh->nh_dev)
-		dev_put(fib_nh->nh_dev);
+	if (fib_nh->fib_nh_dev)
+		dev_put(fib_nh->fib_nh_dev);
 
-	lwtstate_put(fib_nh->nh_lwtstate);
+	lwtstate_put(fib_nh->fib_nh_lws);
 	free_nh_exceptions(fib_nh);
 	rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output);
 	rt_fibinfo_free(&fib_nh->nh_rth_input);
@@ -253,7 +253,7 @@ void fib_release_info(struct fib_info *fi)
 		if (fi->fib_prefsrc)
 			hlist_del(&fi->fib_lhash);
 		change_nexthops(fi) {
-			if (!nexthop_nh->nh_dev)
+			if (!nexthop_nh->fib_nh_dev)
 				continue;
 			hlist_del(&nexthop_nh->nh_hash);
 		} endfor_nexthops(fi)
@@ -268,17 +268,17 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 	const struct fib_nh *onh = ofi->fib_nh;
 
 	for_nexthops(fi) {
-		if (nh->nh_oif != onh->nh_oif ||
-		    nh->nh_gw  != onh->nh_gw ||
-		    nh->nh_scope != onh->nh_scope ||
+		if (nh->fib_nh_oif != onh->fib_nh_oif ||
+		    nh->fib_nh_gw4 != onh->fib_nh_gw4 ||
+		    nh->fib_nh_scope != onh->fib_nh_scope ||
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-		    nh->nh_weight != onh->nh_weight ||
+		    nh->fib_nh_weight != onh->fib_nh_weight ||
 #endif
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
-		    lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
-		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
+		    lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
+		    ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
 			return -1;
 		onh++;
 	} endfor_nexthops(fi);
@@ -303,7 +303,7 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
 	val ^= (__force u32)fi->fib_prefsrc;
 	val ^= fi->fib_priority;
 	for_nexthops(fi) {
-		val ^= fib_devindex_hashfn(nh->nh_oif);
+		val ^= fib_devindex_hashfn(nh->fib_nh_oif);
 	} endfor_nexthops(fi)
 
 	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
@@ -352,9 +352,9 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
 	hash = fib_devindex_hashfn(dev->ifindex);
 	head = &fib_info_devhash[hash];
 	hlist_for_each_entry(nh, head, nh_hash) {
-		if (nh->nh_dev == dev &&
-		    nh->nh_gw == gw &&
-		    !(nh->nh_flags & RTNH_F_DEAD)) {
+		if (nh->fib_nh_dev == dev &&
+		    nh->fib_nh_gw4 == gw &&
+		    !(nh->fib_nh_flags & RTNH_F_DEAD)) {
 			spin_unlock(&fib_info_lock);
 			return 0;
 		}
@@ -389,10 +389,10 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 
 		/* grab encap info */
 		for_nexthops(fi) {
-			if (nh->nh_lwtstate) {
+			if (nh->fib_nh_lws) {
 				/* RTA_ENCAP_TYPE */
 				nh_encapsize += lwtunnel_get_encap_size(
-						nh->nh_lwtstate);
+						nh->fib_nh_lws);
 				/* RTA_ENCAP */
 				nh_encapsize +=  nla_total_size(2);
 			}
@@ -443,7 +443,7 @@ static int fib_detect_death(struct fib_info *fi, int order,
 	struct neighbour *n;
 	int state = NUD_NONE;
 
-	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].fib_nh_gw4, fi->fib_dev);
 	if (n) {
 		state = n->nud_state;
 		neigh_release(n);
@@ -486,12 +486,12 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 		if (err)
 			goto lwt_failure;
 
-		nh->nh_lwtstate = lwtstate_get(lwtstate);
+		nh->fib_nh_lws = lwtstate_get(lwtstate);
 	}
 
-	nh->nh_oif = cfg->fc_oif;
-	nh->nh_gw = cfg->fc_gw;
-	nh->nh_flags = cfg->fc_flags;
+	nh->fib_nh_oif = cfg->fc_oif;
+	nh->fib_nh_gw4 = cfg->fc_gw;
+	nh->fib_nh_flags = cfg->fc_flags;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	nh->nh_tclassid = cfg->fc_flow;
@@ -499,7 +499,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 		net->ipv4.fib_num_tclassid_users++;
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	nh->nh_weight = nh_weight;
+	nh->fib_nh_weight = nh_weight;
 #endif
 	return 0;
 
@@ -587,12 +587,12 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 	} endfor_nexthops(fi);
 
 	ret = -EINVAL;
-	if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) {
+	if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) {
 		NL_SET_ERR_MSG(extack,
 			       "Nexthop device index does not match RTA_OIF");
 		goto errout;
 	}
-	if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) {
+	if (cfg->fc_gw && fi->fib_nh->fib_nh_gw4 != cfg->fc_gw) {
 		NL_SET_ERR_MSG(extack,
 			       "Nexthop gateway does not match RTA_GATEWAY");
 		goto errout;
@@ -619,32 +619,32 @@ static void fib_rebalance(struct fib_info *fi)
 
 	total = 0;
 	for_nexthops(fi) {
-		if (nh->nh_flags & RTNH_F_DEAD)
+		if (nh->fib_nh_flags & RTNH_F_DEAD)
 			continue;
 
-		if (ip_ignore_linkdown(nh->nh_dev) &&
-		    nh->nh_flags & RTNH_F_LINKDOWN)
+		if (ip_ignore_linkdown(nh->fib_nh_dev) &&
+		    nh->fib_nh_flags & RTNH_F_LINKDOWN)
 			continue;
 
-		total += nh->nh_weight;
+		total += nh->fib_nh_weight;
 	} endfor_nexthops(fi);
 
 	w = 0;
 	change_nexthops(fi) {
 		int upper_bound;
 
-		if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+		if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
 			upper_bound = -1;
-		} else if (ip_ignore_linkdown(nexthop_nh->nh_dev) &&
-			   nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
+		} else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) &&
+			   nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
 			upper_bound = -1;
 		} else {
-			w += nexthop_nh->nh_weight;
+			w += nexthop_nh->fib_nh_weight;
 			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
 							    total) - 1;
 		}
 
-		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
+		atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound);
 	} endfor_nexthops(fi);
 }
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -677,7 +677,7 @@ static int fib_encap_match(u16 encap_type,
 	ret = lwtunnel_build_state(encap_type, encap, AF_INET,
 				   cfg, &lwtstate, extack);
 	if (!ret) {
-		result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+		result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
 		lwtstate_free(lwtstate);
 	}
 
@@ -706,8 +706,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 		    cfg->fc_flow != fi->fib_nh->nh_tclassid)
 			return 1;
 #endif
-		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
-		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
+		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->fib_nh_oif) &&
+		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->fib_nh_gw4))
 			return 0;
 		return 1;
 	}
@@ -725,7 +725,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 		if (!rtnh_ok(rtnh, remaining))
 			return -EINVAL;
 
-		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
+		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif)
 			return 1;
 
 		attrlen = rtnh_attrlen(rtnh);
@@ -733,7 +733,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
-			if (nla && nla_get_in_addr(nla) != nh->nh_gw)
+			if (nla && nla_get_in_addr(nla) != nh->fib_nh_gw4)
 				return 1;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
@@ -840,10 +840,10 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
 	struct net_device *dev;
 
 	net = cfg->fc_nlinfo.nl_net;
-	if (nh->nh_gw) {
+	if (nh->fib_nh_gw4) {
 		struct fib_result res;
 
-		if (nh->nh_flags & RTNH_F_ONLINK) {
+		if (nh->fib_nh_flags & RTNH_F_ONLINK) {
 			unsigned int addr_type;
 
 			if (cfg->fc_scope >= RT_SCOPE_LINK) {
@@ -851,7 +851,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
 					       "Nexthop has invalid scope");
 				return -EINVAL;
 			}
-			dev = __dev_get_by_index(net, nh->nh_oif);
+			dev = __dev_get_by_index(net, nh->fib_nh_oif);
 			if (!dev) {
 				NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
 				return -ENODEV;
@@ -861,26 +861,27 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
 					       "Nexthop device is not up");
 				return -ENETDOWN;
 			}
-			addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
+			addr_type = inet_addr_type_dev_table(net, dev,
+							     nh->fib_nh_gw4);
 			if (addr_type != RTN_UNICAST) {
 				NL_SET_ERR_MSG(extack,
 					       "Nexthop has invalid gateway");
 				return -EINVAL;
 			}
 			if (!netif_carrier_ok(dev))
-				nh->nh_flags |= RTNH_F_LINKDOWN;
-			nh->nh_dev = dev;
+				nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+			nh->fib_nh_dev = dev;
 			dev_hold(dev);
-			nh->nh_scope = RT_SCOPE_LINK;
+			nh->fib_nh_scope = RT_SCOPE_LINK;
 			return 0;
 		}
 		rcu_read_lock();
 		{
 			struct fib_table *tbl = NULL;
 			struct flowi4 fl4 = {
-				.daddr = nh->nh_gw,
+				.daddr = nh->fib_nh_gw4,
 				.flowi4_scope = cfg->fc_scope + 1,
-				.flowi4_oif = nh->nh_oif,
+				.flowi4_oif = nh->fib_nh_oif,
 				.flowi4_iif = LOOPBACK_IFINDEX,
 			};
 
@@ -917,9 +918,9 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
 			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
 			goto out;
 		}
-		nh->nh_scope = res.scope;
-		nh->nh_oif = FIB_RES_OIF(res);
-		nh->nh_dev = dev = FIB_RES_DEV(res);
+		nh->fib_nh_scope = res.scope;
+		nh->fib_nh_oif = FIB_RES_OIF(res);
+		nh->fib_nh_dev = dev = FIB_RES_DEV(res);
 		if (!dev) {
 			NL_SET_ERR_MSG(extack,
 				       "No egress device for nexthop gateway");
@@ -927,19 +928,19 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
 		}
 		dev_hold(dev);
 		if (!netif_carrier_ok(dev))
-			nh->nh_flags |= RTNH_F_LINKDOWN;
+			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
 		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
 	} else {
 		struct in_device *in_dev;
 
-		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
+		if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
 			NL_SET_ERR_MSG(extack,
 				       "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
 			return -EINVAL;
 		}
 		rcu_read_lock();
 		err = -ENODEV;
-		in_dev = inetdev_by_index(net, nh->nh_oif);
+		in_dev = inetdev_by_index(net, nh->fib_nh_oif);
 		if (!in_dev)
 			goto out;
 		err = -ENETDOWN;
@@ -947,11 +948,11 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
 			NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
 			goto out;
 		}
-		nh->nh_dev = in_dev->dev;
-		dev_hold(nh->nh_dev);
-		nh->nh_scope = RT_SCOPE_HOST;
-		if (!netif_carrier_ok(nh->nh_dev))
-			nh->nh_flags |= RTNH_F_LINKDOWN;
+		nh->fib_nh_dev = in_dev->dev;
+		dev_hold(nh->fib_nh_dev);
+		nh->fib_nh_scope = RT_SCOPE_HOST;
+		if (!netif_carrier_ok(nh->fib_nh_dev))
+			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
 		err = 0;
 	}
 out:
@@ -1043,8 +1044,8 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
 
 __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
 {
-	nh->nh_saddr = inet_select_addr(nh->nh_dev,
-					nh->nh_gw,
+	nh->nh_saddr = inet_select_addr(nh->fib_nh_dev,
+					nh->fib_nh_gw4,
 					nh->nh_parent->fib_scope);
 	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
 
@@ -1198,15 +1199,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 				       "Route with host scope can not have multiple nexthops");
 			goto err_inval;
 		}
-		if (nh->nh_gw) {
+		if (nh->fib_nh_gw4) {
 			NL_SET_ERR_MSG(extack,
 				       "Route with host scope can not have a gateway");
 			goto err_inval;
 		}
-		nh->nh_scope = RT_SCOPE_NOWHERE;
-		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
+		nh->fib_nh_scope = RT_SCOPE_NOWHERE;
+		nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif);
 		err = -ENODEV;
-		if (!nh->nh_dev)
+		if (!nh->fib_nh_dev)
 			goto failure;
 	} else {
 		int linkdown = 0;
@@ -1215,7 +1216,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 			err = fib_check_nh(cfg, nexthop_nh, extack);
 			if (err != 0)
 				goto failure;
-			if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
+			if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
 				linkdown++;
 		} endfor_nexthops(fi)
 		if (linkdown == fi->fib_nhs)
@@ -1257,9 +1258,9 @@ link_it:
 		struct hlist_head *head;
 		unsigned int hash;
 
-		if (!nexthop_nh->nh_dev)
+		if (!nexthop_nh->fib_nh_dev)
 			continue;
-		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
+		hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
 		head = &fib_info_devhash[hash];
 		hlist_add_head(&nexthop_nh->nh_hash, head);
 	} endfor_nexthops(fi)
@@ -1318,27 +1319,27 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
 		goto nla_put_failure;
 	if (fi->fib_nhs == 1) {
-		if (fi->fib_nh->nh_gw &&
-		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
+		if (fi->fib_nh->fib_nh_gw4 &&
+		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->fib_nh_gw4))
 			goto nla_put_failure;
-		if (fi->fib_nh->nh_oif &&
-		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
+		if (fi->fib_nh->fib_nh_oif &&
+		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->fib_nh_oif))
 			goto nla_put_failure;
-		if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
+		if (fi->fib_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
 			rcu_read_lock();
-			if (ip_ignore_linkdown(fi->fib_nh->nh_dev))
+			if (ip_ignore_linkdown(fi->fib_nh->fib_nh_dev))
 				rtm->rtm_flags |= RTNH_F_DEAD;
 			rcu_read_unlock();
 		}
-		if (fi->fib_nh->nh_flags & RTNH_F_OFFLOAD)
+		if (fi->fib_nh->fib_nh_flags & RTNH_F_OFFLOAD)
 			rtm->rtm_flags |= RTNH_F_OFFLOAD;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		if (fi->fib_nh[0].nh_tclassid &&
 		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
 			goto nla_put_failure;
 #endif
-		if (fi->fib_nh->nh_lwtstate &&
-		    lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
+		if (fi->fib_nh->fib_nh_lws &&
+		    lwtunnel_fill_encap(skb, fi->fib_nh->fib_nh_lws) < 0)
 			goto nla_put_failure;
 	}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1355,26 +1356,26 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			if (!rtnh)
 				goto nla_put_failure;
 
-			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
-			if (nh->nh_flags & RTNH_F_LINKDOWN) {
+			rtnh->rtnh_flags = nh->fib_nh_flags & 0xFF;
+			if (nh->fib_nh_flags & RTNH_F_LINKDOWN) {
 				rcu_read_lock();
-				if (ip_ignore_linkdown(nh->nh_dev))
+				if (ip_ignore_linkdown(nh->fib_nh_dev))
 					rtnh->rtnh_flags |= RTNH_F_DEAD;
 				rcu_read_unlock();
 			}
-			rtnh->rtnh_hops = nh->nh_weight - 1;
-			rtnh->rtnh_ifindex = nh->nh_oif;
+			rtnh->rtnh_hops = nh->fib_nh_weight - 1;
+			rtnh->rtnh_ifindex = nh->fib_nh_oif;
 
-			if (nh->nh_gw &&
-			    nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
+			if (nh->fib_nh_gw4 &&
+			    nla_put_in_addr(skb, RTA_GATEWAY, nh->fib_nh_gw4))
 				goto nla_put_failure;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 			if (nh->nh_tclassid &&
 			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
 				goto nla_put_failure;
 #endif
-			if (nh->nh_lwtstate &&
-			    lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
+			if (nh->fib_nh_lws &&
+			    lwtunnel_fill_encap(skb, nh->fib_nh_lws) < 0)
 				goto nla_put_failure;
 
 			/* length of rtnetlink header + attributes */
@@ -1422,26 +1423,26 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
 	return ret;
 }
 
-static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
+static int call_fib_nh_notifiers(struct fib_nh *nh,
 				 enum fib_event_type event_type)
 {
-	bool ignore_link_down = ip_ignore_linkdown(fib_nh->nh_dev);
+	bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev);
 	struct fib_nh_notifier_info info = {
-		.fib_nh = fib_nh,
+		.fib_nh = nh,
 	};
 
 	switch (event_type) {
 	case FIB_EVENT_NH_ADD:
-		if (fib_nh->nh_flags & RTNH_F_DEAD)
+		if (nh->fib_nh_flags & RTNH_F_DEAD)
 			break;
-		if (ignore_link_down && fib_nh->nh_flags & RTNH_F_LINKDOWN)
+		if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN)
 			break;
-		return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type,
+		return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type,
 					   &info.info);
 	case FIB_EVENT_NH_DEL:
-		if ((ignore_link_down && fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
-		    (fib_nh->nh_flags & RTNH_F_DEAD))
-			return call_fib4_notifiers(dev_net(fib_nh->nh_dev),
+		if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) ||
+		    (nh->fib_nh_flags & RTNH_F_DEAD))
+			return call_fib4_notifiers(dev_net(nh->fib_nh_dev),
 						   event_type, &info.info);
 	default:
 		break;
@@ -1495,7 +1496,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
 	struct fib_nh *nh;
 
 	hlist_for_each_entry(nh, head, nh_hash) {
-		if (nh->nh_dev == dev)
+		if (nh->fib_nh_dev == dev)
 			nh_update_mtu(nh, dev->mtu, orig_mtu);
 	}
 }
@@ -1523,22 +1524,22 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 		int dead;
 
 		BUG_ON(!fi->fib_nhs);
-		if (nh->nh_dev != dev || fi == prev_fi)
+		if (nh->fib_nh_dev != dev || fi == prev_fi)
 			continue;
 		prev_fi = fi;
 		dead = 0;
 		change_nexthops(fi) {
-			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
+			if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD)
 				dead++;
-			else if (nexthop_nh->nh_dev == dev &&
-				 nexthop_nh->nh_scope != scope) {
+			else if (nexthop_nh->fib_nh_dev == dev &&
+				 nexthop_nh->fib_nh_scope != scope) {
 				switch (event) {
 				case NETDEV_DOWN:
 				case NETDEV_UNREGISTER:
-					nexthop_nh->nh_flags |= RTNH_F_DEAD;
+					nexthop_nh->fib_nh_flags |= RTNH_F_DEAD;
 					/* fall through */
 				case NETDEV_CHANGE:
-					nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
+					nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
 					break;
 				}
 				call_fib_nh_notifiers(nexthop_nh,
@@ -1547,7 +1548,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 			}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 			if (event == NETDEV_UNREGISTER &&
-			    nexthop_nh->nh_dev == dev) {
+			    nexthop_nh->fib_nh_dev == dev) {
 				dead = fi->fib_nhs;
 				break;
 			}
@@ -1607,8 +1608,8 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 		if (next_fi->fib_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-		if (!next_fi->fib_nh[0].nh_gw ||
-		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+		if (!next_fi->fib_nh[0].fib_nh_gw4 ||
+		    next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK)
 			continue;
 
 		fib_alias_accessed(fa);
@@ -1679,24 +1680,24 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
 		int alive;
 
 		BUG_ON(!fi->fib_nhs);
-		if (nh->nh_dev != dev || fi == prev_fi)
+		if (nh->fib_nh_dev != dev || fi == prev_fi)
 			continue;
 
 		prev_fi = fi;
 		alive = 0;
 		change_nexthops(fi) {
-			if (!(nexthop_nh->nh_flags & nh_flags)) {
+			if (!(nexthop_nh->fib_nh_flags & nh_flags)) {
 				alive++;
 				continue;
 			}
-			if (!nexthop_nh->nh_dev ||
-			    !(nexthop_nh->nh_dev->flags & IFF_UP))
+			if (!nexthop_nh->fib_nh_dev ||
+			    !(nexthop_nh->fib_nh_dev->flags & IFF_UP))
 				continue;
-			if (nexthop_nh->nh_dev != dev ||
+			if (nexthop_nh->fib_nh_dev != dev ||
 			    !__in_dev_get_rtnl(dev))
 				continue;
 			alive++;
-			nexthop_nh->nh_flags &= ~nh_flags;
+			nexthop_nh->fib_nh_flags &= ~nh_flags;
 			call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
 		} endfor_nexthops(fi)
 
@@ -1716,13 +1717,13 @@ static bool fib_good_nh(const struct fib_nh *nh)
 {
 	int state = NUD_REACHABLE;
 
-	if (nh->nh_scope == RT_SCOPE_LINK) {
+	if (nh->fib_nh_scope == RT_SCOPE_LINK) {
 		struct neighbour *n;
 
 		rcu_read_lock_bh();
 
-		n = __ipv4_neigh_lookup_noref(nh->nh_dev,
-					      (__force u32)nh->nh_gw);
+		n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+					      (__force u32)nh->fib_nh_gw4);
 		if (n)
 			state = n->nud_state;
 
@@ -1748,7 +1749,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
 			}
 		}
 
-		if (hash > atomic_read(&nh->nh_upper_bound))
+		if (hash > atomic_read(&nh->fib_nh_upper_bound))
 			continue;
 
 		res->nh_sel = nhsel;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 656d3d19f112..1e3b492690f9 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1472,15 +1472,15 @@ found:
 		for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
 			const struct fib_nh *nh = &fi->fib_nh[nhsel];
 
-			if (nh->nh_flags & RTNH_F_DEAD)
+			if (nh->fib_nh_flags & RTNH_F_DEAD)
 				continue;
-			if (ip_ignore_linkdown(nh->nh_dev) &&
-			    nh->nh_flags & RTNH_F_LINKDOWN &&
+			if (ip_ignore_linkdown(nh->fib_nh_dev) &&
+			    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
 			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
 				continue;
 			if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
 				if (flp->flowi4_oif &&
-				    flp->flowi4_oif != nh->nh_oif)
+				    flp->flowi4_oif != nh->fib_nh_oif)
 					continue;
 			}
 
@@ -2651,7 +2651,7 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info
 
 	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
 		flags = RTF_REJECT;
-	if (fi && fi->fib_nh->nh_gw)
+	if (fi && fi->fib_nh->fib_nh_gw4)
 		flags |= RTF_GATEWAY;
 	if (mask == htonl(0xFFFFFFFF))
 		flags |= RTF_HOST;
@@ -2702,7 +2702,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 				   "%d\t%08X\t%d\t%u\t%u",
 				   fi->fib_dev ? fi->fib_dev->name : "*",
 				   prefix,
-				   fi->fib_nh->nh_gw, flags, 0, 0,
+				   fi->fib_nh->fib_nh_gw4, flags, 0, 0,
 				   fi->fib_priority,
 				   mask,
 				   (fi->fib_advmss ?
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f2688fce39e1..7977514d90f5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -644,7 +644,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 	unsigned int i;
 	int depth;
 
-	genid = fnhe_genid(dev_net(nh->nh_dev));
+	genid = fnhe_genid(dev_net(nh->fib_nh_dev));
 	hval = fnhe_hashfun(daddr);
 
 	spin_lock_bh(&fnhe_lock);
@@ -1356,7 +1356,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
 {
 	struct fib_info *fi = res->fi;
 	struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
-	struct net_device *dev = nh->nh_dev;
+	struct net_device *dev = nh->fib_nh_dev;
 	u32 mtu = 0;
 
 	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
@@ -1374,7 +1374,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
 	if (likely(!mtu))
 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
 
-	return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
+	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
 }
 
 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
@@ -1531,8 +1531,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 	if (fi) {
 		struct fib_nh *nh = &FIB_RES_NH(*res);
 
-		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
-			rt->rt_gateway = nh->nh_gw;
+		if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) {
+			rt->rt_gateway = nh->fib_nh_gw4;
 			rt->rt_uses_gateway = 1;
 		}
 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
@@ -1540,7 +1540,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
-		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+		rt->dst.lwtstate = lwtstate_get(nh->fib_nh_lws);
 		if (unlikely(fnhe))
 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
 		else if (do_cache)
@@ -2075,7 +2075,7 @@ local_input:
 	if (do_cache) {
 		struct fib_nh *nh = &FIB_RES_NH(*res);
 
-		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+		rth->dst.lwtstate = lwtstate_get(nh->fib_nh_lws);
 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
 			WARN_ON(rth->dst.input == lwtunnel_input);
 			rth->dst.lwtstate->orig_input = rth->dst.input;
@@ -2264,8 +2264,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 		} else {
 			if (unlikely(fl4->flowi4_flags &
 				     FLOWI_FLAG_KNOWN_NH &&
-				     !(nh->nh_gw &&
-				       nh->nh_scope == RT_SCOPE_LINK))) {
+				     !(nh->fib_nh_gw4 &&
+				       nh->fib_nh_scope == RT_SCOPE_LINK))) {
 				do_cache = false;
 				goto add;
 			}
-- 
cgit 


From f1741730dd18828fe3ea5fa91c22f41cf001c625 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:57 -0700
Subject: net: Add fib_nh_common and update fib_nh and fib6_nh

Add fib_nh_common struct with common nexthop attributes. Convert
fib_nh and fib6_nh to use it. Use macros to move existing
fib_nh_* references to the new nh_common.nhc_*.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c1e16b52338b..e9992407863e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -468,6 +468,8 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 {
 	int err = -ENOMEM;
 
+	nh->fib_nh_family = AF_INET;
+
 	nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
 	if (!nh->nh_pcpu_rth_output)
 		goto err_out;
@@ -490,7 +492,10 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 	}
 
 	nh->fib_nh_oif = cfg->fc_oif;
-	nh->fib_nh_gw4 = cfg->fc_gw;
+	if (cfg->fc_gw) {
+		nh->fib_nh_gw4 = cfg->fc_gw;
+		nh->fib_nh_has_gw = 1;
+	}
 	nh->fib_nh_flags = cfg->fc_flags;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
-- 
cgit 


From 979e276ebebd537782797c439c9cb42b6d3aba27 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:58 -0700
Subject: net: Use common nexthop init and release helpers

With fib_nh_common in place, move common initialization and release
code into helpers used by both ipv4 and ipv6. For the moment, the init
is just the lwt encap and the release is both the netdev reference and
the the lwt state reference. More will be added later.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 60 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 21 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e9992407863e..df777af7e278 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -204,16 +204,22 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
 	free_percpu(rtp);
 }
 
+void fib_nh_common_release(struct fib_nh_common *nhc)
+{
+	if (nhc->nhc_dev)
+		dev_put(nhc->nhc_dev);
+
+	lwtstate_put(nhc->nhc_lwtstate);
+}
+EXPORT_SYMBOL_GPL(fib_nh_common_release);
+
 void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
 {
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	if (fib_nh->nh_tclassid)
 		net->ipv4.fib_num_tclassid_users--;
 #endif
-	if (fib_nh->fib_nh_dev)
-		dev_put(fib_nh->fib_nh_dev);
-
-	lwtstate_put(fib_nh->fib_nh_lws);
+	fib_nh_common_release(&fib_nh->nh_common);
 	free_nh_exceptions(fib_nh);
 	rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output);
 	rt_fibinfo_free(&fib_nh->nh_rth_input);
@@ -462,6 +468,30 @@ static int fib_detect_death(struct fib_info *fi, int order,
 	return 1;
 }
 
+int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap,
+		       u16 encap_type, void *cfg, gfp_t gfp_flags,
+		       struct netlink_ext_ack *extack)
+{
+	if (encap) {
+		struct lwtunnel_state *lwtstate;
+		int err;
+
+		if (encap_type == LWTUNNEL_ENCAP_NONE) {
+			NL_SET_ERR_MSG(extack, "LWT encap type not specified");
+			return -EINVAL;
+		}
+		err = lwtunnel_build_state(encap_type, encap, nhc->nhc_family,
+					   cfg, &lwtstate, extack);
+		if (err)
+			return err;
+
+		nhc->nhc_lwtstate = lwtstate_get(lwtstate);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fib_nh_common_init);
+
 int fib_nh_init(struct net *net, struct fib_nh *nh,
 		struct fib_config *cfg, int nh_weight,
 		struct netlink_ext_ack *extack)
@@ -474,22 +504,10 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 	if (!nh->nh_pcpu_rth_output)
 		goto err_out;
 
-	if (cfg->fc_encap) {
-		struct lwtunnel_state *lwtstate;
-
-		err = -EINVAL;
-		if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) {
-			NL_SET_ERR_MSG(extack, "LWT encap type not specified");
-			goto lwt_failure;
-		}
-		err = lwtunnel_build_state(cfg->fc_encap_type,
-					   cfg->fc_encap, AF_INET, cfg,
-					   &lwtstate, extack);
-		if (err)
-			goto lwt_failure;
-
-		nh->fib_nh_lws = lwtstate_get(lwtstate);
-	}
+	err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap,
+				 cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
+	if (err)
+		goto init_failure;
 
 	nh->fib_nh_oif = cfg->fc_oif;
 	if (cfg->fc_gw) {
@@ -508,7 +526,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 #endif
 	return 0;
 
-lwt_failure:
+init_failure:
 	rt_fibinfo_free_cpus(nh->nh_pcpu_rth_output);
 	nh->nh_pcpu_rth_output = NULL;
 err_out:
-- 
cgit 


From eb70a1ae2339769156f8ecddd7f6cd59ac994888 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Mar 2019 12:46:17 -0700
Subject: tcp: cleanup sk_tx_skb_cache before reuse

TCP stack relies on the fact that a freshly allocated skb
has skb->cb[] and skb_shinfo(skb)->tx_flags cleared.

When recycling tx skb, we must ensure these fields are cleared.

Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 82bd707c0347..603e770d59b3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -872,6 +872,8 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 			sk->sk_tx_skb_cache = NULL;
 			pskb_trim(skb, 0);
 			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+			skb_shinfo(skb)->tx_flags = 0;
+			memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
 			return skb;
 		}
 	}
-- 
cgit 


From 5869b8fadad0be948e310c456f111c4103f5fbfb Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 31 Mar 2019 17:03:02 +0800
Subject: net: use rcu_dereference_protected to fetch sk_dst_cache in
 sk_destruct

As Eric noticed, in .sk_destruct, sk->sk_dst_cache update is prevented, and
no barrier is needed for this. So change to use rcu_dereference_protected()
instead of rcu_dereference_check() to fetch sk_dst_cache in there.

v1->v2:
  - no change, repost after net-next is open.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7f3a984ad618..08a8430f5647 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -160,7 +160,7 @@ void inet_sock_destruct(struct sock *sk)
 	WARN_ON(sk->sk_forward_alloc);
 
 	kfree(rcu_dereference_protected(inet->inet_opt, 1));
-	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
 	dst_release(sk->sk_rx_dst);
 	sk_refcnt_debug_dec(sk);
 }
-- 
cgit 


From 0af7e7c128eb33f2dc16ed088ced00675785d628 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:54 -0700
Subject: ipv4: Update fib_table_lookup tracepoint to take common nexthop

Update fib_table_lookup tracepoint to take a fib_nh_common struct and
dump the v6 gateway address if the nexthop uses it.

Over the years saddr has not proven useful and the output of the
tracepoint produces very long lines. Since saddr is not part of
fib_nh_common, drop it. If it needs to be added later, fib_nh which
contains saddr can be obtained from a fib_nh_common via container_of.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1e3b492690f9..13b3327206f9 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1498,7 +1498,7 @@ found:
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->semantic_match_passed);
 #endif
-			trace_fib_table_lookup(tb->tb_id, flp, nh, err);
+			trace_fib_table_lookup(tb->tb_id, flp, &nh->nh_common, err);
 
 			return err;
 		}
-- 
cgit 


From eba618abacade71669eb67c3360eecfee810cc88 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:55 -0700
Subject: ipv4: Add fib_nh_common to fib_result

Most of the ipv4 code only needs data from fib_nh_common. Add
fib_nh_common selection to fib_result and update users to use it.

Right now, fib_nh_common in fib_result will point to a fib_nh struct
that is embedded within a fib_info:

        fib_info  --> fib_nh
                      fib_nh
                      ...
                      fib_nh
                        ^
    fib_result->nhc ----+

Later, nhc can point to a fib_nh within a nexthop struct:

        fib_info --> nexthop --> fib_nh
                                   ^
    fib_result->nhc ---------------+

or for a nexthop group:

        fib_info --> nexthop --> nexthop --> fib_nh
                                 nexthop --> fib_nh
                                 ...
                                 nexthop --> fib_nh
                                               ^
    fib_result->nhc ---------------------------+

In all cases nhsel within fib_result will point to which leg in the
multipath route is used.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c  |  6 ++---
 net/ipv4/fib_lookup.h    |  1 +
 net/ipv4/fib_semantics.c | 25 ++++++++++++++++----
 net/ipv4/fib_trie.c      | 13 ++++++-----
 net/ipv4/route.c         | 60 ++++++++++++++++++++++++++++++++----------------
 5 files changed, 72 insertions(+), 33 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ffbe24397dbe..15f779bd26b3 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -307,7 +307,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 			.flowi4_mark = vmark ? skb->mark : 0,
 		};
 		if (!fib_lookup(net, &fl4, &res, 0))
-			return FIB_RES_PREFSRC(net, res);
+			return fib_result_prefsrc(net, &res);
 	} else {
 		scope = RT_SCOPE_LINK;
 	}
@@ -390,7 +390,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 
 	dev_match = fib_info_nh_uses_dev(res.fi, dev);
 	if (dev_match) {
-		ret = FIB_RES_NH(res).fib_nh_scope >= RT_SCOPE_HOST;
+		ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 		return ret;
 	}
 	if (no_addr)
@@ -402,7 +402,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	ret = 0;
 	if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
 		if (res.type == RTN_UNICAST)
-			ret = FIB_RES_NH(res).fib_nh_scope >= RT_SCOPE_HOST;
+			ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 	}
 	return ret;
 
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index e6ff282bb7f4..7945f0534db7 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -45,6 +45,7 @@ static inline void fib_result_assign(struct fib_result *res,
 {
 	/* we used to play games with refcounts, but we now use RCU */
 	res->fi = fi;
+	res->nhc = fib_info_nhc(fi, 0);
 }
 
 struct fib_prop {
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index df777af7e278..42666a409da0 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1075,6 +1075,21 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
 	return nh->nh_saddr;
 }
 
+__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
+{
+	struct fib_nh_common *nhc = res->nhc;
+	struct fib_nh *nh;
+
+	if (res->fi->fib_prefsrc)
+		return res->fi->fib_prefsrc;
+
+	nh = container_of(nhc, struct fib_nh, nh_common);
+	if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid))
+		return nh->nh_saddr;
+
+	return fib_info_update_nh_saddr(net, nh);
+}
+
 static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
 {
 	if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
@@ -1762,20 +1777,22 @@ void fib_select_multipath(struct fib_result *res, int hash)
 	struct net *net = fi->fib_net;
 	bool first = false;
 
-	for_nexthops(fi) {
+	change_nexthops(fi) {
 		if (net->ipv4.sysctl_fib_multipath_use_neigh) {
-			if (!fib_good_nh(nh))
+			if (!fib_good_nh(nexthop_nh))
 				continue;
 			if (!first) {
 				res->nh_sel = nhsel;
+				res->nhc = &nexthop_nh->nh_common;
 				first = true;
 			}
 		}
 
-		if (hash > atomic_read(&nh->fib_nh_upper_bound))
+		if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound))
 			continue;
 
 		res->nh_sel = nhsel;
+		res->nhc = &nexthop_nh->nh_common;
 		return;
 	} endfor_nexthops(fi);
 }
@@ -1802,5 +1819,5 @@ void fib_select_path(struct net *net, struct fib_result *res,
 
 check_saddr:
 	if (!fl4->saddr)
-		fl4->saddr = FIB_RES_PREFSRC(net, *res);
+		fl4->saddr = fib_result_prefsrc(net, res);
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 13b3327206f9..334f723bdf80 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1470,17 +1470,17 @@ found:
 		if (fi->fib_flags & RTNH_F_DEAD)
 			continue;
 		for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
-			const struct fib_nh *nh = &fi->fib_nh[nhsel];
+			struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
 
-			if (nh->fib_nh_flags & RTNH_F_DEAD)
+			if (nhc->nhc_flags & RTNH_F_DEAD)
 				continue;
-			if (ip_ignore_linkdown(nh->fib_nh_dev) &&
-			    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
+			if (ip_ignore_linkdown(nhc->nhc_dev) &&
+			    nhc->nhc_flags & RTNH_F_LINKDOWN &&
 			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
 				continue;
 			if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
 				if (flp->flowi4_oif &&
-				    flp->flowi4_oif != nh->fib_nh_oif)
+				    flp->flowi4_oif != nhc->nhc_oif)
 					continue;
 			}
 
@@ -1490,6 +1490,7 @@ found:
 			res->prefix = htonl(n->key);
 			res->prefixlen = KEYLENGTH - fa->fa_slen;
 			res->nh_sel = nhsel;
+			res->nhc = nhc;
 			res->type = fa->fa_type;
 			res->scope = fi->fib_scope;
 			res->fi = fi;
@@ -1498,7 +1499,7 @@ found:
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->semantic_match_passed);
 #endif
-			trace_fib_table_lookup(tb->tb_id, flp, &nh->nh_common, err);
+			trace_fib_table_lookup(tb->tb_id, flp, nhc, err);
 
 			return err;
 		}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7977514d90f5..f3f2adf630d4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -778,8 +778,10 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 			neigh_event_send(n, NULL);
 		} else {
 			if (fib_lookup(net, fl4, &res, 0) == 0) {
-				struct fib_nh *nh = &FIB_RES_NH(res);
+				struct fib_nh_common *nhc = FIB_RES_NHC(res);
+				struct fib_nh *nh;
 
+				nh = container_of(nhc, struct fib_nh, nh_common);
 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
 						0, false,
 						jiffies + ip_rt_gc_timeout);
@@ -1027,8 +1029,10 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 
 	rcu_read_lock();
 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
-		struct fib_nh *nh = &FIB_RES_NH(res);
+		struct fib_nh_common *nhc = FIB_RES_NHC(res);
+		struct fib_nh *nh;
 
+		nh = container_of(nhc, struct fib_nh, nh_common);
 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
 				      jiffies + ip_rt_mtu_expires);
 	}
@@ -1235,7 +1239,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
 
 		rcu_read_lock();
 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
-			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
+			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
 		else
 			src = inet_select_addr(rt->dst.dev,
 					       rt_nexthop(rt, iph->daddr),
@@ -1354,9 +1358,9 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 
 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
 {
+	struct fib_nh_common *nhc = res->nhc;
+	struct net_device *dev = nhc->nhc_dev;
 	struct fib_info *fi = res->fi;
-	struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
-	struct net_device *dev = nh->fib_nh_dev;
 	u32 mtu = 0;
 
 	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
@@ -1364,6 +1368,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
 		mtu = fi->fib_mtu;
 
 	if (likely(!mtu)) {
+		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
 		struct fib_nh_exception *fnhe;
 
 		fnhe = find_exception(nh, daddr);
@@ -1374,7 +1379,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
 	if (likely(!mtu))
 		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
 
-	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
+	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
 }
 
 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
@@ -1529,7 +1534,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 	bool cached = false;
 
 	if (fi) {
-		struct fib_nh *nh = &FIB_RES_NH(*res);
+		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
 
 		if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) {
 			rt->rt_gateway = nh->fib_nh_gw4;
@@ -1699,15 +1705,18 @@ static int __mkroute_input(struct sk_buff *skb,
 			   struct in_device *in_dev,
 			   __be32 daddr, __be32 saddr, u32 tos)
 {
+	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+	struct net_device *dev = nhc->nhc_dev;
 	struct fib_nh_exception *fnhe;
 	struct rtable *rth;
+	struct fib_nh *nh;
 	int err;
 	struct in_device *out_dev;
 	bool do_cache;
 	u32 itag = 0;
 
 	/* get a working reference to the output device */
-	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
+	out_dev = __in_dev_get_rcu(dev);
 	if (!out_dev) {
 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
 		return -EINVAL;
@@ -1724,10 +1733,13 @@ static int __mkroute_input(struct sk_buff *skb,
 
 	do_cache = res->fi && !itag;
 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
-	    skb->protocol == htons(ETH_P_IP) &&
-	    (IN_DEV_SHARED_MEDIA(out_dev) ||
-	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
-		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+	    skb->protocol == htons(ETH_P_IP)) {
+		__be32 gw = nhc->nhc_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
+
+		if (IN_DEV_SHARED_MEDIA(out_dev) ||
+		    inet_addr_onlink(out_dev, saddr, gw))
+			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+	}
 
 	if (skb->protocol != htons(ETH_P_IP)) {
 		/* Not IP (i.e. ARP). Do not create route, if it is
@@ -1744,12 +1756,13 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
-	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+	nh = container_of(nhc, struct fib_nh, nh_common);
+	fnhe = find_exception(nh, daddr);
 	if (do_cache) {
 		if (fnhe)
 			rth = rcu_dereference(fnhe->fnhe_rth_input);
 		else
-			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+			rth = rcu_dereference(nh->nh_rth_input);
 		if (rt_cache_valid(rth)) {
 			skb_dst_set_noref(skb, &rth->dst);
 			goto out;
@@ -2043,7 +2056,11 @@ local_input:
 	do_cache = false;
 	if (res->fi) {
 		if (!itag) {
-			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+			struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+			struct fib_nh *nh;
+
+			nh = container_of(nhc, struct fib_nh, nh_common);
+			rth = rcu_dereference(nh->nh_rth_input);
 			if (rt_cache_valid(rth)) {
 				skb_dst_set_noref(skb, &rth->dst);
 				err = 0;
@@ -2073,15 +2090,17 @@ local_input:
 	}
 
 	if (do_cache) {
-		struct fib_nh *nh = &FIB_RES_NH(*res);
+		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+		struct fib_nh *nh;
 
-		rth->dst.lwtstate = lwtstate_get(nh->fib_nh_lws);
+		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
 			WARN_ON(rth->dst.input == lwtunnel_input);
 			rth->dst.lwtstate->orig_input = rth->dst.input;
 			rth->dst.input = lwtunnel_input;
 		}
 
+		nh = container_of(nhc, struct fib_nh, nh_common);
 		if (unlikely(!rt_cache_route(nh, rth)))
 			rt_add_uncached_list(rth);
 	}
@@ -2253,8 +2272,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	fnhe = NULL;
 	do_cache &= fi != NULL;
 	if (fi) {
+		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
 		struct rtable __rcu **prth;
-		struct fib_nh *nh = &FIB_RES_NH(*res);
 
 		fnhe = find_exception(nh, fl4->daddr);
 		if (!do_cache)
@@ -2264,8 +2284,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 		} else {
 			if (unlikely(fl4->flowi4_flags &
 				     FLOWI_FLAG_KNOWN_NH &&
-				     !(nh->fib_nh_gw4 &&
-				       nh->fib_nh_scope == RT_SCOPE_LINK))) {
+				     !(nhc->nhc_has_gw &&
+				       nhc->nhc_scope == RT_SCOPE_LINK))) {
 				do_cache = false;
 				goto add;
 			}
-- 
cgit 


From b0f60193632e4eab4c9663101bb435dd7bc27ae8 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:56 -0700
Subject: ipv4: Refactor nexthop attributes in fib_dump_info

Similar to ipv6, move addition of nexthop attributes to dump
message into helpers that are called for both single path and
multipath routes. Align the new helpers to the IPv6 variant
which most notably means computing the flags argument based on
settings in nh_flags.

The RTA_FLOW argument is unique to IPv4, so it is appended after
the new fib_nexthop_info helper. The intent of a later patch is to
make both fib_nexthop_info and fib_add_nexthop usable for both IPv4
and IPv6. This patch is stepping stone in that direction.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 166 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 107 insertions(+), 59 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 42666a409da0..32fb0123d881 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1317,6 +1317,103 @@ failure:
 	return ERR_PTR(err);
 }
 
+static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh *nh,
+			    unsigned int *flags, bool skip_oif)
+{
+	if (nh->fib_nh_flags & RTNH_F_DEAD)
+		*flags |= RTNH_F_DEAD;
+
+	if (nh->fib_nh_flags & RTNH_F_LINKDOWN) {
+		*flags |= RTNH_F_LINKDOWN;
+
+		rcu_read_lock();
+		if (ip_ignore_linkdown(nh->fib_nh_dev))
+			*flags |= RTNH_F_DEAD;
+		rcu_read_unlock();
+	}
+
+	if (nh->fib_nh_gw4 &&
+	    nla_put_in_addr(skb, RTA_GATEWAY, nh->fib_nh_gw4))
+		goto nla_put_failure;
+
+	*flags |= (nh->fib_nh_flags & RTNH_F_ONLINK);
+	if (nh->fib_nh_flags & RTNH_F_OFFLOAD)
+		*flags |= RTNH_F_OFFLOAD;
+
+	if (!skip_oif && nh->fib_nh_dev &&
+	    nla_put_u32(skb, RTA_OIF, nh->fib_nh_dev->ifindex))
+		goto nla_put_failure;
+
+	if (nh->fib_nh_lws &&
+	    lwtunnel_fill_encap(skb, nh->fib_nh_lws) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh *nh)
+{
+	const struct net_device *dev = nh->fib_nh_dev;
+	struct rtnexthop *rtnh;
+	unsigned int flags = 0;
+
+	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+	if (!rtnh)
+		goto nla_put_failure;
+
+	rtnh->rtnh_hops = nh->fib_nh_weight - 1;
+	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
+
+	if (fib_nexthop_info(skb, nh, &flags, true) < 0)
+		goto nla_put_failure;
+
+	rtnh->rtnh_flags = flags;
+
+	/* length of rtnetlink header + attributes */
+	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
+{
+	struct nlattr *mp;
+
+	mp = nla_nest_start(skb, RTA_MULTIPATH);
+	if (!mp)
+		goto nla_put_failure;
+
+	for_nexthops(fi) {
+		if (fib_add_nexthop(skb, nh) < 0)
+			goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		if (nh->nh_tclassid &&
+		    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+			goto nla_put_failure;
+#endif
+	} endfor_nexthops(fi);
+
+	nla_nest_end(skb, mp);
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+#else
+static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
+{
+	return 0;
+}
+#endif
+
 int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
 		  struct fib_info *fi, unsigned int flags)
@@ -1357,72 +1454,23 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
 		goto nla_put_failure;
 	if (fi->fib_nhs == 1) {
-		if (fi->fib_nh->fib_nh_gw4 &&
-		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->fib_nh_gw4))
-			goto nla_put_failure;
-		if (fi->fib_nh->fib_nh_oif &&
-		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->fib_nh_oif))
+		struct fib_nh *nh = &fi->fib_nh[0];
+		unsigned int flags = 0;
+
+		if (fib_nexthop_info(skb, nh, &flags, false) < 0)
 			goto nla_put_failure;
-		if (fi->fib_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
-			rcu_read_lock();
-			if (ip_ignore_linkdown(fi->fib_nh->fib_nh_dev))
-				rtm->rtm_flags |= RTNH_F_DEAD;
-			rcu_read_unlock();
-		}
-		if (fi->fib_nh->fib_nh_flags & RTNH_F_OFFLOAD)
-			rtm->rtm_flags |= RTNH_F_OFFLOAD;
+
+		rtm->rtm_flags = flags;
 #ifdef CONFIG_IP_ROUTE_CLASSID
-		if (fi->fib_nh[0].nh_tclassid &&
-		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
+		if (nh->nh_tclassid &&
+		    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
 			goto nla_put_failure;
 #endif
-		if (fi->fib_nh->fib_nh_lws &&
-		    lwtunnel_fill_encap(skb, fi->fib_nh->fib_nh_lws) < 0)
+	} else {
+		if (fib_add_multipath(skb, fi) < 0)
 			goto nla_put_failure;
 	}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (fi->fib_nhs > 1) {
-		struct rtnexthop *rtnh;
-		struct nlattr *mp;
-
-		mp = nla_nest_start(skb, RTA_MULTIPATH);
-		if (!mp)
-			goto nla_put_failure;
 
-		for_nexthops(fi) {
-			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
-			if (!rtnh)
-				goto nla_put_failure;
-
-			rtnh->rtnh_flags = nh->fib_nh_flags & 0xFF;
-			if (nh->fib_nh_flags & RTNH_F_LINKDOWN) {
-				rcu_read_lock();
-				if (ip_ignore_linkdown(nh->fib_nh_dev))
-					rtnh->rtnh_flags |= RTNH_F_DEAD;
-				rcu_read_unlock();
-			}
-			rtnh->rtnh_hops = nh->fib_nh_weight - 1;
-			rtnh->rtnh_ifindex = nh->fib_nh_oif;
-
-			if (nh->fib_nh_gw4 &&
-			    nla_put_in_addr(skb, RTA_GATEWAY, nh->fib_nh_gw4))
-				goto nla_put_failure;
-#ifdef CONFIG_IP_ROUTE_CLASSID
-			if (nh->nh_tclassid &&
-			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
-				goto nla_put_failure;
-#endif
-			if (nh->fib_nh_lws &&
-			    lwtunnel_fill_encap(skb, nh->fib_nh_lws) < 0)
-				goto nla_put_failure;
-
-			/* length of rtnetlink header + attributes */
-			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
-		} endfor_nexthops(fi);
-
-		nla_nest_end(skb, mp);
-	}
-#endif
 	nlmsg_end(skb, nlh);
 	return 0;
 
-- 
cgit 


From c236419981224d37a5d0a6e7781f73479d4a030e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:57 -0700
Subject: ipv4: Change fib_nexthop_info and fib_add_nexthop to take
 fib_nh_common

With the exception of the nexthop weight, the nexthop attributes used by
fib_nexthop_info and fib_add_nexthop come from the fib_nh_common struct.
Update both to use it and change fib_nexthop_info to check the family
as needed.

nexthop weight comes from the common struct for existing use cases, but
for nexthop groups the weight is outside of the fib_nh_common to allow
the same nexthop definition to be used in multiple groups with different
weights.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 52 +++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 32fb0123d881..33a43965a232 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1317,35 +1317,44 @@ failure:
 	return ERR_PTR(err);
 }
 
-static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh *nh,
+static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
 			    unsigned int *flags, bool skip_oif)
 {
-	if (nh->fib_nh_flags & RTNH_F_DEAD)
+	if (nhc->nhc_flags & RTNH_F_DEAD)
 		*flags |= RTNH_F_DEAD;
 
-	if (nh->fib_nh_flags & RTNH_F_LINKDOWN) {
+	if (nhc->nhc_flags & RTNH_F_LINKDOWN) {
 		*flags |= RTNH_F_LINKDOWN;
 
 		rcu_read_lock();
-		if (ip_ignore_linkdown(nh->fib_nh_dev))
-			*flags |= RTNH_F_DEAD;
+		switch (nhc->nhc_family) {
+		case AF_INET:
+			if (ip_ignore_linkdown(nhc->nhc_dev))
+				*flags |= RTNH_F_DEAD;
+			break;
+		}
 		rcu_read_unlock();
 	}
 
-	if (nh->fib_nh_gw4 &&
-	    nla_put_in_addr(skb, RTA_GATEWAY, nh->fib_nh_gw4))
-		goto nla_put_failure;
+	if (nhc->nhc_has_gw) {
+		switch (nhc->nhc_family) {
+		case AF_INET:
+			if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
+				goto nla_put_failure;
+			break;
+		}
+	}
 
-	*flags |= (nh->fib_nh_flags & RTNH_F_ONLINK);
-	if (nh->fib_nh_flags & RTNH_F_OFFLOAD)
+	*flags |= (nhc->nhc_flags & RTNH_F_ONLINK);
+	if (nhc->nhc_flags & RTNH_F_OFFLOAD)
 		*flags |= RTNH_F_OFFLOAD;
 
-	if (!skip_oif && nh->fib_nh_dev &&
-	    nla_put_u32(skb, RTA_OIF, nh->fib_nh_dev->ifindex))
+	if (!skip_oif && nhc->nhc_dev &&
+	    nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex))
 		goto nla_put_failure;
 
-	if (nh->fib_nh_lws &&
-	    lwtunnel_fill_encap(skb, nh->fib_nh_lws) < 0)
+	if (nhc->nhc_lwtstate &&
+	    lwtunnel_fill_encap(skb, nhc->nhc_lwtstate) < 0)
 		goto nla_put_failure;
 
 	return 0;
@@ -1355,9 +1364,10 @@ nla_put_failure:
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh *nh)
+static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
+			   int nh_weight)
 {
-	const struct net_device *dev = nh->fib_nh_dev;
+	const struct net_device *dev = nhc->nhc_dev;
 	struct rtnexthop *rtnh;
 	unsigned int flags = 0;
 
@@ -1365,10 +1375,10 @@ static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh *nh)
 	if (!rtnh)
 		goto nla_put_failure;
 
-	rtnh->rtnh_hops = nh->fib_nh_weight - 1;
+	rtnh->rtnh_hops = nh_weight - 1;
 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
 
-	if (fib_nexthop_info(skb, nh, &flags, true) < 0)
+	if (fib_nexthop_info(skb, nhc, &flags, true) < 0)
 		goto nla_put_failure;
 
 	rtnh->rtnh_flags = flags;
@@ -1381,7 +1391,9 @@ static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh *nh)
 nla_put_failure:
 	return -EMSGSIZE;
 }
+#endif
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
 static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
 {
 	struct nlattr *mp;
@@ -1391,7 +1403,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
 		goto nla_put_failure;
 
 	for_nexthops(fi) {
-		if (fib_add_nexthop(skb, nh) < 0)
+		if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
 			goto nla_put_failure;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		if (nh->nh_tclassid &&
@@ -1457,7 +1469,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		struct fib_nh *nh = &fi->fib_nh[0];
 		unsigned int flags = 0;
 
-		if (fib_nexthop_info(skb, nh, &flags, false) < 0)
+		if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0)
 			goto nla_put_failure;
 
 		rtm->rtm_flags = flags;
-- 
cgit 


From c0a720770c01e67374b15f348f17a52409f6545c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:58 -0700
Subject: ipv6: Flip to fib_nexthop_info

Export fib_nexthop_info and fib_add_nexthop for use by IPv6 code.
Remove rt6_nexthop_info and rt6_add_nexthop in favor of the IPv4
versions. Update fib_nexthop_info for IPv6 linkdown check and
RTA_GATEWAY for AF_INET6.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 33a43965a232..8e0cb1687a74 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -45,6 +45,7 @@
 #include <net/nexthop.h>
 #include <net/lwtunnel.h>
 #include <net/fib_notifier.h>
+#include <net/addrconf.h>
 
 #include "fib_lookup.h"
 
@@ -1317,8 +1318,8 @@ failure:
 	return ERR_PTR(err);
 }
 
-static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
-			    unsigned int *flags, bool skip_oif)
+int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
+		     unsigned int *flags, bool skip_oif)
 {
 	if (nhc->nhc_flags & RTNH_F_DEAD)
 		*flags |= RTNH_F_DEAD;
@@ -1332,6 +1333,10 @@ static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc
 			if (ip_ignore_linkdown(nhc->nhc_dev))
 				*flags |= RTNH_F_DEAD;
 			break;
+		case AF_INET6:
+			if (ip6_ignore_linkdown(nhc->nhc_dev))
+				*flags |= RTNH_F_DEAD;
+			break;
 		}
 		rcu_read_unlock();
 	}
@@ -1342,6 +1347,11 @@ static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc
 			if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
 				goto nla_put_failure;
 			break;
+		case AF_INET6:
+			if (nla_put_in6_addr(skb, RTA_GATEWAY,
+					     &nhc->nhc_gw.ipv6) < 0)
+				goto nla_put_failure;
+			break;
 		}
 	}
 
@@ -1362,10 +1372,11 @@ static int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc
 nla_put_failure:
 	return -EMSGSIZE;
 }
+EXPORT_SYMBOL_GPL(fib_nexthop_info);
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
-			   int nh_weight)
+#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
+int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
+		    int nh_weight)
 {
 	const struct net_device *dev = nhc->nhc_dev;
 	struct rtnexthop *rtnh;
@@ -1391,6 +1402,7 @@ static int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
 nla_put_failure:
 	return -EMSGSIZE;
 }
+EXPORT_SYMBOL_GPL(fib_add_nexthop);
 #endif
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-- 
cgit 


From 942f146a63cecaa6d7fb1e8d255efab217126c50 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 4 Apr 2019 13:54:20 +0200
Subject: net: use kfree_skb_list() from ip_do_fragment()

Just like 46cfd725c377 ("net: use kfree_skb_list() helper in more places").

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c80188875f39..10b35328cfbc 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -693,11 +693,8 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 			return 0;
 		}
 
-		while (frag) {
-			skb = frag->next;
-			kfree_skb(frag);
-			frag = skb;
-		}
+		kfree_skb_list(frag);
+
 		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
 		return err;
 
-- 
cgit 


From f6fee16dbbe3fe4f942858192b88507c1f2f21ce Mon Sep 17 00:00:00 2001
From: "Tilmans, Olivier (Nokia - BE/Antwerp)"
 <olivier.tilmans@nokia-bell-labs.com>
Date: Wed, 3 Apr 2019 13:49:42 +0000
Subject: tcp: Accept ECT on SYN in the presence of RFC8311
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Linux currently disable ECN for incoming connections when the SYN
requests ECN and the IP header has ECT(0)/ECT(1) set, as some
networks were reportedly mangling the ToS byte, hence could later
trigger false congestion notifications.

RFC8311 §4.3 relaxes RFC3168's requirements such that ECT can be set
one TCP control packets (including SYNs). The main benefit of this
is the decreased probability of losing a SYN in a congested
ECN-capable network (i.e., it avoids the initial 1s timeout).
Additionally, this allows the development of newer TCP extensions,
such as AccECN.

This patch relaxes the previous check, by enabling ECN on incoming
connections using SYN+ECT if at least one bit of the reserved flags
of the TCP header is set. Such bit would indicate that the sender of
the SYN is using a newer TCP feature than what the host implements,
such as AccECN, and is thus implementing RFC8311. This enables
end-hosts not supporting such extensions to still negociate ECN, and
to have some of the benefits of using ECN on control packets.

Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia-bell-labs.com>
Suggested-by: Bob Briscoe <research@bobbriscoe.net>
Cc: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5dfbc333e79a..6660ce2a7333 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6263,6 +6263,11 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
  * congestion control: Linux DCTCP asserts ECT on all packets,
  * including SYN, which is most optimal solution; however,
  * others, such as FreeBSD do not.
+ *
+ * Exception: At least one of the reserved bits of the TCP header (th->res1) is
+ * set, indicating the use of a future TCP extension (such as AccECN). See
+ * RFC8311 §4.3 which updates RFC3168 to allow the development of such
+ * extensions.
  */
 static void tcp_ecn_create_request(struct request_sock *req,
 				   const struct sk_buff *skb,
@@ -6282,7 +6287,7 @@ static void tcp_ecn_create_request(struct request_sock *req,
 	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
 	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
 
-	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+	if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
 	    (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
 	    tcp_bpf_ca_needs_ecn((struct sock *)req))
 		inet_rsk(req)->ecn_ok = 1;
-- 
cgit 


From d1edc085559744fbda7a55e97eeae8bd6135a11b Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 4 Apr 2019 15:46:03 +0100
Subject: tcp: remove redundant check on tskb

The non-null check on tskb is always false because it is in an else
path of a check on tskb and hence tskb is null in this code block.
This is check is therefore redundant and can be removed as well
as the label coalesc.

if (tsbk) {
        ...
} else {
        ...
        if (unlikely(!skb)) {
                if (tskb)       /* can never be true, redundant code */
                        goto coalesc;
                return;
        }
}

Addresses-Coverity: ("Logically dead code")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Mukesh Ojha <mojha@codeaurora.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e265d1aeeb66..32061928b054 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3088,7 +3088,6 @@ void tcp_send_fin(struct sock *sk)
 		tskb = skb_rb_last(&sk->tcp_rtx_queue);
 
 	if (tskb) {
-coalesce:
 		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
 		TCP_SKB_CB(tskb)->end_seq++;
 		tp->write_seq++;
@@ -3104,11 +3103,9 @@ coalesce:
 		}
 	} else {
 		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
-		if (unlikely(!skb)) {
-			if (tskb)
-				goto coalesce;
+		if (unlikely(!skb))
 			return;
-		}
+
 		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 		skb_reserve(skb, MAX_TCP_HEADER);
 		sk_forced_mem_schedule(sk, skb->truesize);
-- 
cgit 


From 8f0db018006a421956965e1149234c4e8db718ee Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 2 Apr 2019 10:07:45 +1100
Subject: rhashtable: use bit_spin_locks to protect hash bucket.

This patch changes rhashtables to use a bit_spin_lock on BIT(1) of the
bucket pointer to lock the hash chain for that bucket.

The benefits of a bit spin_lock are:
 - no need to allocate a separate array of locks.
 - no need to have a configuration option to guide the
   choice of the size of this array
 - locking cost is often a single test-and-set in a cache line
   that will have to be loaded anyway.  When inserting at, or removing
   from, the head of the chain, the unlock is free - writing the new
   address in the bucket head implicitly clears the lock bit.
   For __rhashtable_insert_fast() we ensure this always happens
   when adding a new key.
 - even when lockings costs 2 updates (lock and unlock), they are
   in a cacheline that needs to be read anyway.

The cost of using a bit spin_lock is a little bit of code complexity,
which I think is quite manageable.

Bit spin_locks are sometimes inappropriate because they are not fair -
if multiple CPUs repeatedly contend of the same lock, one CPU can
easily be starved.  This is not a credible situation with rhashtable.
Multiple CPUs may want to repeatedly add or remove objects, but they
will typically do so at different buckets, so they will attempt to
acquire different locks.

As we have more bit-locks than we previously had spinlocks (by at
least a factor of two) we can expect slightly less contention to
go with the slightly better cache behavior and reduced memory
consumption.

To enhance type checking, a new struct is introduced to represent the
  pointer plus lock-bit
that is stored in the bucket-table.  This is "struct rhash_lock_head"
and is empty.  A pointer to this needs to be cast to either an
unsigned lock, or a "struct rhash_head *" to be useful.
Variables of this type are most often called "bkt".

Previously "pprev" would sometimes point to a bucket, and sometimes a
->next pointer in an rhash_head.  As these are now different types,
pprev is NULL when it would have pointed to the bucket. In that case,
'blk' is used, together with correct locking protocol.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2c931120c494..9a3f13edc98e 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -373,7 +373,6 @@ static const struct rhashtable_params ipmr_rht_params = {
 	.key_offset = offsetof(struct mfc_cache, cmparg),
 	.key_len = sizeof(struct mfc_cache_cmp_arg),
 	.nelem_hint = 3,
-	.locks_mul = 1,
 	.obj_cmpfn = ipmr_hash_cmp,
 	.automatic_shrinking = true,
 };
-- 
cgit 


From b262a69582a4676c7378a73077b7bb186c7c5b2a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:22 +0100
Subject: xfrm: place af number into xfrm_mode struct

This will be useful to know if we're supposed to decode ipv4 or ipv6.

While at it, make the unregister function return void, all module_exit
functions did just BUG(); there is never a point in doing error checks
if there is no way to handle such error.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_mode_beet.c      | 8 +++-----
 net/ipv4/xfrm4_mode_transport.c | 8 +++-----
 net/ipv4/xfrm4_mode_tunnel.c    | 8 +++-----
 3 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index 856d2dfdb44b..a2e3b52ae46c 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -134,19 +134,17 @@ static struct xfrm_mode xfrm4_beet_mode = {
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_BEET,
 	.flags = XFRM_MODE_FLAG_TUNNEL,
+	.family = AF_INET,
 };
 
 static int __init xfrm4_beet_init(void)
 {
-	return xfrm_register_mode(&xfrm4_beet_mode, AF_INET);
+	return xfrm_register_mode(&xfrm4_beet_mode);
 }
 
 static void __exit xfrm4_beet_exit(void)
 {
-	int err;
-
-	err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET);
-	BUG_ON(err);
+	xfrm_unregister_mode(&xfrm4_beet_mode);
 }
 
 module_init(xfrm4_beet_init);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index 1ad2c2c4e250..7c5443f797cf 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -93,19 +93,17 @@ static struct xfrm_mode xfrm4_transport_mode = {
 	.xmit = xfrm4_transport_xmit,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_TRANSPORT,
+	.family = AF_INET,
 };
 
 static int __init xfrm4_transport_init(void)
 {
-	return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
+	return xfrm_register_mode(&xfrm4_transport_mode);
 }
 
 static void __exit xfrm4_transport_exit(void)
 {
-	int err;
-
-	err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
-	BUG_ON(err);
+	xfrm_unregister_mode(&xfrm4_transport_mode);
 }
 
 module_init(xfrm4_transport_init);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 2a9764bd1719..cfc6b6d39755 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -131,19 +131,17 @@ static struct xfrm_mode xfrm4_tunnel_mode = {
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_TUNNEL,
 	.flags = XFRM_MODE_FLAG_TUNNEL,
+	.family = AF_INET,
 };
 
 static int __init xfrm4_mode_tunnel_init(void)
 {
-	return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
+	return xfrm_register_mode(&xfrm4_tunnel_mode);
 }
 
 static void __exit xfrm4_mode_tunnel_exit(void)
 {
-	int err;
-
-	err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
-	BUG_ON(err);
+	xfrm_unregister_mode(&xfrm4_tunnel_mode);
 }
 
 module_init(xfrm4_mode_tunnel_init);
-- 
cgit 


From b45714b164cac71f503ad73654b3c880cb9f2590 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:23 +0100
Subject: xfrm: prefer family stored in xfrm_mode struct

Now that we have the family available directly in the
xfrm_mode struct, we can use that and avoid one extra dereference.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_vti.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index a8474799fb79..3f3f6d6be318 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -137,7 +137,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
 		}
 	}
 
-	family = inner_mode->afinfo->family;
+	family = inner_mode->family;
 
 	skb->mark = be32_to_cpu(tunnel->parms.i_key);
 	ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
-- 
cgit 


From c2d305e51038167dd9de8d476c72f667d84cad8b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:24 +0100
Subject: xfrm: remove input indirection from xfrm_mode

No need for any indirection or abstraction here, both functions
are pretty much the same and quite small, they also have no external
dependencies.

xfrm_prepare_input can then be made static.

With allmodconfig build, size increase of vmlinux is 25 byte:

Before:
   text   data     bss     dec      filename
15730207  6936924 4046908 26714039  vmlinux

After:
15730208  6936948 4046908 26714064 vmlinux

v2: Fix INET_XFRM_MODE_TRANSPORT name in is-enabled test (Sabrina Dubroca)
    change copied comment to refer to transport and network header,
    not skb->{h,nh}, which don't exist anymore. (Sabrina)
    make xfrm_prepare_input static (Eyal Birger)

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_mode_beet.c      |  1 -
 net/ipv4/xfrm4_mode_transport.c | 23 -----------------------
 net/ipv4/xfrm4_mode_tunnel.c    |  1 -
 3 files changed, 25 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index a2e3b52ae46c..264c4c9e2473 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -128,7 +128,6 @@ out:
 
 static struct xfrm_mode xfrm4_beet_mode = {
 	.input2 = xfrm4_beet_input,
-	.input = xfrm_prepare_input,
 	.output2 = xfrm4_beet_output,
 	.output = xfrm4_prepare_output,
 	.owner = THIS_MODULE,
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index 7c5443f797cf..c943d710f302 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -35,28 +35,6 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
 	return 0;
 }
 
-/* Remove encapsulation header.
- *
- * The IP header will be moved over the top of the encapsulation header.
- *
- * On entry, skb->h shall point to where the IP header should be and skb->nh
- * shall be set to where the IP header currently is.  skb->data shall point
- * to the start of the payload.
- */
-static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
-{
-	int ihl = skb->data - skb_transport_header(skb);
-
-	if (skb->transport_header != skb->network_header) {
-		memmove(skb_transport_header(skb),
-			skb_network_header(skb), ihl);
-		skb->network_header = skb->transport_header;
-	}
-	ip_hdr(skb)->tot_len = htons(skb->len + ihl);
-	skb_reset_transport_header(skb);
-	return 0;
-}
-
 static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
 						   struct sk_buff *skb,
 						   netdev_features_t features)
@@ -87,7 +65,6 @@ static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb)
 }
 
 static struct xfrm_mode xfrm4_transport_mode = {
-	.input = xfrm4_transport_input,
 	.output = xfrm4_transport_output,
 	.gso_segment = xfrm4_transport_gso_segment,
 	.xmit = xfrm4_transport_xmit,
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index cfc6b6d39755..678b91754b5e 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -123,7 +123,6 @@ static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb)
 
 static struct xfrm_mode xfrm4_tunnel_mode = {
 	.input2 = xfrm4_mode_tunnel_input,
-	.input = xfrm_prepare_input,
 	.output2 = xfrm4_mode_tunnel_output,
 	.output = xfrm4_prepare_output,
 	.gso_segment = xfrm4_mode_tunnel_gso_segment,
-- 
cgit 


From 0c620e97b3490890facbbe06d5deed9b024de255 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:25 +0100
Subject: xfrm: remove output indirection from xfrm_mode

Same is input indirection.  Only exception: we need to export
xfrm_outer_mode_output for pktgen.

Increases size of vmlinux by about 163 byte:
Before:
   text    data     bss     dec      filename
15730208  6936948 4046908 26714064   vmlinux

After:
15730311  6937008 4046908 26714227   vmlinux

xfrm_inner_extract_output has no more external callers, make it static.

v2: add IS_ENABLED(IPV6) guard in xfrm6_prepare_output
    add two missing breaks in xfrm_outer_mode_output (Sabrina Dubroca)
    add WARN_ON_ONCE for 'call AF_INET6 related output function, but
    CONFIG_IPV6=n' case.
    make xfrm_inner_extract_output static

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_mode_beet.c      |  1 -
 net/ipv4/xfrm4_mode_transport.c | 22 ----------------------
 net/ipv4/xfrm4_mode_tunnel.c    |  1 -
 net/ipv4/xfrm4_output.c         | 15 ---------------
 4 files changed, 39 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index 264c4c9e2473..f02cc8237d54 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -129,7 +129,6 @@ out:
 static struct xfrm_mode xfrm4_beet_mode = {
 	.input2 = xfrm4_beet_input,
 	.output2 = xfrm4_beet_output,
-	.output = xfrm4_prepare_output,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_BEET,
 	.flags = XFRM_MODE_FLAG_TUNNEL,
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index c943d710f302..6f8cf09ff0ef 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -14,27 +14,6 @@
 #include <net/xfrm.h>
 #include <net/protocol.h>
 
-/* Add encapsulation header.
- *
- * The IP header will be moved forward to make space for the encapsulation
- * header.
- */
-static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct iphdr *iph = ip_hdr(skb);
-	int ihl = iph->ihl * 4;
-
-	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
-
-	skb_set_network_header(skb, -x->props.header_len);
-	skb->mac_header = skb->network_header +
-			  offsetof(struct iphdr, protocol);
-	skb->transport_header = skb->network_header + ihl;
-	__skb_pull(skb, ihl);
-	memmove(skb_network_header(skb), iph, ihl);
-	return 0;
-}
-
 static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
 						   struct sk_buff *skb,
 						   netdev_features_t features)
@@ -65,7 +44,6 @@ static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb)
 }
 
 static struct xfrm_mode xfrm4_transport_mode = {
-	.output = xfrm4_transport_output,
 	.gso_segment = xfrm4_transport_gso_segment,
 	.xmit = xfrm4_transport_xmit,
 	.owner = THIS_MODULE,
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 678b91754b5e..823bc54b47de 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -124,7 +124,6 @@ static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb)
 static struct xfrm_mode xfrm4_tunnel_mode = {
 	.input2 = xfrm4_mode_tunnel_input,
 	.output2 = xfrm4_mode_tunnel_output,
-	.output = xfrm4_prepare_output,
 	.gso_segment = xfrm4_mode_tunnel_gso_segment,
 	.xmit = xfrm4_mode_tunnel_xmit,
 	.owner = THIS_MODULE,
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index be980c195fc5..6802d1aee424 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -58,21 +58,6 @@ int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb)
 	return xfrm4_extract_header(skb);
 }
 
-int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
-{
-	int err;
-
-	err = xfrm_inner_extract_output(x, skb);
-	if (err)
-		return err;
-
-	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
-	skb->protocol = htons(ETH_P_IP);
-
-	return x->outer_mode->output2(x, skb);
-}
-EXPORT_SYMBOL(xfrm4_prepare_output);
-
 int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
 {
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-- 
cgit 


From 303c5fab1272888b22088fbdd08cb770205ccb7a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:26 +0100
Subject: xfrm: remove xmit indirection from xfrm_mode

There are only two versions (tunnel and transport). The ip/ipv6 versions
are only differ in sizeof(iphdr) vs ipv6hdr.

Place this in the core and use x->outer_mode->encap type to call the
correct adjustment helper.

Before:
   text   data    bss     dec      filename
15730311  6937008 4046908 26714227 vmlinux

After:
15730428  6937008 4046908 26714344 vmlinux

(about 117 byte increase)

v2: use family from x->outer_mode, not inner

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_mode_transport.c | 14 --------------
 net/ipv4/xfrm4_mode_tunnel.c    | 13 -------------
 2 files changed, 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index 6f8cf09ff0ef..d4b34bb2de00 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -30,22 +30,8 @@ static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
 	return segs;
 }
 
-static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct xfrm_offload *xo = xfrm_offload(skb);
-
-	skb_reset_mac_len(skb);
-	pskb_pull(skb, skb->mac_len + sizeof(struct iphdr) + x->props.header_len);
-
-	if (xo->flags & XFRM_GSO_SEGMENT) {
-		 skb_reset_transport_header(skb);
-		 skb->transport_header -= x->props.header_len;
-	}
-}
-
 static struct xfrm_mode xfrm4_transport_mode = {
 	.gso_segment = xfrm4_transport_gso_segment,
-	.xmit = xfrm4_transport_xmit,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_TRANSPORT,
 	.family = AF_INET,
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 823bc54b47de..8bd5112b3ee3 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -109,23 +109,10 @@ static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x,
 	return skb_mac_gso_segment(skb, features);
 }
 
-static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct xfrm_offload *xo = xfrm_offload(skb);
-
-	if (xo->flags & XFRM_GSO_SEGMENT)
-		skb->transport_header = skb->network_header +
-					sizeof(struct iphdr);
-
-	skb_reset_mac_len(skb);
-	pskb_pull(skb, skb->mac_len + x->props.header_len);
-}
-
 static struct xfrm_mode xfrm4_tunnel_mode = {
 	.input2 = xfrm4_mode_tunnel_input,
 	.output2 = xfrm4_mode_tunnel_output,
 	.gso_segment = xfrm4_mode_tunnel_gso_segment,
-	.xmit = xfrm4_mode_tunnel_xmit,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_TUNNEL,
 	.flags = XFRM_MODE_FLAG_TUNNEL,
-- 
cgit 


From 7613b92b1ae37141704948b77e8762c5de896510 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:27 +0100
Subject: xfrm: remove gso_segment indirection from xfrm_mode

These functions are small and we only have versions for tunnel
and transport mode for ipv4 and ipv6 respectively.

Just place the 'transport or tunnel' conditional in the protocol
specific function instead of using an indirection.

Before:
    3226       12       0     3238   net/ipv4/esp4_offload.o
    7004      492       0     7496   net/ipv4/ip_vti.o
    3339       12       0     3351   net/ipv6/esp6_offload.o
   11294      460       0    11754   net/ipv6/ip6_vti.o
    1180       72       0     1252   net/ipv4/xfrm4_mode_beet.o
     428       48       0      476   net/ipv4/xfrm4_mode_transport.o
    1271       48       0     1319   net/ipv4/xfrm4_mode_tunnel.o
    1083       60       0     1143   net/ipv6/xfrm6_mode_beet.o
     172       48       0      220   net/ipv6/xfrm6_mode_ro.o
     429       48       0      477   net/ipv6/xfrm6_mode_transport.o
    1164       48       0     1212   net/ipv6/xfrm6_mode_tunnel.o
15730428  6937008 4046908 26714344   vmlinux

After:
    3461       12       0     3473   net/ipv4/esp4_offload.o
    7000      492       0     7492   net/ipv4/ip_vti.o
    3574       12       0     3586   net/ipv6/esp6_offload.o
   11295      460       0    11755   net/ipv6/ip6_vti.o
    1180       64       0     1244   net/ipv4/xfrm4_mode_beet.o
     171       40       0      211   net/ipv4/xfrm4_mode_transport.o
    1163       40       0     1203   net/ipv4/xfrm4_mode_tunnel.o
    1083       52       0     1135   net/ipv6/xfrm6_mode_beet.o
     172       40       0      212   net/ipv6/xfrm6_mode_ro.o
     172       40       0      212   net/ipv6/xfrm6_mode_transport.o
    1056       40       0     1096   net/ipv6/xfrm6_mode_tunnel.o
15730424  6937008 4046908 26714340   vmlinux

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4_offload.c         | 40 +++++++++++++++++++++++++++++++++++++++-
 net/ipv4/xfrm4_mode_transport.c | 17 -----------------
 net/ipv4/xfrm4_mode_tunnel.c    |  9 ---------
 3 files changed, 39 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index c6c84f2bc41c..74d59e0177a7 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -107,6 +107,44 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
 	xo->proto = proto;
 }
 
+static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x,
+						struct sk_buff *skb,
+						netdev_features_t features)
+{
+	__skb_push(skb, skb->mac_len);
+	return skb_mac_gso_segment(skb, features);
+}
+
+static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
+						   struct sk_buff *skb,
+						   netdev_features_t features)
+{
+	const struct net_offload *ops;
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct xfrm_offload *xo = xfrm_offload(skb);
+
+	skb->transport_header += x->props.header_len;
+	ops = rcu_dereference(inet_offloads[xo->proto]);
+	if (likely(ops && ops->callbacks.gso_segment))
+		segs = ops->callbacks.gso_segment(skb, features);
+
+	return segs;
+}
+
+static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
+						    struct sk_buff *skb,
+						    netdev_features_t features)
+{
+	switch (x->outer_mode->encap) {
+	case XFRM_MODE_TUNNEL:
+		return xfrm4_tunnel_gso_segment(x, skb, features);
+	case XFRM_MODE_TRANSPORT:
+		return xfrm4_transport_gso_segment(x, skb, features);
+	}
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
 				        netdev_features_t features)
 {
@@ -147,7 +185,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
 
 	xo->flags |= XFRM_GSO_SEGMENT;
 
-	return x->outer_mode->gso_segment(x, skb, esp_features);
+	return xfrm4_outer_mode_gso_segment(x, skb, esp_features);
 }
 
 static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index d4b34bb2de00..397863ea762b 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -14,24 +14,7 @@
 #include <net/xfrm.h>
 #include <net/protocol.h>
 
-static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
-						   struct sk_buff *skb,
-						   netdev_features_t features)
-{
-	const struct net_offload *ops;
-	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	struct xfrm_offload *xo = xfrm_offload(skb);
-
-	skb->transport_header += x->props.header_len;
-	ops = rcu_dereference(inet_offloads[xo->proto]);
-	if (likely(ops && ops->callbacks.gso_segment))
-		segs = ops->callbacks.gso_segment(skb, features);
-
-	return segs;
-}
-
 static struct xfrm_mode xfrm4_transport_mode = {
-	.gso_segment = xfrm4_transport_gso_segment,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_TRANSPORT,
 	.family = AF_INET,
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 8bd5112b3ee3..b5d4ba41758e 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -101,18 +101,9 @@ out:
 	return err;
 }
 
-static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x,
-						     struct sk_buff *skb,
-						     netdev_features_t features)
-{
-	__skb_push(skb, skb->mac_len);
-	return skb_mac_gso_segment(skb, features);
-}
-
 static struct xfrm_mode xfrm4_tunnel_mode = {
 	.input2 = xfrm4_mode_tunnel_input,
 	.output2 = xfrm4_mode_tunnel_output,
-	.gso_segment = xfrm4_mode_tunnel_gso_segment,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_TUNNEL,
 	.flags = XFRM_MODE_FLAG_TUNNEL,
-- 
cgit 


From b3284df1c86f7ac078dcb8fb250fe3d6437e740c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:28 +0100
Subject: xfrm: remove input2 indirection from xfrm_mode

No external dependencies on any module, place this in the core.
Increase is about 1800 byte for xfrm_input.o.

The beet helpers get added to internal header, as they can be reused
from xfrm_output.c in the next patch (kernel contains several
copies of them in the xfrm{4,6}_mode_beet.c files).

Before:
   text    data     bss     dec filename
   5578     176    2364    8118 net/xfrm/xfrm_input.o
   1180      64       0    1244 net/ipv4/xfrm4_mode_beet.o
    171      40       0     211 net/ipv4/xfrm4_mode_transport.o
   1163      40       0    1203 net/ipv4/xfrm4_mode_tunnel.o
   1083      52       0    1135 net/ipv6/xfrm6_mode_beet.o
    172      40       0     212 net/ipv6/xfrm6_mode_ro.o
    172      40       0     212 net/ipv6/xfrm6_mode_transport.o
   1056      40       0    1096 net/ipv6/xfrm6_mode_tunnel.o

After:
   text    data     bss     dec filename
   7373     200    2364    9937 net/xfrm/xfrm_input.o
    587      44       0     631 net/ipv4/xfrm4_mode_beet.o
    171      32       0     203 net/ipv4/xfrm4_mode_transport.o
    649      32       0     681 net/ipv4/xfrm4_mode_tunnel.o
    625      44       0     669 net/ipv6/xfrm6_mode_beet.o
    172      32       0     204 net/ipv6/xfrm6_mode_ro.o
    172      32       0     204 net/ipv6/xfrm6_mode_transport.o
    599      32       0     631 net/ipv6/xfrm6_mode_tunnel.o

v2: pass inner_mode to xfrm_inner_mode_encap_remove to fix
    AF_UNSPEC selector breakage (bisected by Benedict Wong)

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_mode_beet.c   | 47 --------------------------------------------
 net/ipv4/xfrm4_mode_tunnel.c | 39 ------------------------------------
 2 files changed, 86 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index f02cc8237d54..500960172933 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -80,54 +80,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
 	return 0;
 }
 
-static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct iphdr *iph;
-	int optlen = 0;
-	int err = -EINVAL;
-
-	if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) {
-		struct ip_beet_phdr *ph;
-		int phlen;
-
-		if (!pskb_may_pull(skb, sizeof(*ph)))
-			goto out;
-
-		ph = (struct ip_beet_phdr *)skb->data;
-
-		phlen = sizeof(*ph) + ph->padlen;
-		optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen);
-		if (optlen < 0 || optlen & 3 || optlen > 250)
-			goto out;
-
-		XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;
-
-		if (!pskb_may_pull(skb, phlen))
-			goto out;
-		__skb_pull(skb, phlen);
-	}
-
-	skb_push(skb, sizeof(*iph));
-	skb_reset_network_header(skb);
-	skb_mac_header_rebuild(skb);
-
-	xfrm4_beet_make_header(skb);
-
-	iph = ip_hdr(skb);
-
-	iph->ihl += optlen / 4;
-	iph->tot_len = htons(skb->len);
-	iph->daddr = x->sel.daddr.a4;
-	iph->saddr = x->sel.saddr.a4;
-	iph->check = 0;
-	iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
-	err = 0;
-out:
-	return err;
-}
-
 static struct xfrm_mode xfrm4_beet_mode = {
-	.input2 = xfrm4_beet_input,
 	.output2 = xfrm4_beet_output,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_BEET,
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index b5d4ba41758e..31645319aaeb 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,14 +15,6 @@
 #include <net/ip.h>
 #include <net/xfrm.h>
 
-static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
-{
-	struct iphdr *inner_iph = ipip_hdr(skb);
-
-	if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
-		IP_ECN_set_ce(inner_iph);
-}
-
 /* Add encapsulation header.
  *
  * The top IP header will be constructed per RFC 2401.
@@ -71,38 +63,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	return 0;
 }
 
-static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
-{
-	int err = -EINVAL;
-
-	if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
-		goto out;
-
-	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-		goto out;
-
-	err = skb_unclone(skb, GFP_ATOMIC);
-	if (err)
-		goto out;
-
-	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-		ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb));
-	if (!(x->props.flags & XFRM_STATE_NOECN))
-		ipip_ecn_decapsulate(skb);
-
-	skb_reset_network_header(skb);
-	skb_mac_header_rebuild(skb);
-	if (skb->mac_len)
-		eth_hdr(skb)->h_proto = skb->protocol;
-
-	err = 0;
-
-out:
-	return err;
-}
-
 static struct xfrm_mode xfrm4_tunnel_mode = {
-	.input2 = xfrm4_mode_tunnel_input,
 	.output2 = xfrm4_mode_tunnel_output,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_TUNNEL,
-- 
cgit 


From 1de70830066b72b6a8e259e5363f6c0bc4ba7bbc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:29 +0100
Subject: xfrm: remove output2 indirection from xfrm_mode

similar to previous patch: no external module dependencies,
so we can avoid the indirection by placing this in the core.

This change removes the last indirection from xfrm_mode and the
xfrm4|6_mode_{beet,tunnel}.c modules contain (almost) no code anymore.

Before:
   text    data     bss     dec     hex filename
   3957     136       0    4093     ffd net/xfrm/xfrm_output.o
    587      44       0     631     277 net/ipv4/xfrm4_mode_beet.o
    649      32       0     681     2a9 net/ipv4/xfrm4_mode_tunnel.o
    625      44       0     669     29d net/ipv6/xfrm6_mode_beet.o
    599      32       0     631     277 net/ipv6/xfrm6_mode_tunnel.o
After:
   text    data     bss     dec     hex filename
   5359     184       0    5543    15a7 net/xfrm/xfrm_output.o
    171      24       0     195      c3 net/ipv4/xfrm4_mode_beet.o
    171      24       0     195      c3 net/ipv4/xfrm4_mode_tunnel.o
    172      24       0     196      c4 net/ipv6/xfrm6_mode_beet.o
    172      24       0     196      c4 net/ipv6/xfrm6_mode_tunnel.o

v2: fold the *encap_add functions into xfrm*_prepare_output
    preserve (move) output2 comment (Sabrina)
    use x->outer_mode->encap, not inner
    fix a build breakage on ppc (kbuild robot)

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_mode_beet.c   | 63 --------------------------------------------
 net/ipv4/xfrm4_mode_tunnel.c | 49 ----------------------------------
 2 files changed, 112 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index 500960172933..ba84b278e627 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -17,71 +17,8 @@
 #include <net/ip.h>
 #include <net/xfrm.h>
 
-static void xfrm4_beet_make_header(struct sk_buff *skb)
-{
-	struct iphdr *iph = ip_hdr(skb);
-
-	iph->ihl = 5;
-	iph->version = 4;
-
-	iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
-	iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
-
-	iph->id = XFRM_MODE_SKB_CB(skb)->id;
-	iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off;
-	iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl;
-}
-
-/* Add encapsulation header.
- *
- * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
- */
-static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct ip_beet_phdr *ph;
-	struct iphdr *top_iph;
-	int hdrlen, optlen;
-
-	hdrlen = 0;
-	optlen = XFRM_MODE_SKB_CB(skb)->optlen;
-	if (unlikely(optlen))
-		hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
-
-	skb_set_network_header(skb, -x->props.header_len -
-				    hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
-	if (x->sel.family != AF_INET6)
-		skb->network_header += IPV4_BEET_PHMAXLEN;
-	skb->mac_header = skb->network_header +
-			  offsetof(struct iphdr, protocol);
-	skb->transport_header = skb->network_header + sizeof(*top_iph);
-
-	xfrm4_beet_make_header(skb);
-
-	ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen);
-
-	top_iph = ip_hdr(skb);
-
-	if (unlikely(optlen)) {
-		BUG_ON(optlen < 0);
-
-		ph->padlen = 4 - (optlen & 4);
-		ph->hdrlen = optlen / 8;
-		ph->nexthdr = top_iph->protocol;
-		if (ph->padlen)
-			memset(ph + 1, IPOPT_NOP, ph->padlen);
-
-		top_iph->protocol = IPPROTO_BEETPH;
-		top_iph->ihl = sizeof(struct iphdr) / 4;
-	}
-
-	top_iph->saddr = x->props.saddr.a4;
-	top_iph->daddr = x->id.daddr.a4;
-
-	return 0;
-}
 
 static struct xfrm_mode xfrm4_beet_mode = {
-	.output2 = xfrm4_beet_output,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_BEET,
 	.flags = XFRM_MODE_FLAG_TUNNEL,
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 31645319aaeb..b2b132c800fc 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,56 +15,7 @@
 #include <net/ip.h>
 #include <net/xfrm.h>
 
-/* Add encapsulation header.
- *
- * The top IP header will be constructed per RFC 2401.
- */
-static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb_dst(skb);
-	struct iphdr *top_iph;
-	int flags;
-
-	skb_set_inner_network_header(skb, skb_network_offset(skb));
-	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
-
-	skb_set_network_header(skb, -x->props.header_len);
-	skb->mac_header = skb->network_header +
-			  offsetof(struct iphdr, protocol);
-	skb->transport_header = skb->network_header + sizeof(*top_iph);
-	top_iph = ip_hdr(skb);
-
-	top_iph->ihl = 5;
-	top_iph->version = 4;
-
-	top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family);
-
-	/* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */
-	if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
-		top_iph->tos = 0;
-	else
-		top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
-	top_iph->tos = INET_ECN_encapsulate(top_iph->tos,
-					    XFRM_MODE_SKB_CB(skb)->tos);
-
-	flags = x->props.flags;
-	if (flags & XFRM_STATE_NOECN)
-		IP_ECN_clear(top_iph);
-
-	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
-		0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
-
-	top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
-
-	top_iph->saddr = x->props.saddr.a4;
-	top_iph->daddr = x->id.daddr.a4;
-	ip_select_ident(dev_net(dst->dev), skb, NULL);
-
-	return 0;
-}
-
 static struct xfrm_mode xfrm4_tunnel_mode = {
-	.output2 = xfrm4_mode_tunnel_output,
 	.owner = THIS_MODULE,
 	.encap = XFRM_MODE_TUNNEL,
 	.flags = XFRM_MODE_FLAG_TUNNEL,
-- 
cgit 


From 733a5fac2f15b55b9059230d098ed04341d2d884 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:30 +0100
Subject: xfrm: remove afinfo pointer from xfrm_mode

Adds an EXPORT_SYMBOL for afinfo_get_rcu, as it will now be called from
ipv6 in case of CONFIG_IPV6=m.

This change has virtually no effect on vmlinux size, but it reduces
afinfo size and allows followup patch to make xfrm modes const.

v2: mark if (afinfo) tests as likely (Sabrina)
    re-fetch afinfo according to inner_mode in xfrm_prepare_input().

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_output.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 6802d1aee424..7c3df14daef3 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -72,6 +72,8 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
 static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct xfrm_state *x = skb_dst(skb)->xfrm;
+	const struct xfrm_state_afinfo *afinfo;
+	int ret = -EAFNOSUPPORT;
 
 #ifdef CONFIG_NETFILTER
 	if (!x) {
@@ -80,7 +82,15 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	}
 #endif
 
-	return x->outer_mode->afinfo->output_finish(sk, skb);
+	rcu_read_lock();
+	afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode->family);
+	if (likely(afinfo))
+		ret = afinfo->output_finish(sk, skb);
+	else
+		kfree_skb(skb);
+	rcu_read_unlock();
+
+	return ret;
 }
 
 int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
-- 
cgit 


From 4c145dce26013763490df88f2473714f5bc7857d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:31 +0100
Subject: xfrm: make xfrm modes builtin

after previous changes, xfrm_mode contains no function pointers anymore
and all modules defining such struct contain no code except an init/exit
functions to register the xfrm_mode struct with the xfrm core.

Just place the xfrm modes core and remove the modules,
the run-time xfrm_mode register/unregister functionality is removed.

Before:

    text    data     bss      dec filename
    7523     200    2364    10087 net/xfrm/xfrm_input.o
   40003     628     440    41071 net/xfrm/xfrm_state.o
15730338 6937080 4046908 26714326 vmlinux

    7389     200    2364    9953  net/xfrm/xfrm_input.o
   40574     656     440   41670  net/xfrm/xfrm_state.o
15730084 6937068 4046908 26714060 vmlinux

The xfrm*_mode_{transport,tunnel,beet} modules are gone.

v2: replace CONFIG_INET6_XFRM_MODE_* IS_ENABLED guards with CONFIG_IPV6
    ones rather than removing them.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/Kconfig                | 29 +----------------------------
 net/ipv4/Makefile               |  3 ---
 net/ipv4/ip_vti.c               |  2 +-
 net/ipv4/xfrm4_mode_beet.c      | 41 -----------------------------------------
 net/ipv4/xfrm4_mode_transport.c | 36 ------------------------------------
 net/ipv4/xfrm4_mode_tunnel.c    | 38 --------------------------------------
 6 files changed, 2 insertions(+), 147 deletions(-)
 delete mode 100644 net/ipv4/xfrm4_mode_beet.c
 delete mode 100644 net/ipv4/xfrm4_mode_transport.c
 delete mode 100644 net/ipv4/xfrm4_mode_tunnel.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 32cae39cdff6..8108e97d4285 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -304,7 +304,7 @@ config NET_IPVTI
 	tristate "Virtual (secure) IP: tunneling"
 	select INET_TUNNEL
 	select NET_IP_TUNNEL
-	depends on INET_XFRM_MODE_TUNNEL
+	select XFRM
 	---help---
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
@@ -396,33 +396,6 @@ config INET_TUNNEL
 	tristate
 	default n
 
-config INET_XFRM_MODE_TRANSPORT
-	tristate "IP: IPsec transport mode"
-	default y
-	select XFRM
-	---help---
-	  Support for IPsec transport mode.
-
-	  If unsure, say Y.
-
-config INET_XFRM_MODE_TUNNEL
-	tristate "IP: IPsec tunnel mode"
-	default y
-	select XFRM
-	---help---
-	  Support for IPsec tunnel mode.
-
-	  If unsure, say Y.
-
-config INET_XFRM_MODE_BEET
-	tristate "IP: IPsec BEET mode"
-	default y
-	select XFRM
-	---help---
-	  Support for IPsec BEET mode.
-
-	  If unsure, say Y.
-
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 58629314eae9..000a61994c8f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -37,10 +37,7 @@ obj-$(CONFIG_INET_ESP) += esp4.o
 obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
-obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
 obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
-obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
-obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3f3f6d6be318..91926c9a3bc9 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -107,7 +107,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
 	struct net_device *dev;
 	struct pcpu_sw_netstats *tstats;
 	struct xfrm_state *x;
-	struct xfrm_mode *inner_mode;
+	const struct xfrm_mode *inner_mode;
 	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
 	u32 orig_mark = skb->mark;
 	int ret;
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
deleted file mode 100644
index ba84b278e627..000000000000
--- a/net/ipv4/xfrm4_mode_beet.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4.
- *
- * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com>
- *                    Miika Komu     <miika@iki.fi>
- *                    Herbert Xu     <herbert@gondor.apana.org.au>
- *                    Abhinav Pathak <abhinav.pathak@hiit.fi>
- *                    Jeff Ahrenholz <ahrenholz@gmail.com>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/ip.h>
-#include <net/xfrm.h>
-
-
-static struct xfrm_mode xfrm4_beet_mode = {
-	.owner = THIS_MODULE,
-	.encap = XFRM_MODE_BEET,
-	.flags = XFRM_MODE_FLAG_TUNNEL,
-	.family = AF_INET,
-};
-
-static int __init xfrm4_beet_init(void)
-{
-	return xfrm_register_mode(&xfrm4_beet_mode);
-}
-
-static void __exit xfrm4_beet_exit(void)
-{
-	xfrm_unregister_mode(&xfrm4_beet_mode);
-}
-
-module_init(xfrm4_beet_init);
-module_exit(xfrm4_beet_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
deleted file mode 100644
index 397863ea762b..000000000000
--- a/net/ipv4/xfrm4_mode_transport.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
- *
- * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/ip.h>
-#include <net/xfrm.h>
-#include <net/protocol.h>
-
-static struct xfrm_mode xfrm4_transport_mode = {
-	.owner = THIS_MODULE,
-	.encap = XFRM_MODE_TRANSPORT,
-	.family = AF_INET,
-};
-
-static int __init xfrm4_transport_init(void)
-{
-	return xfrm_register_mode(&xfrm4_transport_mode);
-}
-
-static void __exit xfrm4_transport_exit(void)
-{
-	xfrm_unregister_mode(&xfrm4_transport_mode);
-}
-
-module_init(xfrm4_transport_init);
-module_exit(xfrm4_transport_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
deleted file mode 100644
index b2b132c800fc..000000000000
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
- *
- * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
- */
-
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/inet_ecn.h>
-#include <net/ip.h>
-#include <net/xfrm.h>
-
-static struct xfrm_mode xfrm4_tunnel_mode = {
-	.owner = THIS_MODULE,
-	.encap = XFRM_MODE_TUNNEL,
-	.flags = XFRM_MODE_FLAG_TUNNEL,
-	.family = AF_INET,
-};
-
-static int __init xfrm4_mode_tunnel_init(void)
-{
-	return xfrm_register_mode(&xfrm4_tunnel_mode);
-}
-
-static void __exit xfrm4_mode_tunnel_exit(void)
-{
-	xfrm_unregister_mode(&xfrm4_tunnel_mode);
-}
-
-module_init(xfrm4_mode_tunnel_init);
-module_exit(xfrm4_mode_tunnel_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
-- 
cgit 


From c9500d7b7de8ff6ac88ee3e38b782889f1616593 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:32 +0100
Subject: xfrm: store xfrm_mode directly, not its address

This structure is now only 4 bytes, so its more efficient
to cache a copy rather than its address.

No significant size difference in allmodconfig vmlinux.

With non-modular kernel that has all XFRM options enabled, this
series reduces vmlinux image size by ~11kb. All xfrm_mode
indirections are gone and all modes are built-in.

before (ipsec-next master):
    text      data      bss         dec   filename
21071494   7233140 11104324    39408958   vmlinux.master

after this series:
21066448   7226772 11104324    39397544   vmlinux.patched

With allmodconfig kernel, the size increase is only 362 bytes,
even all the xfrm config options removed in this series are
modular.

before:
    text      data     bss      dec   filename
15731286   6936912 4046908 26715106   vmlinux.master

after this series:
15731492   6937068  4046908  26715468 vmlinux

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4_offload.c | 2 +-
 net/ipv4/ip_vti.c       | 2 +-
 net/ipv4/xfrm4_output.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 74d59e0177a7..b61a8ff558f9 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -135,7 +135,7 @@ static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
 						    struct sk_buff *skb,
 						    netdev_features_t features)
 {
-	switch (x->outer_mode->encap) {
+	switch (x->outer_mode.encap) {
 	case XFRM_MODE_TUNNEL:
 		return xfrm4_tunnel_gso_segment(x, skb, features);
 	case XFRM_MODE_TRANSPORT:
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 91926c9a3bc9..cc5d9c0a8a10 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -126,7 +126,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
 
 	x = xfrm_input_state(skb);
 
-	inner_mode = x->inner_mode;
+	inner_mode = &x->inner_mode;
 
 	if (x->sel.family == AF_UNSPEC) {
 		inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 7c3df14daef3..9bb8905088c7 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -83,7 +83,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 #endif
 
 	rcu_read_lock();
-	afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode->family);
+	afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family);
 	if (likely(afinfo))
 		ret = afinfo->output_finish(sk, skb);
 	else
-- 
cgit 


From fd69c399c7d6262086b6b820757c6aeaa71feeba Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 8 Apr 2019 10:15:59 +0200
Subject: datagram: remove rendundant 'peeked' argument

After commit a297569fe00a ("net/udp: do not touch skb->peeked unless
really needed") the 'peeked' argument of __skb_try_recv_datagram()
and friends is always equal to !!'flags & MSG_PEEK'.

Since such argument is really a boolean info, and the callers have
already 'flags & MSG_PEEK' handy, we can remove it and clean-up the
code a bit.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 372fdc5381a9..3c58ba02af7d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1631,7 +1631,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 EXPORT_SYMBOL(udp_ioctl);
 
 struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
-			       int noblock, int *peeked, int *off, int *err)
+			       int noblock, int *off, int *err)
 {
 	struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
 	struct sk_buff_head *queue;
@@ -1650,13 +1650,11 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
 			break;
 
 		error = -EAGAIN;
-		*peeked = 0;
 		do {
 			spin_lock_bh(&queue->lock);
 			skb = __skb_try_recv_from_queue(sk, queue, flags,
 							udp_skb_destructor,
-							peeked, off, err,
-							&last);
+							off, err, &last);
 			if (skb) {
 				spin_unlock_bh(&queue->lock);
 				return skb;
@@ -1677,8 +1675,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
 
 			skb = __skb_try_recv_from_queue(sk, queue, flags,
 							udp_skb_dtor_locked,
-							peeked, off, err,
-							&last);
+							off, err, &last);
 			spin_unlock(&sk_queue->lock);
 			spin_unlock_bh(&queue->lock);
 			if (skb)
@@ -1713,8 +1710,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
 	struct sk_buff *skb;
 	unsigned int ulen, copied;
-	int peeked, peeking, off;
-	int err;
+	int off, err, peeking = flags & MSG_PEEK;
 	int is_udplite = IS_UDPLITE(sk);
 	bool checksum_valid = false;
 
@@ -1722,9 +1718,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
-	peeking = flags & MSG_PEEK;
 	off = sk_peek_offset(sk, flags);
-	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
+	skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
 	if (!skb)
 		return err;
 
@@ -1762,7 +1757,7 @@ try_again:
 	}
 
 	if (unlikely(err)) {
-		if (!peeked) {
+		if (!peeking) {
 			atomic_inc(&sk->sk_drops);
 			UDP_INC_STATS(sock_net(sk),
 				      UDP_MIB_INERRORS, is_udplite);
@@ -1771,7 +1766,7 @@ try_again:
 		return err;
 	}
 
-	if (!peeked)
+	if (!peeking)
 		UDP_INC_STATS(sock_net(sk),
 			      UDP_MIB_INDATAGRAMS, is_udplite);
 
-- 
cgit 


From c1deb065cf3b5bcd483e3f03479f930edb151b99 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 27 Mar 2019 09:22:25 +0100
Subject: netfilter: nf_tables: merge route type into core

very little code, so it really doesn't make sense to have extra
modules or even a kconfig knob for this.

Merge them and make functionality available unconditionally.
The merge makes inet family route support trivial, so add it
as well here.

Before:
   text	   data	    bss	    dec	    hex	filename
    835	    832	      0	   1667	    683 nft_chain_route_ipv4.ko
    870	    832	      0	   1702	    6a6	nft_chain_route_ipv6.ko
 111568	   2556	    529	 114653	  1bfdd	nf_tables.ko

After:
   text	   data	    bss	    dec	    hex	filename
 113133	   2556	    529	 116218	  1c5fa	nf_tables.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig                |  8 ---
 net/ipv4/netfilter/Makefile               |  1 -
 net/ipv4/netfilter/nft_chain_route_ipv4.c | 89 -------------------------------
 3 files changed, 98 deletions(-)
 delete mode 100644 net/ipv4/netfilter/nft_chain_route_ipv4.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index c98391d49200..ea688832fc4e 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -27,14 +27,6 @@ config NF_TABLES_IPV4
 
 if NF_TABLES_IPV4
 
-config NFT_CHAIN_ROUTE_IPV4
-	tristate "IPv4 nf_tables route chain support"
-	help
-	  This option enables the "route" chain for IPv4 in nf_tables. This
-	  chain type is used to force packet re-routing after mangling header
-	  fields such as the source, destination, type of service and
-	  the packet mark.
-
 config NFT_REJECT_IPV4
 	select NF_REJECT_IPV4
 	default NFT_REJECT
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index e241f5188ebe..2cfdda7b109f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -24,7 +24,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
 $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
 obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
 
-obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
 obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
 obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
 obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
deleted file mode 100644
index 7d82934c46f4..000000000000
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-static unsigned int nf_route_table_hook(void *priv,
-					struct sk_buff *skb,
-					const struct nf_hook_state *state)
-{
-	unsigned int ret;
-	struct nft_pktinfo pkt;
-	u32 mark;
-	__be32 saddr, daddr;
-	u_int8_t tos;
-	const struct iphdr *iph;
-	int err;
-
-	nft_set_pktinfo(&pkt, skb, state);
-	nft_set_pktinfo_ipv4(&pkt, skb);
-
-	mark = skb->mark;
-	iph = ip_hdr(skb);
-	saddr = iph->saddr;
-	daddr = iph->daddr;
-	tos = iph->tos;
-
-	ret = nft_do_chain(&pkt, priv);
-	if (ret != NF_DROP && ret != NF_STOLEN) {
-		iph = ip_hdr(skb);
-
-		if (iph->saddr != saddr ||
-		    iph->daddr != daddr ||
-		    skb->mark != mark ||
-		    iph->tos != tos) {
-			err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
-			if (err < 0)
-				ret = NF_DROP_ERR(err);
-		}
-	}
-	return ret;
-}
-
-static const struct nft_chain_type nft_chain_route_ipv4 = {
-	.name		= "route",
-	.type		= NFT_CHAIN_T_ROUTE,
-	.family		= NFPROTO_IPV4,
-	.owner		= THIS_MODULE,
-	.hook_mask	= (1 << NF_INET_LOCAL_OUT),
-	.hooks		= {
-		[NF_INET_LOCAL_OUT]	= nf_route_table_hook,
-	},
-};
-
-static int __init nft_chain_route_init(void)
-{
-	nft_register_chain_type(&nft_chain_route_ipv4);
-
-	return 0;
-}
-
-static void __exit nft_chain_route_exit(void)
-{
-	nft_unregister_chain_type(&nft_chain_route_ipv4);
-}
-
-module_init(nft_chain_route_init);
-module_exit(nft_chain_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
-- 
cgit 


From bdf004677107e3b847c5db09c9fbf8edefa24996 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:26 -0700
Subject: net: Replace nhc_has_gw with nhc_gw_family

Allow the gateway in a fib_nh_common to be from a different address
family than the outer fib{6}_nh. To that end, replace nhc_has_gw with
nhc_gw_family and update users of nhc_has_gw to check nhc_gw_family.
Now nhc_family is used to know if the nh_common is part of a fib_nh
or fib6_nh (used for container_of to get to route family specific data),
and nhc_gw_family represents the address family for the gateway.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 25 +++++++++++--------------
 net/ipv4/route.c         |  5 +++--
 2 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8e0cb1687a74..e11f78c6373f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -513,7 +513,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 	nh->fib_nh_oif = cfg->fc_oif;
 	if (cfg->fc_gw) {
 		nh->fib_nh_gw4 = cfg->fc_gw;
-		nh->fib_nh_has_gw = 1;
+		nh->fib_nh_gw_family = AF_INET;
 	}
 	nh->fib_nh_flags = cfg->fc_flags;
 
@@ -1238,7 +1238,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 				       "Route with host scope can not have multiple nexthops");
 			goto err_inval;
 		}
-		if (nh->fib_nh_gw4) {
+		if (nh->fib_nh_gw_family) {
 			NL_SET_ERR_MSG(extack,
 				       "Route with host scope can not have a gateway");
 			goto err_inval;
@@ -1341,18 +1341,15 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
 		rcu_read_unlock();
 	}
 
-	if (nhc->nhc_has_gw) {
-		switch (nhc->nhc_family) {
-		case AF_INET:
-			if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
-				goto nla_put_failure;
-			break;
-		case AF_INET6:
-			if (nla_put_in6_addr(skb, RTA_GATEWAY,
-					     &nhc->nhc_gw.ipv6) < 0)
-				goto nla_put_failure;
-			break;
-		}
+	switch (nhc->nhc_gw_family) {
+	case AF_INET:
+		if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
+			goto nla_put_failure;
+		break;
+	case AF_INET6:
+		if (nla_put_in6_addr(skb, RTA_GATEWAY, &nhc->nhc_gw.ipv6) < 0)
+			goto nla_put_failure;
+		break;
 	}
 
 	*flags |= (nhc->nhc_flags & RTNH_F_ONLINK);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3f2adf630d4..e7338e421796 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1734,8 +1734,9 @@ static int __mkroute_input(struct sk_buff *skb,
 	do_cache = res->fi && !itag;
 	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
 	    skb->protocol == htons(ETH_P_IP)) {
-		__be32 gw = nhc->nhc_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
+		__be32 gw;
 
+		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
 		if (IN_DEV_SHARED_MEDIA(out_dev) ||
 		    inet_addr_onlink(out_dev, saddr, gw))
 			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
@@ -2284,7 +2285,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 		} else {
 			if (unlikely(fl4->flowi4_flags &
 				     FLOWI_FLAG_KNOWN_NH &&
-				     !(nhc->nhc_has_gw &&
+				     !(nhc->nhc_gw_family &&
 				       nhc->nhc_scope == RT_SCOPE_LINK))) {
 				do_cache = false;
 				goto add;
-- 
cgit 


From 1550c171935d264f522581fd037db5e64a716bb6 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:27 -0700
Subject: ipv4: Prepare rtable for IPv6 gateway

To allow the gateway to be either an IPv4 or IPv6 address, remove
rt_uses_gateway from rtable and replace with rt_gw_family. If
rt_gw_family is set it implies rt_uses_gateway. Rename rt_gateway
to rt_gw4 to represent the IPv4 version.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c |  4 ++--
 net/ipv4/ip_forward.c           |  2 +-
 net/ipv4/ip_output.c            |  2 +-
 net/ipv4/route.c                | 51 ++++++++++++++++++++++-------------------
 net/ipv4/xfrm4_policy.c         |  5 ++--
 5 files changed, 34 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6ea523d71947..a175e3e7ae97 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -564,7 +564,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+	if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
 		goto route_err;
 	rcu_read_unlock();
 	return &rt->dst;
@@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+	if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
 		goto route_err;
 	return &rt->dst;
 
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 00ec819f949b..06f6f280b9ff 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb)
 
 	rt = skb_rtable(skb);
 
-	if (opt->is_strictroute && rt->rt_uses_gateway)
+	if (opt->is_strictroute && rt->rt_gw_family)
 		goto sr_failed;
 
 	IPCB(skb)->flags |= IPSKB_FORWARDED;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 10b35328cfbc..a2bd4a6d9e6b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -472,7 +472,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 	skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
-	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
+	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family)
 		goto no_route;
 
 	/* OK, we know where to send it, allocate and build IP header. */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e7338e421796..b77b4950d0c7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -434,14 +434,13 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 					   struct sk_buff *skb,
 					   const void *daddr)
 {
+	const struct rtable *rt = container_of(dst, struct rtable, dst);
 	struct net_device *dev = dst->dev;
 	const __be32 *pkey = daddr;
-	const struct rtable *rt;
 	struct neighbour *n;
 
-	rt = (const struct rtable *) dst;
-	if (rt->rt_gateway)
-		pkey = (const __be32 *) &rt->rt_gateway;
+	if (rt->rt_gw_family == AF_INET)
+		pkey = (const __be32 *) &rt->rt_gw4;
 	else if (skb)
 		pkey = &ip_hdr(skb)->daddr;
 
@@ -453,13 +452,12 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 
 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 {
+	const struct rtable *rt = container_of(dst, struct rtable, dst);
 	struct net_device *dev = dst->dev;
 	const __be32 *pkey = daddr;
-	const struct rtable *rt;
 
-	rt = (const struct rtable *)dst;
-	if (rt->rt_gateway)
-		pkey = (const __be32 *)&rt->rt_gateway;
+	if (rt->rt_gw_family == AF_INET)
+		pkey = (const __be32 *)&rt->rt_gw4;
 	else if (!daddr ||
 		 (rt->rt_flags &
 		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
@@ -629,8 +627,8 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
 
 	if (fnhe->fnhe_gw) {
 		rt->rt_flags |= RTCF_REDIRECTED;
-		rt->rt_gateway = fnhe->fnhe_gw;
-		rt->rt_uses_gateway = 1;
+		rt->rt_gw_family = AF_INET;
+		rt->rt_gw4 = fnhe->fnhe_gw;
 	}
 }
 
@@ -747,7 +745,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 		return;
 	}
 
-	if (rt->rt_gateway != old_gw)
+	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 		return;
 
 	in_dev = __in_dev_get_rcu(dev);
@@ -1282,7 +1280,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	mtu = READ_ONCE(dst->dev->mtu);
 
 	if (unlikely(ip_mtu_locked(dst))) {
-		if (rt->rt_uses_gateway && mtu > 576)
+		if (rt->rt_gw_family && mtu > 576)
 			mtu = 576;
 	}
 
@@ -1410,8 +1408,10 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 			orig = NULL;
 		}
 		fill_route_from_fnhe(rt, fnhe);
-		if (!rt->rt_gateway)
-			rt->rt_gateway = daddr;
+		if (!rt->rt_gw4) {
+			rt->rt_gw4 = daddr;
+			rt->rt_gw_family = AF_INET;
+		}
 
 		if (do_cache) {
 			dst_hold(&rt->dst);
@@ -1538,8 +1538,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
 
 		if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) {
-			rt->rt_gateway = nh->fib_nh_gw4;
-			rt->rt_uses_gateway = 1;
+			rt->rt_gw4 = nh->fib_nh_gw4;
+			rt->rt_gw_family = AF_INET;
 		}
 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
 
@@ -1557,8 +1557,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 			 * However, if we are unsuccessful at storing this
 			 * route into the cache we really need to set it.
 			 */
-			if (!rt->rt_gateway)
-				rt->rt_gateway = daddr;
+			if (!rt->rt_gw4) {
+				rt->rt_gw_family = AF_INET;
+				rt->rt_gw4 = daddr;
+			}
 			rt_add_uncached_list(rt);
 		}
 	} else
@@ -1591,8 +1593,8 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 		rt->rt_iif = 0;
 		rt->rt_pmtu = 0;
 		rt->rt_mtu_locked = 0;
-		rt->rt_gateway = 0;
-		rt->rt_uses_gateway = 0;
+		rt->rt_gw_family = 0;
+		rt->rt_gw4 = 0;
 		INIT_LIST_HEAD(&rt->rt_uncached);
 
 		rt->dst.output = ip_output;
@@ -2595,8 +2597,9 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_genid = rt_genid_ipv4(net);
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
-		rt->rt_gateway = ort->rt_gateway;
-		rt->rt_uses_gateway = ort->rt_uses_gateway;
+		rt->rt_gw_family = ort->rt_gw_family;
+		if (rt->rt_gw_family == AF_INET)
+			rt->rt_gw4 = ort->rt_gw4;
 
 		INIT_LIST_HEAD(&rt->rt_uncached);
 	}
@@ -2675,8 +2678,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
 			goto nla_put_failure;
 	}
-	if (rt->rt_uses_gateway &&
-	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
+	if (rt->rt_gw_family == AF_INET &&
+	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4))
 		goto nla_put_failure;
 
 	expires = rt->dst.expires;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index d73a6d6652f6..ee53a91526e5 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -97,8 +97,9 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
-	xdst->u.rt.rt_gateway = rt->rt_gateway;
-	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
+	xdst->u.rt.rt_gw_family = rt->rt_gw_family;
+	if (rt->rt_gw_family == AF_INET)
+		xdst->u.rt.rt_gw4 = rt->rt_gw4;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 	xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
 	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
-- 
cgit 


From f35b794b3b405e2478654ea875bc0b29fe1a1bc5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:28 -0700
Subject: ipv4: Prepare fib_config for IPv6 gateway

Similar to rtable, fib_config needs to allow the gateway to be either an
IPv4 or an IPv6 address. To that end, rename fc_gw to fc_gw4 to mean an
IPv4 address and add fc_gw_family. Checks on 'is a gateway set' are changed
to see if fc_gw_family is set. In the process prepare the code for a
fc_gw_family == AF_INET6.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c  |  8 +++++---
 net/ipv4/fib_semantics.c | 40 ++++++++++++++++++++++++++--------------
 2 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 15f779bd26b3..f99a2ec32505 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -558,7 +558,8 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 	if (rt->rt_gateway.sa_family == AF_INET && addr) {
 		unsigned int addr_type;
 
-		cfg->fc_gw = addr;
+		cfg->fc_gw4 = addr;
+		cfg->fc_gw_family = AF_INET;
 		addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
 		if (rt->rt_flags & RTF_GATEWAY &&
 		    addr_type == RTN_UNICAST)
@@ -568,7 +569,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 	if (cmd == SIOCDELRT)
 		return 0;
 
-	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
+	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
 		return -EINVAL;
 
 	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
@@ -708,7 +709,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 			cfg->fc_oif = nla_get_u32(attr);
 			break;
 		case RTA_GATEWAY:
-			cfg->fc_gw = nla_get_be32(attr);
+			cfg->fc_gw_family = AF_INET;
+			cfg->fc_gw4 = nla_get_be32(attr);
 			break;
 		case RTA_VIA:
 			NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute");
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e11f78c6373f..d3e26e55f2e1 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -511,8 +511,8 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 		goto init_failure;
 
 	nh->fib_nh_oif = cfg->fc_oif;
-	if (cfg->fc_gw) {
-		nh->fib_nh_gw4 = cfg->fc_gw;
+	if (cfg->fc_gw_family == AF_INET) {
+		nh->fib_nh_gw4 = cfg->fc_gw4;
 		nh->fib_nh_gw_family = AF_INET;
 	}
 	nh->fib_nh_flags = cfg->fc_flags;
@@ -589,8 +589,10 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
-			if (nla)
-				fib_cfg.fc_gw = nla_get_in_addr(nla);
+			if (nla) {
+				fib_cfg.fc_gw_family = AF_INET;
+				fib_cfg.fc_gw4 = nla_get_in_addr(nla);
+			}
 
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
 			if (nla)
@@ -616,10 +618,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			       "Nexthop device index does not match RTA_OIF");
 		goto errout;
 	}
-	if (cfg->fc_gw && fi->fib_nh->fib_nh_gw4 != cfg->fc_gw) {
-		NL_SET_ERR_MSG(extack,
-			       "Nexthop gateway does not match RTA_GATEWAY");
-		goto errout;
+	if (cfg->fc_gw_family) {
+		if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family ||
+		    (cfg->fc_gw_family == AF_INET &&
+		     fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4)) {
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop gateway does not match RTA_GATEWAY");
+			goto errout;
+		}
 	}
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
@@ -719,7 +725,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
 		return 1;
 
-	if (cfg->fc_oif || cfg->fc_gw) {
+	if (cfg->fc_oif || cfg->fc_gw_family) {
 		if (cfg->fc_encap) {
 			if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
 					    fi->fib_nh, cfg, extack))
@@ -730,10 +736,16 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 		    cfg->fc_flow != fi->fib_nh->nh_tclassid)
 			return 1;
 #endif
-		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->fib_nh_oif) &&
-		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->fib_nh_gw4))
-			return 0;
-		return 1;
+		if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) ||
+		    (cfg->fc_gw_family &&
+		     cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family))
+			return 1;
+
+		if (cfg->fc_gw_family == AF_INET &&
+		    cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4)
+			return 1;
+
+		return 0;
 	}
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1204,7 +1216,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		goto failure;
 
 	if (fib_props[cfg->fc_type].error) {
-		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) {
+		if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) {
 			NL_SET_ERR_MSG(extack,
 				       "Gateway, device and multipath can not be specified for this route type");
 			goto err_inval;
-- 
cgit 


From 0f5f7d7bf6e6bda4dffe7b42812a16ada6ea9816 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:29 -0700
Subject: ipv4: Add support to rtable for ipv6 gateway

Add support for an IPv6 gateway to rtable. Since a gateway is either
IPv4 or IPv6, make it a union with rt_gw4 where rt_gw_family decides
which address is in use.

When dumping the route data, encode an ipv6 nexthop using RTA_VIA.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c        | 31 ++++++++++++++++++++++++++-----
 net/ipv4/xfrm4_policy.c |  2 ++
 2 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b77b4950d0c7..6e58acf0a87b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1535,14 +1535,20 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 
 	if (fi) {
 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
-		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
+		struct fib_nh *nh;
 
-		if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) {
-			rt->rt_gw4 = nh->fib_nh_gw4;
-			rt->rt_gw_family = AF_INET;
+		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
+			rt->rt_gw_family = nhc->nhc_gw_family;
+			/* only INET and INET6 are supported */
+			if (likely(nhc->nhc_gw_family == AF_INET))
+				rt->rt_gw4 = nhc->nhc_gw.ipv4;
+			else
+				rt->rt_gw6 = nhc->nhc_gw.ipv6;
 		}
+
 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
 
+		nh = container_of(nhc, struct fib_nh, nh_common);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
@@ -2600,6 +2606,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_gw_family = ort->rt_gw_family;
 		if (rt->rt_gw_family == AF_INET)
 			rt->rt_gw4 = ort->rt_gw4;
+		else if (rt->rt_gw_family == AF_INET6)
+			rt->rt_gw6 = ort->rt_gw6;
 
 		INIT_LIST_HEAD(&rt->rt_uncached);
 	}
@@ -2679,8 +2687,21 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 			goto nla_put_failure;
 	}
 	if (rt->rt_gw_family == AF_INET &&
-	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4))
+	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
 		goto nla_put_failure;
+	} else if (rt->rt_gw_family == AF_INET6) {
+		int alen = sizeof(struct in6_addr);
+		struct nlattr *nla;
+		struct rtvia *via;
+
+		nla = nla_reserve(skb, RTA_VIA, alen + 2);
+		if (!nla)
+			goto nla_put_failure;
+
+		via = nla_data(nla);
+		via->rtvia_family = AF_INET6;
+		memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+	}
 
 	expires = rt->dst.expires;
 	if (expires) {
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index ee53a91526e5..72d19b1838ed 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -100,6 +100,8 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_gw_family = rt->rt_gw_family;
 	if (rt->rt_gw_family == AF_INET)
 		xdst->u.rt.rt_gw4 = rt->rt_gw4;
+	else if (rt->rt_gw_family == AF_INET6)
+		xdst->u.rt.rt_gw6 = rt->rt_gw6;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 	xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
 	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
-- 
cgit 


From a4ea5d43c807be28545625c1e0641905022fa0d1 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:30 -0700
Subject: ipv4: Add support to fib_config for IPv6 gateway

Add support for an IPv6 gateway to fib_config. Since a gateway is either
IPv4 or IPv6, make it a union with fc_gw4 where fc_gw_family decides
which address is in use. Update current checks on family and gw4 to
handle ipv6 as well.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d3e26e55f2e1..680b5a9a911a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -276,7 +276,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 
 	for_nexthops(fi) {
 		if (nh->fib_nh_oif != onh->fib_nh_oif ||
-		    nh->fib_nh_gw4 != onh->fib_nh_gw4 ||
+		    nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
 		    nh->fib_nh_scope != onh->fib_nh_scope ||
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		    nh->fib_nh_weight != onh->fib_nh_weight ||
@@ -287,6 +287,15 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 		    lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
 		    ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
 			return -1;
+
+		if (nh->fib_nh_gw_family == AF_INET &&
+		    nh->fib_nh_gw4 != onh->fib_nh_gw4)
+			return -1;
+
+		if (nh->fib_nh_gw_family == AF_INET6 &&
+		    ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
+			return -1;
+
 		onh++;
 	} endfor_nexthops(fi);
 	return 0;
@@ -511,10 +520,12 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 		goto init_failure;
 
 	nh->fib_nh_oif = cfg->fc_oif;
-	if (cfg->fc_gw_family == AF_INET) {
+	nh->fib_nh_gw_family = cfg->fc_gw_family;
+	if (cfg->fc_gw_family == AF_INET)
 		nh->fib_nh_gw4 = cfg->fc_gw4;
-		nh->fib_nh_gw_family = AF_INET;
-	}
+	else if (cfg->fc_gw_family == AF_INET6)
+		nh->fib_nh_gw6 = cfg->fc_gw6;
+
 	nh->fib_nh_flags = cfg->fc_flags;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -621,9 +632,11 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 	if (cfg->fc_gw_family) {
 		if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family ||
 		    (cfg->fc_gw_family == AF_INET &&
-		     fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4)) {
+		     fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) ||
+		    (cfg->fc_gw_family == AF_INET6 &&
+		     ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) {
 			NL_SET_ERR_MSG(extack,
-				       "Nexthop gateway does not match RTA_GATEWAY");
+				       "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
 			goto errout;
 		}
 	}
@@ -745,6 +758,10 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 		    cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4)
 			return 1;
 
+		if (cfg->fc_gw_family == AF_INET6 &&
+		    ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6))
+			return 1;
+
 		return 0;
 	}
 
-- 
cgit 


From 448d7248191706cbbd7761e3bc72c2985c4d38a7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:31 -0700
Subject: ipv4: Refactor fib_check_nh

fib_check_nh is currently huge covering multiple uses cases - device only,
device + gateway, and device + gateway with ONLINK. The next patch adds
validation checks for IPv6 which only further complicates it. So, break
fib_check_nh into 2 helpers - one for gateway validation and one for device
only.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 234 +++++++++++++++++++++++++----------------------
 1 file changed, 125 insertions(+), 109 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 680b5a9a911a..32ce6e6202d2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -885,134 +885,150 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
  *					|
  *					|-> {local prefix} (terminal node)
  */
-static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
-			struct netlink_ext_ack *extack)
+static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
+			      u8 scope, struct netlink_ext_ack *extack)
 {
-	int err = 0;
-	struct net *net;
 	struct net_device *dev;
+	struct fib_result res;
+	int err;
 
-	net = cfg->fc_nlinfo.nl_net;
-	if (nh->fib_nh_gw4) {
-		struct fib_result res;
-
-		if (nh->fib_nh_flags & RTNH_F_ONLINK) {
-			unsigned int addr_type;
+	if (nh->fib_nh_flags & RTNH_F_ONLINK) {
+		unsigned int addr_type;
 
-			if (cfg->fc_scope >= RT_SCOPE_LINK) {
-				NL_SET_ERR_MSG(extack,
-					       "Nexthop has invalid scope");
-				return -EINVAL;
-			}
-			dev = __dev_get_by_index(net, nh->fib_nh_oif);
-			if (!dev) {
-				NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
-				return -ENODEV;
-			}
-			if (!(dev->flags & IFF_UP)) {
-				NL_SET_ERR_MSG(extack,
-					       "Nexthop device is not up");
-				return -ENETDOWN;
-			}
-			addr_type = inet_addr_type_dev_table(net, dev,
-							     nh->fib_nh_gw4);
-			if (addr_type != RTN_UNICAST) {
-				NL_SET_ERR_MSG(extack,
-					       "Nexthop has invalid gateway");
-				return -EINVAL;
-			}
-			if (!netif_carrier_ok(dev))
-				nh->fib_nh_flags |= RTNH_F_LINKDOWN;
-			nh->fib_nh_dev = dev;
-			dev_hold(dev);
-			nh->fib_nh_scope = RT_SCOPE_LINK;
-			return 0;
+		if (scope >= RT_SCOPE_LINK) {
+			NL_SET_ERR_MSG(extack, "Nexthop has invalid scope");
+			return -EINVAL;
 		}
-		rcu_read_lock();
-		{
-			struct fib_table *tbl = NULL;
-			struct flowi4 fl4 = {
-				.daddr = nh->fib_nh_gw4,
-				.flowi4_scope = cfg->fc_scope + 1,
-				.flowi4_oif = nh->fib_nh_oif,
-				.flowi4_iif = LOOPBACK_IFINDEX,
-			};
-
-			/* It is not necessary, but requires a bit of thinking */
-			if (fl4.flowi4_scope < RT_SCOPE_LINK)
-				fl4.flowi4_scope = RT_SCOPE_LINK;
-
-			if (cfg->fc_table)
-				tbl = fib_get_table(net, cfg->fc_table);
-
-			if (tbl)
-				err = fib_table_lookup(tbl, &fl4, &res,
-						       FIB_LOOKUP_IGNORE_LINKSTATE |
-						       FIB_LOOKUP_NOREF);
-
-			/* on error or if no table given do full lookup. This
-			 * is needed for example when nexthops are in the local
-			 * table rather than the given table
-			 */
-			if (!tbl || err) {
-				err = fib_lookup(net, &fl4, &res,
-						 FIB_LOOKUP_IGNORE_LINKSTATE);
-			}
-
-			if (err) {
-				NL_SET_ERR_MSG(extack,
-					       "Nexthop has invalid gateway");
-				rcu_read_unlock();
-				return err;
-			}
+		dev = __dev_get_by_index(net, nh->fib_nh_oif);
+		if (!dev) {
+			NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
+			return -ENODEV;
 		}
-		err = -EINVAL;
-		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
-			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
-			goto out;
+		if (!(dev->flags & IFF_UP)) {
+			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+			return -ENETDOWN;
 		}
-		nh->fib_nh_scope = res.scope;
-		nh->fib_nh_oif = FIB_RES_OIF(res);
-		nh->fib_nh_dev = dev = FIB_RES_DEV(res);
-		if (!dev) {
-			NL_SET_ERR_MSG(extack,
-				       "No egress device for nexthop gateway");
-			goto out;
+		addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4);
+		if (addr_type != RTN_UNICAST) {
+			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+			return -EINVAL;
 		}
-		dev_hold(dev);
 		if (!netif_carrier_ok(dev))
 			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
-		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
-	} else {
-		struct in_device *in_dev;
-
-		if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
-			NL_SET_ERR_MSG(extack,
-				       "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
-			return -EINVAL;
+		nh->fib_nh_dev = dev;
+		dev_hold(dev);
+		nh->fib_nh_scope = RT_SCOPE_LINK;
+		return 0;
+	}
+	rcu_read_lock();
+	{
+		struct fib_table *tbl = NULL;
+		struct flowi4 fl4 = {
+			.daddr = nh->fib_nh_gw4,
+			.flowi4_scope = scope + 1,
+			.flowi4_oif = nh->fib_nh_oif,
+			.flowi4_iif = LOOPBACK_IFINDEX,
+		};
+
+		/* It is not necessary, but requires a bit of thinking */
+		if (fl4.flowi4_scope < RT_SCOPE_LINK)
+			fl4.flowi4_scope = RT_SCOPE_LINK;
+
+		if (table)
+			tbl = fib_get_table(net, table);
+
+		if (tbl)
+			err = fib_table_lookup(tbl, &fl4, &res,
+					       FIB_LOOKUP_IGNORE_LINKSTATE |
+					       FIB_LOOKUP_NOREF);
+
+		/* on error or if no table given do full lookup. This
+		 * is needed for example when nexthops are in the local
+		 * table rather than the given table
+		 */
+		if (!tbl || err) {
+			err = fib_lookup(net, &fl4, &res,
+					 FIB_LOOKUP_IGNORE_LINKSTATE);
 		}
-		rcu_read_lock();
-		err = -ENODEV;
-		in_dev = inetdev_by_index(net, nh->fib_nh_oif);
-		if (!in_dev)
-			goto out;
-		err = -ENETDOWN;
-		if (!(in_dev->dev->flags & IFF_UP)) {
-			NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
+
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
 			goto out;
 		}
-		nh->fib_nh_dev = in_dev->dev;
-		dev_hold(nh->fib_nh_dev);
-		nh->fib_nh_scope = RT_SCOPE_HOST;
-		if (!netif_carrier_ok(nh->fib_nh_dev))
-			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
-		err = 0;
 	}
+
+	err = -EINVAL;
+	if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
+		NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+		goto out;
+	}
+	nh->fib_nh_scope = res.scope;
+	nh->fib_nh_oif = FIB_RES_OIF(res);
+	nh->fib_nh_dev = dev = FIB_RES_DEV(res);
+	if (!dev) {
+		NL_SET_ERR_MSG(extack,
+			       "No egress device for nexthop gateway");
+		goto out;
+	}
+	dev_hold(dev);
+	if (!netif_carrier_ok(dev))
+		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+	err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
 out:
 	rcu_read_unlock();
 	return err;
 }
 
+static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
+			      struct netlink_ext_ack *extack)
+{
+	struct in_device *in_dev;
+	int err;
+
+	if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
+		return -EINVAL;
+	}
+
+	rcu_read_lock();
+
+	err = -ENODEV;
+	in_dev = inetdev_by_index(net, nh->fib_nh_oif);
+	if (!in_dev)
+		goto out;
+	err = -ENETDOWN;
+	if (!(in_dev->dev->flags & IFF_UP)) {
+		NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
+		goto out;
+	}
+
+	nh->fib_nh_dev = in_dev->dev;
+	dev_hold(nh->fib_nh_dev);
+	nh->fib_nh_scope = RT_SCOPE_HOST;
+	if (!netif_carrier_ok(nh->fib_nh_dev))
+		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+	err = 0;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
+			struct netlink_ext_ack *extack)
+{
+	struct net *net = cfg->fc_nlinfo.nl_net;
+	u32 table = cfg->fc_table;
+	int err;
+
+	if (nh->fib_nh_gw_family == AF_INET)
+		err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack);
+	else
+		err = fib_check_nh_nongw(net, nh, extack);
+
+	return err;
+}
+
 static inline unsigned int fib_laddr_hashfn(__be32 val)
 {
 	unsigned int mask = (fib_info_hash_size - 1);
-- 
cgit 


From 717a8f5b2923c44da9157e145c294c4343a5f6de Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:32 -0700
Subject: ipv4: Add fib_check_nh_v6_gw

Add helper to use fib6_nh_init to validate a nexthop spec with an IPv6
gateway.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 32ce6e6202d2..dd95725c318e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -41,6 +41,7 @@
 #include <net/tcp.h>
 #include <net/sock.h>
 #include <net/ip_fib.h>
+#include <net/ip6_fib.h>
 #include <net/netlink.h>
 #include <net/nexthop.h>
 #include <net/lwtunnel.h>
@@ -841,6 +842,30 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
 	return true;
 }
 
+static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
+			      u32 table, struct netlink_ext_ack *extack)
+{
+	struct fib6_config cfg = {
+		.fc_table = table,
+		.fc_flags = nh->fib_nh_flags | RTF_GATEWAY,
+		.fc_ifindex = nh->fib_nh_oif,
+		.fc_gateway = nh->fib_nh_gw6,
+	};
+	struct fib6_nh fib6_nh = {};
+	int err;
+
+	err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack);
+	if (!err) {
+		nh->fib_nh_dev = fib6_nh.fib_nh_dev;
+		dev_hold(nh->fib_nh_dev);
+		nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
+		nh->fib_nh_scope = RT_SCOPE_LINK;
+
+		ipv6_stub->fib6_nh_release(&fib6_nh);
+	}
+
+	return err;
+}
 
 /*
  * Picture
@@ -1023,6 +1048,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
 
 	if (nh->fib_nh_gw_family == AF_INET)
 		err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack);
+	else if (nh->fib_nh_gw_family == AF_INET6)
+		err = fib_check_nh_v6_gw(net, nh, table, extack);
 	else
 		err = fib_check_nh_nongw(net, nh, extack);
 
-- 
cgit 


From 0353f28231c79416191326810e7fe656b69c63b7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:33 -0700
Subject: neighbor: Add skip_cache argument to neigh_output

A later patch allows an IPv6 gateway with an IPv4 route. The neighbor
entry will exist in the v6 ndisc table and the cached header will contain
the ipv6 protocol which is wrong for an IPv4 packet. For an IPv4 packet to
use the v6 neighbor entry, neigh_output needs to skip the cached header
and just use the output callback for the neigh entry.

A future patchset can look at expanding the hh_cache to handle 2
protocols. For now, IPv6 gateways with an IPv4 route will take the
extra overhead of generating the header.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a2bd4a6d9e6b..cca4892b8cb2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -226,7 +226,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 		int res;
 
 		sock_confirm_neigh(skb, neigh);
-		res = neigh_output(neigh, skb);
+		res = neigh_output(neigh, skb, false);
 
 		rcu_read_unlock_bh();
 		return res;
-- 
cgit 


From 5c9f7c1dfc2e0776551ef1ceb335187c6698d1ff Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:34 -0700
Subject: ipv4: Add helpers for neigh lookup for nexthop

A common theme in the output path is looking up a neigh entry for a
nexthop, either the gateway in an rtable or a fallback to the daddr
in the skb:

        nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr);
        neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);

To allow the nexthop to be an IPv6 address we need to consider the
family of the nexthop and then call __ipv{4,6}_neigh_lookup_noref based
on it.

To make this simpler, add a ip_neigh_gw4 helper similar to ip_neigh_gw6
added in an earlier patch which handles:

        neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);

And then add a second one, ip_neigh_for_gw, that calls either
ip_neigh_gw4 or ip_neigh_gw6 based on the address family of the gateway.

Update the output paths in the VRF driver and core v4 code to use
ip_neigh_for_gw simplifying the family based lookup and making both
ready for a v6 nexthop.

ipv4_neigh_lookup has a different need - the potential to resolve a
passed in address in addition to any gateway in the rtable or skb. Since
this is a one-off, add ip_neigh_gw4 and ip_neigh_gw6 diectly. The
difference between __neigh_create used by the helpers and neigh_create
called by ipv4_neigh_lookup is taking a refcount, so add rcu_read_lock_bh
and bump the refcnt on the neigh entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 11 ++++-------
 net/ipv4/route.c     | 29 +++++++++++++++++++----------
 2 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cca4892b8cb2..4e42c1974ba2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -188,7 +188,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 	struct net_device *dev = dst->dev;
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
 	struct neighbour *neigh;
-	u32 nexthop;
+	bool is_v6gw = false;
 
 	if (rt->rt_type == RTN_MULTICAST) {
 		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
@@ -218,16 +218,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 	}
 
 	rcu_read_lock_bh();
-	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
-	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
-	if (unlikely(!neigh))
-		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
 	if (!IS_ERR(neigh)) {
 		int res;
 
 		sock_confirm_neigh(skb, neigh);
-		res = neigh_output(neigh, skb, false);
-
+		/* if crossing protocols, can not use the cached header */
+		res = neigh_output(neigh, skb, is_v6gw);
 		rcu_read_unlock_bh();
 		return res;
 	}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6e58acf0a87b..32ecb4c1c7e3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -436,18 +436,27 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 {
 	const struct rtable *rt = container_of(dst, struct rtable, dst);
 	struct net_device *dev = dst->dev;
-	const __be32 *pkey = daddr;
 	struct neighbour *n;
 
-	if (rt->rt_gw_family == AF_INET)
-		pkey = (const __be32 *) &rt->rt_gw4;
-	else if (skb)
-		pkey = &ip_hdr(skb)->daddr;
-
-	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
-	if (n)
-		return n;
-	return neigh_create(&arp_tbl, pkey, dev);
+	rcu_read_lock_bh();
+
+	if (likely(rt->rt_gw_family == AF_INET)) {
+		n = ip_neigh_gw4(dev, rt->rt_gw4);
+	} else if (rt->rt_gw_family == AF_INET6) {
+		n = ip_neigh_gw6(dev, &rt->rt_gw6);
+        } else {
+		__be32 pkey;
+
+		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
+		n = ip_neigh_gw4(dev, pkey);
+	}
+
+	if (n && !refcount_inc_not_zero(&n->refcnt))
+		n = NULL;
+
+	rcu_read_unlock_bh();
+
+	return n;
 }
 
 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
-- 
cgit 


From 6de9c0557e4fc7e1b2f8ed6178aad32f64e1d7da Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:36 -0700
Subject: ipv4: Handle ipv6 gateway in ipv4_confirm_neigh

Update ipv4_confirm_neigh to handle an ipv6 gateway.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 32ecb4c1c7e3..efa6a36cbfff 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -465,13 +465,15 @@ static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 	struct net_device *dev = dst->dev;
 	const __be32 *pkey = daddr;
 
-	if (rt->rt_gw_family == AF_INET)
+	if (rt->rt_gw_family == AF_INET) {
 		pkey = (const __be32 *)&rt->rt_gw4;
-	else if (!daddr ||
+	} else if (rt->rt_gw_family == AF_INET6) {
+		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
+	} else if (!daddr ||
 		 (rt->rt_flags &
-		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 		return;
-
+	}
 	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 }
 
-- 
cgit 


From 619d1826269b4be5c992bec0029bbfcc823d663d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:37 -0700
Subject: ipv4: Handle ipv6 gateway in fib_detect_death

Update fib_detect_death to handle an ipv6 gateway.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index dd95725c318e..e5a6d431bfab 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -457,10 +457,18 @@ static int fib_detect_death(struct fib_info *fi, int order,
 			    struct fib_info **last_resort, int *last_idx,
 			    int dflt)
 {
+	const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
 	struct neighbour *n;
 	int state = NUD_NONE;
 
-	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].fib_nh_gw4, fi->fib_dev);
+	if (likely(nhc->nhc_gw_family == AF_INET))
+		n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev);
+	else if (nhc->nhc_gw_family == AF_INET6)
+		n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6,
+				 nhc->nhc_dev);
+	else
+		n = NULL;
+
 	if (n) {
 		state = n->nud_state;
 		neigh_release(n);
-- 
cgit 


From 1a38c43d319e745cf12055a266a1f459e2ba9ec3 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:38 -0700
Subject: ipv4: Handle ipv6 gateway in fib_good_nh

Update fib_good_nh to handle an ipv6 gateway.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e5a6d431bfab..c1ea138335a2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1909,8 +1909,14 @@ static bool fib_good_nh(const struct fib_nh *nh)
 
 		rcu_read_lock_bh();
 
-		n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
-					      (__force u32)nh->fib_nh_gw4);
+		if (likely(nh->fib_nh_gw_family == AF_INET))
+			n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+						   (__force u32)nh->fib_nh_gw4);
+		else if (nh->fib_nh_gw_family == AF_INET6)
+			n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev,
+							   &nh->fib_nh_gw6);
+		else
+			n = NULL;
 		if (n)
 			state = n->nud_state;
 
-- 
cgit 


From 19a9d136f198cd7c4e26ea6897a0cf067d3f7ecb Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:39 -0700
Subject: ipv4: Flag fib_info with a fib_nh using IPv6 gateway

Until support is added to the offload drivers, they need to be able to
reject routes with an IPv6 gateway. To that end add a flag to fib_info
that indicates if any fib_nh has a v6 gateway. The flag allows the drivers
to efficiently know the use of a v6 gateway without walking all fib_nh
tied to a fib_info each time a route is added.

Update mlxsw and rocker to reject the routes with extack message as to why.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c1ea138335a2..4a968e24507b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1349,6 +1349,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 
 	change_nexthops(fi) {
 		fib_info_update_nh_saddr(net, nexthop_nh);
+		if (nexthop_nh->fib_nh_gw_family == AF_INET6)
+			fi->fib_nh_is_v6 = true;
 	} endfor_nexthops(fi)
 
 	fib_rebalance(fi);
-- 
cgit 


From d15662682db232da77136cd348f4c9df312ca6f9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:40 -0700
Subject: ipv4: Allow ipv6 gateway with ipv4 routes

Add support for RTA_VIA and allow an IPv6 nexthop for v4 routes:
   $ ip ro add 172.16.1.0/24 via inet6 2001:db8::1 dev eth0
   $ ip ro ls
   ...
   172.16.1.0/24 via inet6 2001:db8::1 dev eth0

For convenience and simplicity, userspace can use RTA_VIA to specify
AF_INET or AF_INET6 gateway.

The common fib_nexthop_info dump function compares the gateway address
family to the nh_common family to know if the gateway should be encoded
as RTA_VIA or RTA_GATEWAY.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c  | 60 ++++++++++++++++++++++++++++++++++++++---
 net/ipv4/fib_semantics.c | 69 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 121 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f99a2ec32505..310060e67790 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -665,10 +665,55 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
+int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
+		    struct netlink_ext_ack *extack)
+{
+	struct rtvia *via;
+	int alen;
+
+	if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
+		NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
+		return -EINVAL;
+	}
+
+	via = nla_data(nla);
+	alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
+
+	switch (via->rtvia_family) {
+	case AF_INET:
+		if (alen != sizeof(__be32)) {
+			NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
+			return -EINVAL;
+		}
+		cfg->fc_gw_family = AF_INET;
+		cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
+		break;
+	case AF_INET6:
+#ifdef CONFIG_IPV6
+		if (alen != sizeof(struct in6_addr)) {
+			NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
+			return -EINVAL;
+		}
+		cfg->fc_gw_family = AF_INET6;
+		cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
+#else
+		NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+		return -EINVAL;
+#endif
+		break;
+	default:
+		NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 			     struct nlmsghdr *nlh, struct fib_config *cfg,
 			     struct netlink_ext_ack *extack)
 {
+	bool has_gw = false, has_via = false;
 	struct nlattr *attr;
 	int err, remaining;
 	struct rtmsg *rtm;
@@ -709,13 +754,16 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 			cfg->fc_oif = nla_get_u32(attr);
 			break;
 		case RTA_GATEWAY:
+			has_gw = true;
 			cfg->fc_gw_family = AF_INET;
 			cfg->fc_gw4 = nla_get_be32(attr);
 			break;
 		case RTA_VIA:
-			NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute");
-			err = -EINVAL;
-			goto errout;
+			has_via = true;
+			err = fib_gw_from_via(cfg, attr, extack);
+			if (err)
+				goto errout;
+			break;
 		case RTA_PRIORITY:
 			cfg->fc_priority = nla_get_u32(attr);
 			break;
@@ -754,6 +802,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 		}
 	}
 
+	if (has_gw && has_via) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop configuration can not contain both GATEWAY and VIA");
+		goto errout;
+	}
+
 	return 0;
 errout:
 	return err;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4a968e24507b..017273885eee 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -606,12 +606,22 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 
 		attrlen = rtnh_attrlen(rtnh);
 		if (attrlen > 0) {
-			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+			struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+			nlav = nla_find(attrs, attrlen, RTA_VIA);
+			if (nla && nlav) {
+				NL_SET_ERR_MSG(extack,
+					       "Nexthop configuration can not contain both GATEWAY and VIA");
+				return -EINVAL;
+			}
 			if (nla) {
 				fib_cfg.fc_gw_family = AF_INET;
 				fib_cfg.fc_gw4 = nla_get_in_addr(nla);
+			} else if (nlav) {
+				ret = fib_gw_from_via(&fib_cfg, nlav, extack);
+				if (ret)
+					goto errout;
 			}
 
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
@@ -792,11 +802,43 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 
 		attrlen = rtnh_attrlen(rtnh);
 		if (attrlen > 0) {
-			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+			struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
-			if (nla && nla_get_in_addr(nla) != nh->fib_nh_gw4)
-				return 1;
+			nlav = nla_find(attrs, attrlen, RTA_VIA);
+			if (nla && nlav) {
+				NL_SET_ERR_MSG(extack,
+					       "Nexthop configuration can not contain both GATEWAY and VIA");
+				return -EINVAL;
+			}
+
+			if (nla) {
+				if (nh->fib_nh_gw_family != AF_INET ||
+				    nla_get_in_addr(nla) != nh->fib_nh_gw4)
+					return 1;
+			} else if (nlav) {
+				struct fib_config cfg2;
+				int err;
+
+				err = fib_gw_from_via(&cfg2, nlav, extack);
+				if (err)
+					return err;
+
+				switch (nh->fib_nh_gw_family) {
+				case AF_INET:
+					if (cfg2.fc_gw_family != AF_INET ||
+					    cfg2.fc_gw4 != nh->fib_nh_gw4)
+						return 1;
+					break;
+				case AF_INET6:
+					if (cfg2.fc_gw_family != AF_INET6 ||
+					    ipv6_addr_cmp(&cfg2.fc_gw6,
+							  &nh->fib_nh_gw6))
+						return 1;
+					break;
+				}
+			}
+
 #ifdef CONFIG_IP_ROUTE_CLASSID
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
@@ -1429,8 +1471,25 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
 			goto nla_put_failure;
 		break;
 	case AF_INET6:
-		if (nla_put_in6_addr(skb, RTA_GATEWAY, &nhc->nhc_gw.ipv6) < 0)
+		/* if gateway family does not match nexthop family
+		 * gateway is encoded as RTA_VIA
+		 */
+		if (nhc->nhc_gw_family != nhc->nhc_family) {
+			int alen = sizeof(struct in6_addr);
+			struct nlattr *nla;
+			struct rtvia *via;
+
+			nla = nla_reserve(skb, RTA_VIA, alen + 2);
+			if (!nla)
+				goto nla_put_failure;
+
+			via = nla_data(nla);
+			via->rtvia_family = AF_INET6;
+			memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen);
+		} else if (nla_put_in6_addr(skb, RTA_GATEWAY,
+					    &nhc->nhc_gw.ipv6) < 0) {
 			goto nla_put_failure;
+		}
 		break;
 	}
 
-- 
cgit 


From d73f80f921fd323af8f35644fb9f3b129f465f66 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 10 Apr 2019 10:05:51 -0700
Subject: ipv4: Handle RTA_GATEWAY set to 0

Govindarajulu reported a regression with Network Manager which sends an
RTA_GATEWAY attribute with the address set to 0. Fixup the handling of
RTA_GATEWAY to only set fc_gw_family if the gateway address is actually
set.

Fixes: f35b794b3b405 ("ipv4: Prepare fib_config for IPv6 gateway")
Reported-by: Govindarajulu Varadarajan <govind.varadar@gmail.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c  | 3 ++-
 net/ipv4/fib_semantics.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 310060e67790..d4b63f94f7be 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -755,8 +755,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 			break;
 		case RTA_GATEWAY:
 			has_gw = true;
-			cfg->fc_gw_family = AF_INET;
 			cfg->fc_gw4 = nla_get_be32(attr);
+			if (cfg->fc_gw4)
+				cfg->fc_gw_family = AF_INET;
 			break;
 		case RTA_VIA:
 			has_via = true;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 017273885eee..779d2be2b135 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -616,8 +616,9 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 				return -EINVAL;
 			}
 			if (nla) {
-				fib_cfg.fc_gw_family = AF_INET;
 				fib_cfg.fc_gw4 = nla_get_in_addr(nla);
+				if (fib_cfg.fc_gw4)
+					fib_cfg.fc_gw_family = AF_INET;
 			} else if (nlav) {
 				ret = fib_gw_from_via(&fib_cfg, nlav, extack);
 				if (ret)
-- 
cgit 


From c9d52f216922425b56b002100b75de34b62b11a0 Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Tue, 9 Apr 2019 09:59:07 +0200
Subject: fou: correct spelling of encapsulation

Correct spelling of encapsulation.
Found by inspection.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 100e63f57ea6..d2a2f3258e4b 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -136,7 +136,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
 		break;
 
 	case 1: {
-		/* Direct encasulation of IPv4 or IPv6 */
+		/* Direct encapsulation of IPv4 or IPv6 */
 
 		int prot;
 
@@ -1137,7 +1137,7 @@ static int gue_err(struct sk_buff *skb, u32 info)
 	case 0: /* Full GUE header present */
 		break;
 	case 1: {
-		/* Direct encasulation of IPv4 or IPv6 */
+		/* Direct encapsulation of IPv4 or IPv6 */
 		skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
 
 		switch (((struct iphdr *)guehdr)->version) {
-- 
cgit 


From 526bb57a6ad6b0ed6de34b3c5eabf394b248618f Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 9 Apr 2019 12:03:07 +0200
Subject: net: fou: remove redundant code in gue_udp_recv

Remove not useful protocol version check in gue_udp_recv since just
gue version 0 can hit that code. Moreover remove duplicated hdrlen
computation

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index d2a2f3258e4b..b038f563baa4 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -170,9 +170,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
 	/* guehdr may change after pull */
 	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
 
-	hdrlen = sizeof(struct guehdr) + optlen;
-
-	if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen))
+	if (validate_gue_flags(guehdr, optlen))
 		goto drop;
 
 	hdrlen = sizeof(struct guehdr) + optlen;
-- 
cgit 


From bf8981a2aa082d9d64771b47c8a1c9c388d8cd40 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 9 Apr 2019 10:44:06 +0200
Subject: netfilter: nf_nat: merge ip/ip6 masquerade headers

Both are now implemented by nf_nat_masquerade.c, so no need to keep
different headers.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index fd3f9e8a74da..0a2bffb6a0ad 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -22,7 +22,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter/x_tables.h>
 #include <net/netfilter/nf_nat.h>
-#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+#include <net/netfilter/nf_nat_masquerade.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-- 
cgit 


From adf82accc5f526f1e812f1a8df7292fef7dad19a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 9 Apr 2019 10:44:07 +0200
Subject: netfilter: x_tables: merge ip and ipv6 masquerade modules

No need to have separate modules for this.
before:
 text    data   bss    dec  filename
 2038    1168     0   3206  net/ipv4/netfilter/ipt_MASQUERADE.ko
 1526    1024     0   2550  net/ipv6/netfilter/ip6t_MASQUERADE.ko
after:
 text    data   bss    dec  filename
 2521    1296     0   3817  net/netfilter/xt_MASQUERADE.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig          |  12 ++---
 net/ipv4/netfilter/Makefile         |   1 -
 net/ipv4/netfilter/ipt_MASQUERADE.c | 101 ------------------------------------
 3 files changed, 3 insertions(+), 111 deletions(-)
 delete mode 100644 net/ipv4/netfilter/ipt_MASQUERADE.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ea688832fc4e..1412b029f37f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -224,16 +224,10 @@ if IP_NF_NAT
 
 config IP_NF_TARGET_MASQUERADE
 	tristate "MASQUERADE target support"
-	select NF_NAT_MASQUERADE
-	default m if NETFILTER_ADVANCED=n
+	select NETFILTER_XT_TARGET_MASQUERADE
 	help
-	  Masquerading is a special case of NAT: all outgoing connections are
-	  changed to seem to come from a particular interface's address, and
-	  if the interface goes down, those connections are lost.  This is
-	  only useful for dialup accounts with dynamic IP address (ie. your IP
-	  address will be different on next dialup).
-
-	  To compile it as a module, choose M here.  If unsure, say N.
+	  This is a backwards-compat option for the user's convenience
+	  (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE.
 
 config IP_NF_TARGET_NETMAP
 	tristate "NETMAP target support"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 2cfdda7b109f..c50e0ec095d2 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
 # targets
 obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
 obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
-obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
 obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
 
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
deleted file mode 100644
index 0a2bffb6a0ad..000000000000
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Masquerade.  Simple mapping which alters range to a local IP address
-   (depending on route). */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/types.h>
-#include <linux/inetdevice.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_masquerade.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
-
-/* FIXME: Multiple targets. --RR */
-static int masquerade_tg_check(const struct xt_tgchk_param *par)
-{
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-
-	if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
-		pr_debug("bad MAP_IPS.\n");
-		return -EINVAL;
-	}
-	if (mr->rangesize != 1) {
-		pr_debug("bad rangesize %u\n", mr->rangesize);
-		return -EINVAL;
-	}
-	return nf_ct_netns_get(par->net, par->family);
-}
-
-static unsigned int
-masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
-	struct nf_nat_range2 range;
-	const struct nf_nat_ipv4_multi_range_compat *mr;
-
-	mr = par->targinfo;
-	range.flags = mr->range[0].flags;
-	range.min_proto = mr->range[0].min;
-	range.max_proto = mr->range[0].max;
-
-	return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
-				      xt_out(par));
-}
-
-static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
-{
-	nf_ct_netns_put(par->net, par->family);
-}
-
-static struct xt_target masquerade_tg_reg __read_mostly = {
-	.name		= "MASQUERADE",
-	.family		= NFPROTO_IPV4,
-	.target		= masquerade_tg,
-	.targetsize	= sizeof(struct nf_nat_ipv4_multi_range_compat),
-	.table		= "nat",
-	.hooks		= 1 << NF_INET_POST_ROUTING,
-	.checkentry	= masquerade_tg_check,
-	.destroy	= masquerade_tg_destroy,
-	.me		= THIS_MODULE,
-};
-
-static int __init masquerade_tg_init(void)
-{
-	int ret;
-
-	ret = xt_register_target(&masquerade_tg_reg);
-	if (ret)
-		return ret;
-
-	ret = nf_nat_masquerade_ipv4_register_notifier();
-	if (ret)
-		xt_unregister_target(&masquerade_tg_reg);
-
-	return ret;
-}
-
-static void __exit masquerade_tg_exit(void)
-{
-	xt_unregister_target(&masquerade_tg_reg);
-	nf_nat_masquerade_ipv4_unregister_notifier();
-}
-
-module_init(masquerade_tg_init);
-module_exit(masquerade_tg_exit);
-- 
cgit 


From c7cbdbf29f488a19982cd9f4a109887f18028bbb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 17 Apr 2019 22:51:48 +0200
Subject: net: rework SIOCGSTAMP ioctl handling

The SIOCGSTAMP/SIOCGSTAMPNS ioctl commands are implemented by many
socket protocol handlers, and all of those end up calling the same
sock_get_timestamp()/sock_get_timestampns() helper functions, which
results in a lot of duplicate code.

With the introduction of 64-bit time_t on 32-bit architectures, this
gets worse, as we then need four different ioctl commands in each
socket protocol implementation.

To simplify that, let's add a new .gettstamp() operation in
struct proto_ops, and move ioctl implementation into the common
sock_ioctl()/compat_sock_ioctl_trans() functions that these all go
through.

We can reuse the sock_get_timestamp() implementation, but generalize
it so it can deal with both native and compat mode, as well as
timeval and timespec structures.

Acked-by: Stefan Schmidt <stefan@datenfreihafen.org>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/lkml/CAK8P3a038aDQQotzua_QtKGhq8O9n+rdiz2=WDCp82ys8eUT+A@mail.gmail.com/
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 08a8430f5647..5183a2daba64 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -915,12 +915,6 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	struct rtentry rt;
 
 	switch (cmd) {
-	case SIOCGSTAMP:
-		err = sock_get_timestamp(sk, (struct timeval __user *)arg);
-		break;
-	case SIOCGSTAMPNS:
-		err = sock_get_timestampns(sk, (struct timespec __user *)arg);
-		break;
 	case SIOCADDRT:
 	case SIOCDELRT:
 		if (copy_from_user(&rt, p, sizeof(struct rtentry)))
@@ -992,6 +986,7 @@ const struct proto_ops inet_stream_ops = {
 	.getname	   = inet_getname,
 	.poll		   = tcp_poll,
 	.ioctl		   = inet_ioctl,
+	.gettstamp	   = sock_gettstamp,
 	.listen		   = inet_listen,
 	.shutdown	   = inet_shutdown,
 	.setsockopt	   = sock_common_setsockopt,
@@ -1027,6 +1022,7 @@ const struct proto_ops inet_dgram_ops = {
 	.getname	   = inet_getname,
 	.poll		   = udp_poll,
 	.ioctl		   = inet_ioctl,
+	.gettstamp	   = sock_gettstamp,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
 	.setsockopt	   = sock_common_setsockopt,
@@ -1059,6 +1055,7 @@ static const struct proto_ops inet_sockraw_ops = {
 	.getname	   = inet_getname,
 	.poll		   = datagram_poll,
 	.ioctl		   = inet_ioctl,
+	.gettstamp	   = sock_gettstamp,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
 	.setsockopt	   = sock_common_setsockopt,
-- 
cgit 


From d7cc399e1227e74e44f78847d9732a228b46cc91 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 19 Apr 2019 16:02:03 -0700
Subject: tcp: properly reset skb->truesize for tx recycling

tcp sendmsg() and sendpage() normally advance skb->data_len
and skb->truesize by the payload added to an skb.

But sendmsg(fd, ..., MSG_ZEROCOPY) has to account for whole pages,
even if a single byte of payload is used in the page.

This means that we can not assume skb->truesize can be adjusted
by skb->data_len. We must instead overwrite its value.

Otherwise skb->truesize is too big and can hit socket sndbuf limit,
especially if the skb is recycled multiple times :/

Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 603e770d59b3..f7567a3698eb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -868,7 +868,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 	if (likely(!size)) {
 		skb = sk->sk_tx_skb_cache;
 		if (skb && !skb_cloned(skb)) {
-			skb->truesize -= skb->data_len;
+			skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 			sk->sk_tx_skb_cache = NULL;
 			pskb_trim(skb, 0);
 			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
-- 
cgit 


From 3c618c1dbb8859625c643121ac80af9a6723533f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 20 Apr 2019 09:28:20 -0700
Subject: net: Rename net/nexthop.h net/rtnh.h

The header contains rtnh_ macros so rename the file accordingly.
Allows a later patch to use the nexthop.h name for the new
nexthop code.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 2 +-
 net/ipv4/ipmr.c          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 779d2be2b135..b5230c4a1c16 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -43,7 +43,7 @@
 #include <net/ip_fib.h>
 #include <net/ip6_fib.h>
 #include <net/netlink.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
 #include <net/lwtunnel.h>
 #include <net/fib_notifier.h>
 #include <net/addrconf.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9a3f13edc98e..a8eb97777c0a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -66,7 +66,7 @@
 #include <net/netlink.h>
 #include <net/fib_rules.h>
 #include <linux/netconf.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
 
 #include <linux/nospec.h>
 
-- 
cgit 


From 3557b3fdeefacdd111469f90db1a0602902c9698 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sat, 20 Apr 2019 23:29:47 -0400
Subject: net: bpfilter: dont use module_init in non-modular code

The Kconfig controlling this code is:

bpfilter/Kconfig:menuconfig BPFILTER
bpfilter/Kconfig:   bool "BPF based packet filtering framework (BPFILTER)"

Since it isn't a module, we shouldn't use module_init().  Instead we
use device_initcall() - which is exactly what module_init() defaults
to for non-modular code/builds.

We don't remove <linux/module.h> from the includes since this file does
a request_module() and hence is a valid user of that header file, even
though it is not modular itself.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/bpfilter/sockopt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 1e976bb93d99..15427163a041 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -77,5 +77,4 @@ static int __init bpfilter_sockopt_init(void)
 
 	return 0;
 }
-
-module_init(bpfilter_sockopt_init);
+device_initcall(bpfilter_sockopt_init);
-- 
cgit 


From f24ea52873c726bf7b54318f00ec45050222b367 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Apr 2019 16:44:37 +0200
Subject: xfrm: remove tos indirection from afinfo_policy

Only used by ipv4, we can read the fl4 tos value directly instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_policy.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index d73a6d6652f6..244d26baa3af 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -69,11 +69,6 @@ static int xfrm4_get_saddr(struct net *net, int oif,
 	return 0;
 }
 
-static int xfrm4_get_tos(const struct flowi *fl)
-{
-	return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
-}
-
 static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 			   int nfheader_len)
 {
@@ -272,7 +267,6 @@ static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
 	.decode_session =	_decode_session4,
-	.get_tos =		xfrm4_get_tos,
 	.init_path =		xfrm4_init_path,
 	.fill_dst =		xfrm4_fill_dst,
 	.blackhole_route =	ipv4_blackhole_route,
-- 
cgit 


From 2e8b4aa816eaaf480fe68b1086614259caf1bf3c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Apr 2019 16:44:38 +0200
Subject: xfrm: remove init_path indirection from afinfo_policy

handle this directly, its only used by ipv6.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_policy.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 244d26baa3af..6e89378668ae 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -69,12 +69,6 @@ static int xfrm4_get_saddr(struct net *net, int oif,
 	return 0;
 }
 
-static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
-			   int nfheader_len)
-{
-	return 0;
-}
-
 static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 			  const struct flowi *fl)
 {
@@ -267,7 +261,6 @@ static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
 	.decode_session =	_decode_session4,
-	.init_path =		xfrm4_init_path,
 	.fill_dst =		xfrm4_fill_dst,
 	.blackhole_route =	ipv4_blackhole_route,
 };
-- 
cgit 


From c53ac41e3720926301c623d6682bb87ce992a3b3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Apr 2019 16:44:39 +0200
Subject: xfrm: remove decode_session indirection from afinfo_policy

No external dependencies, might as well handle this directly.
xfrm_afinfo_policy is now 40 bytes on x86_64.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_policy.c | 114 ------------------------------------------------
 1 file changed, 114 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6e89378668ae..414ab0420604 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -12,7 +12,6 @@
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/inetdevice.h>
-#include <linux/if_tunnel.h>
 #include <net/dst.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -96,118 +95,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	return 0;
 }
 
-static void
-_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-	u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
-	struct flowi4 *fl4 = &fl->u.ip4;
-	int oif = 0;
-
-	if (skb_dst(skb))
-		oif = skb_dst(skb)->dev->ifindex;
-
-	memset(fl4, 0, sizeof(struct flowi4));
-	fl4->flowi4_mark = skb->mark;
-	fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
-
-	if (!ip_is_fragment(iph)) {
-		switch (iph->protocol) {
-		case IPPROTO_UDP:
-		case IPPROTO_UDPLITE:
-		case IPPROTO_TCP:
-		case IPPROTO_SCTP:
-		case IPPROTO_DCCP:
-			if (xprth + 4 < skb->data ||
-			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
-				__be16 *ports;
-
-				xprth = skb_network_header(skb) + iph->ihl * 4;
-				ports = (__be16 *)xprth;
-
-				fl4->fl4_sport = ports[!!reverse];
-				fl4->fl4_dport = ports[!reverse];
-			}
-			break;
-
-		case IPPROTO_ICMP:
-			if (xprth + 2 < skb->data ||
-			    pskb_may_pull(skb, xprth + 2 - skb->data)) {
-				u8 *icmp;
-
-				xprth = skb_network_header(skb) + iph->ihl * 4;
-				icmp = xprth;
-
-				fl4->fl4_icmp_type = icmp[0];
-				fl4->fl4_icmp_code = icmp[1];
-			}
-			break;
-
-		case IPPROTO_ESP:
-			if (xprth + 4 < skb->data ||
-			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
-				__be32 *ehdr;
-
-				xprth = skb_network_header(skb) + iph->ihl * 4;
-				ehdr = (__be32 *)xprth;
-
-				fl4->fl4_ipsec_spi = ehdr[0];
-			}
-			break;
-
-		case IPPROTO_AH:
-			if (xprth + 8 < skb->data ||
-			    pskb_may_pull(skb, xprth + 8 - skb->data)) {
-				__be32 *ah_hdr;
-
-				xprth = skb_network_header(skb) + iph->ihl * 4;
-				ah_hdr = (__be32 *)xprth;
-
-				fl4->fl4_ipsec_spi = ah_hdr[1];
-			}
-			break;
-
-		case IPPROTO_COMP:
-			if (xprth + 4 < skb->data ||
-			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
-				__be16 *ipcomp_hdr;
-
-				xprth = skb_network_header(skb) + iph->ihl * 4;
-				ipcomp_hdr = (__be16 *)xprth;
-
-				fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
-			}
-			break;
-
-		case IPPROTO_GRE:
-			if (xprth + 12 < skb->data ||
-			    pskb_may_pull(skb, xprth + 12 - skb->data)) {
-				__be16 *greflags;
-				__be32 *gre_hdr;
-
-				xprth = skb_network_header(skb) + iph->ihl * 4;
-				greflags = (__be16 *)xprth;
-				gre_hdr = (__be32 *)xprth;
-
-				if (greflags[0] & GRE_KEY) {
-					if (greflags[0] & GRE_CSUM)
-						gre_hdr++;
-					fl4->fl4_gre_key = gre_hdr[1];
-				}
-			}
-			break;
-
-		default:
-			fl4->fl4_ipsec_spi = 0;
-			break;
-		}
-	}
-	fl4->flowi4_proto = iph->protocol;
-	fl4->daddr = reverse ? iph->saddr : iph->daddr;
-	fl4->saddr = reverse ? iph->daddr : iph->saddr;
-	fl4->flowi4_tos = iph->tos;
-}
-
 static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
 			      struct sk_buff *skb, u32 mtu)
 {
@@ -260,7 +147,6 @@ static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.dst_ops =		&xfrm4_dst_ops_template,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
-	.decode_session =	_decode_session4,
 	.fill_dst =		xfrm4_fill_dst,
 	.blackhole_route =	ipv4_blackhole_route,
 };
-- 
cgit 


From bb9cd077e216b886438c5698e1cd75f762ecd3c9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 17 Apr 2019 11:45:13 +0200
Subject: xfrm: remove unneeded export_symbols

None of them have any external callers, make them static.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_protocol.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 35c54865dc42..bcab48944c15 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -46,7 +46,7 @@ static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
 	     handler != NULL;				\
 	     handler = rcu_dereference(handler->next))	\
 
-int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+static int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
 {
 	int ret;
 	struct xfrm4_protocol *handler;
@@ -61,7 +61,6 @@ int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
 
 	return 0;
 }
-EXPORT_SYMBOL(xfrm4_rcv_cb);
 
 int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
 		    int encap_type)
-- 
cgit 


From ffa8ce54be3aaf6b15abae3bbd08282b867d3a4f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 23 Apr 2019 08:23:41 -0700
Subject: lwtunnel: Pass encap and encap type attributes to lwtunnel_fill_encap

Currently, lwtunnel_fill_encap hardcodes the encap and encap type
attributes as RTA_ENCAP and RTA_ENCAP_TYPE, respectively. The nexthop
objects want to re-use this code but the encap attributes passed to
userspace as NHA_ENCAP and NHA_ENCAP_TYPE. Since that is the only
difference, change lwtunnel_fill_encap to take the attribute type as
an input.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b5230c4a1c16..c695e629fac2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1503,7 +1503,8 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
 		goto nla_put_failure;
 
 	if (nhc->nhc_lwtstate &&
-	    lwtunnel_fill_encap(skb, nhc->nhc_lwtstate) < 0)
+	    lwtunnel_fill_encap(skb, nhc->nhc_lwtstate,
+				RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
 		goto nla_put_failure;
 
 	return 0;
-- 
cgit 


From ecc5663cce8c7d7e4eba32af4e1e3cab296c64b9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 23 Apr 2019 08:48:09 -0700
Subject: net: Change nhc_flags to unsigned char

nhc_flags holds the RTNH_F flags for a given nexthop (fib{6}_nh).
All of the RTNH_F_ flags fit in an unsigned char, and since the API to
userspace (rtnh_flags and lower byte of rtm_flags) is 1 byte it can not
grow. Make nhc_flags in fib_nh_common an unsigned char and shrink the
size of the struct by 8, from 56 to 48 bytes.

Update the flags arguments for up netdevice events and fib_nexthop_info
which determines the RTNH_F flags to return on a dump/event. The RTNH_F
flags are passed in the lower byte of rtm_flags which is an unsigned int
so use a temp variable for the flags to fib_nexthop_info and combine
with rtm_flags in the caller.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c695e629fac2..4336f1ec8ab0 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1444,7 +1444,7 @@ failure:
 }
 
 int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
-		     unsigned int *flags, bool skip_oif)
+		     unsigned char *flags, bool skip_oif)
 {
 	if (nhc->nhc_flags & RTNH_F_DEAD)
 		*flags |= RTNH_F_DEAD;
@@ -1520,7 +1520,7 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
 {
 	const struct net_device *dev = nhc->nhc_dev;
 	struct rtnexthop *rtnh;
-	unsigned int flags = 0;
+	unsigned char flags = 0;
 
 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
 	if (!rtnh)
@@ -1619,7 +1619,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		goto nla_put_failure;
 	if (fi->fib_nhs == 1) {
 		struct fib_nh *nh = &fi->fib_nh[0];
-		unsigned int flags = 0;
+		unsigned char flags = 0;
 
 		if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0)
 			goto nla_put_failure;
@@ -1902,7 +1902,7 @@ out:
  * Dead device goes up. We wake up dead nexthops.
  * It takes sense only on multipath routes.
  */
-int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
+int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
 {
 	struct fib_info *prev_fi;
 	unsigned int hash;
-- 
cgit 


From ae0be8de9a53cda3505865c11826d8ff0640237c Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Fri, 26 Apr 2019 11:13:06 +0200
Subject: netlink: make nla_nest_start() add NLA_F_NESTED flag

Even if the NLA_F_NESTED flag was introduced more than 11 years ago, most
netlink based interfaces (including recently added ones) are still not
setting it in kernel generated messages. Without the flag, message parsers
not aware of attribute semantics (e.g. wireshark dissector or libmnl's
mnl_nlmsg_fprintf()) cannot recognize nested attributes and won't display
the structure of their contents.

Unfortunately we cannot just add the flag everywhere as there may be
userspace applications which check nlattr::nla_type directly rather than
through a helper masking out the flags. Therefore the patch renames
nla_nest_start() to nla_nest_start_noflag() and introduces nla_nest_start()
as a wrapper adding NLA_F_NESTED. The calls which add NLA_F_NESTED manually
are rewritten to use nla_nest_start().

Except for changes in include/net/netlink.h, the patch was generated using
this semantic patch:

@@ expression E1, E2; @@
-nla_nest_start(E1, E2)
+nla_nest_start_noflag(E1, E2)

@@ expression E1, E2; @@
-nla_nest_start_noflag(E1, E2 | NLA_F_NESTED)
+nla_nest_start(E1, E2)

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 2 +-
 net/ipv4/ipmr.c          | 6 +++---
 net/ipv4/ipmr_base.c     | 2 +-
 net/ipv4/tcp_metrics.c   | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4336f1ec8ab0..71c2165a2ce3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1550,7 +1550,7 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
 {
 	struct nlattr *mp;
 
-	mp = nla_nest_start(skb, RTA_MULTIPATH);
+	mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
 	if (!mp)
 		goto nla_put_failure;
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a8eb97777c0a..1322573b8228 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2783,7 +2783,7 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
 		return true;
 
 	vif = &mrt->vif_table[vifid];
-	vif_nest = nla_nest_start(skb, IPMRA_VIF);
+	vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF);
 	if (!vif_nest)
 		return false;
 	if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) ||
@@ -2867,7 +2867,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
 		memset(hdr, 0, sizeof(*hdr));
 		hdr->ifi_family = RTNL_FAMILY_IPMR;
 
-		af = nla_nest_start(skb, IFLA_AF_SPEC);
+		af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
 		if (!af) {
 			nlmsg_cancel(skb, nlh);
 			goto out;
@@ -2878,7 +2878,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
 			goto out;
 		}
 
-		vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS);
+		vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS);
 		if (!vifs) {
 			nla_nest_end(skb, af);
 			nlmsg_end(skb, nlh);
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 3e614cc824f7..278834d4babc 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -228,7 +228,7 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	if (c->mfc_flags & MFC_OFFLOAD)
 		rtm->rtm_flags |= RTNH_F_OFFLOAD;
 
-	mp_attr = nla_nest_start(skb, RTA_MULTIPATH);
+	mp_attr = nla_nest_start_noflag(skb, RTA_MULTIPATH);
 	if (!mp_attr)
 		return -EMSGSIZE;
 
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4ccec4c705f7..9a08bfb0672c 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -658,7 +658,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
 	{
 		int n = 0;
 
-		nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
+		nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS);
 		if (!nest)
 			goto nla_put_failure;
 		for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
-- 
cgit 


From 8cb081746c031fb164089322e2336a0bf5b3070c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:28 +0200
Subject: netlink: make validation more configurable for future strictness

We currently have two levels of strict validation:

 1) liberal (default)
     - undefined (type >= max) & NLA_UNSPEC attributes accepted
     - attribute length >= expected accepted
     - garbage at end of message accepted
 2) strict (opt-in)
     - NLA_UNSPEC attributes accepted
     - attribute length >= expected accepted

Split out parsing strictness into four different options:
 * TRAILING     - check that there's no trailing data after parsing
                  attributes (in message or nested)
 * MAXTYPE      - reject attrs > max known type
 * UNSPEC       - reject attributes with NLA_UNSPEC policy entries
 * STRICT_ATTRS - strictly validate attribute size

The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().

Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.

We end up with the following renames:
 * nla_parse           -> nla_parse_deprecated
 * nla_parse_strict    -> nla_parse_deprecated_strict
 * nlmsg_parse         -> nlmsg_parse_deprecated
 * nlmsg_parse_strict  -> nlmsg_parse_deprecated_strict
 * nla_parse_nested    -> nla_parse_nested_deprecated
 * nla_validate_nested -> nla_validate_nested_deprecated

Using spatch, of course:
    @@
    expression TB, MAX, HEAD, LEN, POL, EXT;
    @@
    -nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
    +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)

    @@
    expression NLH, HDRLEN, TB, MAX, POL, EXT;
    @@
    -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
    +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)

    @@
    expression NLH, HDRLEN, TB, MAX, POL, EXT;
    @@
    -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
    +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)

    @@
    expression TB, MAX, NLA, POL, EXT;
    @@
    -nla_parse_nested(TB, MAX, NLA, POL, EXT)
    +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)

    @@
    expression START, MAX, POL, EXT;
    @@
    -nla_validate_nested(START, MAX, POL, EXT)
    +nla_validate_nested_deprecated(START, MAX, POL, EXT)

    @@
    expression NLH, HDRLEN, MAX, POL, EXT;
    @@
    -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
    +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)

For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.

Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.

Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.

In effect then, this adds fully strict validation for any new command.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c        | 27 +++++++++++++++------------
 net/ipv4/fib_frontend.c   |  8 ++++----
 net/ipv4/ip_tunnel_core.c |  8 ++++----
 net/ipv4/ipmr.c           | 12 ++++++------
 net/ipv4/route.c          |  8 ++++----
 5 files changed, 33 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index eb514f312e6f..701c5d113a34 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -621,8 +621,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	ASSERT_RTNL();
 
-	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy,
-			  extack);
+	err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+				     ifa_ipv4_policy, extack);
 	if (err < 0)
 		goto errout;
 
@@ -793,8 +793,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
 	struct in_device *in_dev;
 	int err;
 
-	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy,
-			  extack);
+	err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+				     ifa_ipv4_policy, extack);
 	if (err < 0)
 		goto errout;
 
@@ -1689,8 +1689,8 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
 		fillargs->flags |= NLM_F_DUMP_FILTERED;
 	}
 
-	err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
-				 ifa_ipv4_policy, extack);
+	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+					    ifa_ipv4_policy, extack);
 	if (err < 0)
 		return err;
 
@@ -1906,7 +1906,8 @@ static int inet_validate_link_af(const struct net_device *dev,
 	if (dev && !__in_dev_get_rcu(dev))
 		return -EAFNOSUPPORT;
 
-	err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL);
+	err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla,
+					  inet_af_policy, NULL);
 	if (err < 0)
 		return err;
 
@@ -1934,7 +1935,7 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
 	if (!in_dev)
 		return -EAFNOSUPPORT;
 
-	if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0)
+	if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0)
 		BUG();
 
 	if (tb[IFLA_INET_CONF]) {
@@ -2076,11 +2077,13 @@ static int inet_netconf_valid_get_req(struct sk_buff *skb,
 	}
 
 	if (!netlink_strict_get_check(skb))
-		return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb,
-				   NETCONFA_MAX, devconf_ipv4_policy, extack);
+		return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg),
+					      tb, NETCONFA_MAX,
+					      devconf_ipv4_policy, extack);
 
-	err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb,
-				 NETCONFA_MAX, devconf_ipv4_policy, extack);
+	err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg),
+					    tb, NETCONFA_MAX,
+					    devconf_ipv4_policy, extack);
 	if (err)
 		return err;
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d4b63f94f7be..b298255f6fdb 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -718,8 +718,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 	int err, remaining;
 	struct rtmsg *rtm;
 
-	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy,
-			     extack);
+	err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
+					rtm_ipv4_policy, extack);
 	if (err < 0)
 		goto errout;
 
@@ -896,8 +896,8 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 	filter->rt_type  = rtm->rtm_type;
 	filter->table_id = rtm->rtm_table;
 
-	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
-				 rtm_ipv4_policy, extack);
+	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+					    rtm_ipv4_policy, extack);
 	if (err < 0)
 		return err;
 
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index c3f3d28d1087..30c1c264bdfc 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -239,8 +239,8 @@ static int ip_tun_build_state(struct nlattr *attr,
 	struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
 	int err;
 
-	err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy,
-			       extack);
+	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
+					  ip_tun_policy, extack);
 	if (err < 0)
 		return err;
 
@@ -356,8 +356,8 @@ static int ip6_tun_build_state(struct nlattr *attr,
 	struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
 	int err;
 
-	err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy,
-			       extack);
+	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
+					  ip6_tun_policy, extack);
 	if (err < 0)
 		return err;
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 1322573b8228..2c61e10a60e3 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2498,8 +2498,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
 	}
 
 	if (!netlink_strict_get_check(skb))
-		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
-				   rtm_ipv4_policy, extack);
+		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+					      rtm_ipv4_policy, extack);
 
 	rtm = nlmsg_data(nlh);
 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
@@ -2510,8 +2510,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
 		return -EINVAL;
 	}
 
-	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
-				 rtm_ipv4_policy, extack);
+	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+					    rtm_ipv4_policy, extack);
 	if (err)
 		return err;
 
@@ -2674,8 +2674,8 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
 	struct rtmsg *rtm;
 	int ret, rem;
 
-	ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy,
-			     extack);
+	ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
+					rtm_ipmr_policy, extack);
 	if (ret < 0)
 		goto out;
 	rtm = nlmsg_data(nlh);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4950adeb05c0..795aed6e4720 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2877,8 +2877,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
 	}
 
 	if (!netlink_strict_get_check(skb))
-		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
-				   rtm_ipv4_policy, extack);
+		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+					      rtm_ipv4_policy, extack);
 
 	rtm = nlmsg_data(nlh);
 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
@@ -2896,8 +2896,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
 		return -EINVAL;
 	}
 
-	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
-				 rtm_ipv4_policy, extack);
+	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+					    rtm_ipv4_policy, extack);
 	if (err)
 		return err;
 
-- 
cgit 


From ef6243acb4782df587a4d7d6c310fa5b5d82684b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:31 +0200
Subject: genetlink: optionally validate strictly/dumps

Add options to strictly validate messages and dump messages,
sometimes perhaps validating dump messages non-strictly may
be required, so add an option for that as well.

Since none of this can really be applied to existing commands,
set the options everwhere using the following spatch:

    @@
    identifier ops;
    expression X;
    @@
    struct genl_ops ops[] = {
    ...,
     {
            .cmd = X,
    +       .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
            ...
     },
    ...
    };

For new commands one should just not copy the .validate 'opt-out'
flags and thus get strict validation.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c         | 3 +++
 net/ipv4/tcp_metrics.c | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 1ca1586a7e46..ca95051317ed 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -913,16 +913,19 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 static const struct genl_ops fou_nl_ops[] = {
 	{
 		.cmd = FOU_CMD_ADD,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = fou_nl_cmd_add_port,
 		.flags = GENL_ADMIN_PERM,
 	},
 	{
 		.cmd = FOU_CMD_DEL,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = fou_nl_cmd_rm_port,
 		.flags = GENL_ADMIN_PERM,
 	},
 	{
 		.cmd = FOU_CMD_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = fou_nl_cmd_get_port,
 		.dumpit = fou_nl_dump,
 	},
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 9a08bfb0672c..f262f2cace29 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -951,11 +951,13 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
 static const struct genl_ops tcp_metrics_nl_ops[] = {
 	{
 		.cmd = TCP_METRICS_CMD_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = tcp_metrics_nl_cmd_get,
 		.dumpit = tcp_metrics_nl_dump,
 	},
 	{
 		.cmd = TCP_METRICS_CMD_DEL,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = tcp_metrics_nl_cmd_del,
 		.flags = GENL_ADMIN_PERM,
 	},
-- 
cgit 


From e1f172e162c0a11721f1188f12e5b4c3f9f80de6 Mon Sep 17 00:00:00 2001
From: Flavio Leitner <fbl@redhat.com>
Date: Wed, 17 Apr 2019 11:46:14 -0300
Subject: netfilter: use macros to create module aliases.

Each NAT helper creates a module alias which follows a pattern.
Use macros for consistency.

Signed-off-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_h323.c | 2 +-
 net/ipv4/netfilter/nf_nat_pptp.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 4e6b53ab6c33..7875c98072eb 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -631,4 +631,4 @@ module_exit(fini);
 MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
 MODULE_DESCRIPTION("H.323 NAT helper");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_h323");
+MODULE_ALIAS_NF_NAT_HELPER("h323");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 68b4d450391b..e17b4ee7604c 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -37,7 +37,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
 MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
-MODULE_ALIAS("ip_nat_pptp");
+MODULE_ALIAS_NF_NAT_HELPER("pptp");
 
 static void pptp_nat_expected(struct nf_conn *ct,
 			      struct nf_conntrack_expect *exp)
-- 
cgit 


From bc9f38c8328e10c22cb2016d6131ea36141c8d11 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 29 Apr 2019 15:46:13 -0700
Subject: tcp: avoid unconditional congestion window undo on SYN retransmit

Previously if an active TCP open has SYN timeout, it always undo the
cwnd upon receiving the SYNACK. This is because tcp_clean_rtx_queue
would reset tp->retrans_stamp when SYN is acked, which fools then
tcp_try_undo_loss and tcp_packet_delayed. Addressing this issue is
required to properly support undo for spurious SYN timeout.

Fixing this is tricky -- for active TCP open tp->retrans_stamp
records the time when the handshake starts, not the first
retransmission time as the name may suggest. The simplest fix is
for tcp_packet_delayed to ensure it is valid before comparing with
other timestamp.

One side effect of this change is active TCP Fast Open that incurred
SYN timeout. Upon receiving a SYN-ACK that only acknowledged
the SYN, it would immediately retransmit unacknowledged data in
tcp_ack() because the data is marked lost after SYN timeout. But
the retransmission would have an incorrect ack sequence number since
rcv_nxt has not been updated yet tcp_rcv_synsent_state_process(), the
retransmission needs to properly handed by tcp_rcv_fastopen_synack()
like before.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 97671bff597a..e2cbfc3ffa3f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2252,7 +2252,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
  */
 static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
 {
-	return !tp->retrans_stamp ||
+	return tp->retrans_stamp &&
 	       tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
 }
 
@@ -3521,7 +3521,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (rexmit == REXMIT_NONE)
+	if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
 		return;
 
 	if (unlikely(rexmit == 2)) {
-- 
cgit 


From 7c1f08154c4e34d10be41156375ce2b8ab591b0f Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 29 Apr 2019 15:46:14 -0700
Subject: tcp: undo initial congestion window on false SYN timeout

Linux implements RFC6298 and use an initial congestion window of 1
upon establishing the connection if the SYN packet is retransmitted 2
or more times. In cellular networks SYN timeouts are often spurious
if the wireless radio was dormant or idle. Also some network path
is longer than the default SYN timeout. Having a minimal cwnd on
both cases are detrimental to TCP startup performance.

This patch extends TCP undo feature (RFC3522 aka TCP Eifel) to detect
spurious SYN timeout via TCP timestamps. Since tp->retrans_stamp
records the initial SYN timestamp instead of first retransmission, we
have to implement a different undo code additionally. The detection
also must happen before tcp_ack() as retrans_stamp is reset when
SYN is acknowledged.

Note this patch covers both active regular and fast open.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c   | 16 ++++++++++++++++
 net/ipv4/tcp_metrics.c |  2 +-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e2cbfc3ffa3f..695f840acc14 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5748,6 +5748,21 @@ static void smc_check_reset_syn(struct tcp_sock *tp)
 #endif
 }
 
+static void tcp_try_undo_spurious_syn(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 syn_stamp;
+
+	/* undo_marker is set when SYN or SYNACK times out. The timeout is
+	 * spurious if the ACK's timestamp option echo value matches the
+	 * original SYN timestamp.
+	 */
+	syn_stamp = tp->retrans_stamp;
+	if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
+	    syn_stamp == tp->rx_opt.rcv_tsecr)
+		tp->undo_marker = 0;
+}
+
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 const struct tcphdr *th)
 {
@@ -5815,6 +5830,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		tcp_ecn_rcv_synack(tp, th);
 
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+		tcp_try_undo_spurious_syn(sk);
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
 
 		/* Ok.. it's good. Set up sequence numbers and
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f262f2cace29..d4d687330e2b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -517,7 +517,7 @@ reset:
 	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
 	 * retransmission has occurred.
 	 */
-	if (tp->total_retrans > 1)
+	if (tp->total_retrans > 1 && tp->undo_marker)
 		tp->snd_cwnd = 1;
 	else
 		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
-- 
cgit 


From 9e450c1ecb027417c99eba651413d2a6ba6ffc1f Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 29 Apr 2019 15:46:15 -0700
Subject: tcp: better SYNACK sent timestamp

Detecting spurious SYNACK timeout using timestamp option requires
recording the exact SYNACK skb timestamp. Previously the SYNACK
sent timestamp was stamped slightly earlier before the skb
was transmitted. This patch uses the SYNACK skb transmission
timestamp directly.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c  | 2 +-
 net/ipv4/tcp_output.c | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 695f840acc14..30c6a42b1f5b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6319,7 +6319,7 @@ static void tcp_openreq_init(struct request_sock *req,
 	req->cookie_ts = 0;
 	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
 	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
-	tcp_rsk(req)->snt_synack = tcp_clock_us();
+	tcp_rsk(req)->snt_synack = 0;
 	tcp_rsk(req)->last_oow_ack_time = 0;
 	req->mss = rx_opt->mss_clamp;
 	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 32061928b054..0c4ed66dc1bf 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3247,7 +3247,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 		skb->skb_mstamp_ns = cookie_init_timestamp(req);
 	else
 #endif
+	{
 		skb->skb_mstamp_ns = tcp_clock_ns();
+		if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
+			tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
+	}
 
 #ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
-- 
cgit 


From 336c39a0315139103712d04b9bfaf0215df23b8e Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 29 Apr 2019 15:46:16 -0700
Subject: tcp: undo init congestion window on false SYNACK timeout

Linux implements RFC6298 and use an initial congestion window
of 1 upon establishing the connection if the SYNACK packet is
retransmitted 2 or more times. In cellular networks SYNACK timeouts
are often spurious if the wireless radio was dormant or idle. Also
some network path is longer than the default SYNACK timeout. In
both cases falsely starting with a minimal cwnd are detrimental
to performance.

This patch avoids doing so when the final ACK's TCP timestamp
indicates the original SYNACK was delivered. It remembers the
original SYNACK timestamp when SYNACK timeout has occurred and
re-uses the function to detect spurious SYN timeout conveniently.

Note that a server may receives multiple SYNs from and immediately
retransmits SYNACKs without any SYNACK timeout. This often happens
on when the client SYNs have timed out due to wireless delay
above. In this case since the server will still use the default
initial congestion (e.g. 10) because tp->undo_marker is reset in
tcp_init_metrics(). This is an intentional design because packets
are not lost but delayed.

This patch only covers regular TCP passive open. Fast Open is
supported in the next patch.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c     | 2 ++
 net/ipv4/tcp_minisocks.c | 5 +++++
 2 files changed, 7 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 30c6a42b1f5b..53b4c5a3113b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6101,6 +6101,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 			 */
 			tcp_rearm_rto(sk);
 		} else {
+			tcp_try_undo_spurious_syn(sk);
+			tp->retrans_stamp = 0;
 			tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
 			tp->copied_seq = tp->rcv_nxt;
 		}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 79900f783e0d..9c2a0d36fb20 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -522,6 +522,11 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->rx_opt.ts_recent_stamp = 0;
 		newtp->tcp_header_len = sizeof(struct tcphdr);
 	}
+	if (req->num_timeout) {
+		newtp->undo_marker = treq->snt_isn;
+		newtp->retrans_stamp = div_u64(treq->snt_synack,
+					       USEC_PER_SEC / TCP_TS_HZ);
+	}
 	newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
 	newtp->md5sig_info = NULL;	/*XXX*/
-- 
cgit 


From 8c3cfe19feac41065bb88bc14b36c318b26847a9 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 29 Apr 2019 15:46:17 -0700
Subject: tcp: lower congestion window on Fast Open SYNACK timeout

TCP sender would use congestion window of 1 packet on the second SYN
and SYNACK timeout except passive TCP Fast Open. This makes passive
TFO too aggressive and unfair during congestion at handshake. This
patch fixes this issue so TCP (fast open or not, passive or active)
always conforms to the RFC6298.

Note that tcp_enter_loss() is called only once during recurring
timeouts.  This is because during handshake, high_seq and snd_una
are the same so tcp_enter_loss() would incorrect set the undo state
variables multiple times.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f0c86398e6a7..2ac23da42dd2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -393,6 +393,9 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
 		tcp_write_err(sk);
 		return;
 	}
+	/* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
+	if (icsk->icsk_retransmits == 1)
+		tcp_enter_loss(sk);
 	/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
 	 * returned from rtx_syn_ack() to make it more persistent like
 	 * regular retransmit because if the child socket has been accepted
-- 
cgit 


From 794200d66273cbfa32cab2dbcd59a5db6b57a5d1 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 29 Apr 2019 15:46:18 -0700
Subject: tcp: undo cwnd on Fast Open spurious SYNACK retransmit

This patch makes passive Fast Open reverts the cwnd to default
initial cwnd (10 packets) if the SYNACK timeout is spurious.

Passive Fast Open uses a full socket during handshake so it can
use the existing undo logic to detect spurious retransmission
by recording the first SYNACK timeout in key state variable
retrans_stamp. Upon receiving the ACK of the SYNACK, if the socket
has sent some data before the timeout, the spurious timeout
is detected by tcp_try_undo_recovery() in tcp_process_loss()
in tcp_ack().

But if the socket has not send any data yet, tcp_ack() does not
execute the undo code since no data is acknowledged. The fix is to
check such case explicitly after tcp_ack() during the ACK processing
in SYN_RECV state. In addition this is checked in FIN_WAIT_1 state
in case the server closes the socket before handshake completes.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53b4c5a3113b..3a40584cb473 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6089,6 +6089,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		 * so release it.
 		 */
 		if (req) {
+			tcp_try_undo_loss(sk, false);
 			inet_csk(sk)->icsk_retransmits = 0;
 			reqsk_fastopen_remove(sk, req, false);
 			/* Re-arm the timer because data may have been sent out.
@@ -6143,6 +6144,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		 * our SYNACK so stop the SYNACK timer.
 		 */
 		if (req) {
+			tcp_try_undo_loss(sk, false);
+			inet_csk(sk)->icsk_retransmits = 0;
 			/* We no longer need the request sock. */
 			reqsk_fastopen_remove(sk, req, false);
 			tcp_rearm_rto(sk);
-- 
cgit 


From 6b94b1c88b660a786fdb1c22d8a0d3529fe40f8c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 29 Apr 2019 15:46:19 -0700
Subject: tcp: refactor to consolidate TFO passive open code

Use a helper to consolidate two identical code block for passive TFO.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 52 +++++++++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3a40584cb473..706a99ec73f6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5989,6 +5989,27 @@ reset_and_undo:
 	return 1;
 }
 
+static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
+{
+	tcp_try_undo_loss(sk, false);
+	inet_csk(sk)->icsk_retransmits = 0;
+
+	/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
+	 * we no longer need req so release it.
+	 */
+	reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false);
+
+	/* Re-arm the timer because data may have been sent out.
+	 * This is similar to the regular data transmission case
+	 * when new data has just been ack'ed.
+	 *
+	 * (TFO) - we could try to be more aggressive and
+	 * retransmitting any data sooner based on when they
+	 * are sent out.
+	 */
+	tcp_rearm_rto(sk);
+}
+
 /*
  *	This function implements the receiving procedure of RFC 793 for
  *	all states except ESTABLISHED and TIME_WAIT.
@@ -6085,22 +6106,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		if (!tp->srtt_us)
 			tcp_synack_rtt_meas(sk, req);
 
-		/* Once we leave TCP_SYN_RECV, we no longer need req
-		 * so release it.
-		 */
 		if (req) {
-			tcp_try_undo_loss(sk, false);
-			inet_csk(sk)->icsk_retransmits = 0;
-			reqsk_fastopen_remove(sk, req, false);
-			/* Re-arm the timer because data may have been sent out.
-			 * This is similar to the regular data transmission case
-			 * when new data has just been ack'ed.
-			 *
-			 * (TFO) - we could try to be more aggressive and
-			 * retransmitting any data sooner based on when they
-			 * are sent out.
-			 */
-			tcp_rearm_rto(sk);
+			tcp_rcv_synrecv_state_fastopen(sk);
 		} else {
 			tcp_try_undo_spurious_syn(sk);
 			tp->retrans_stamp = 0;
@@ -6138,18 +6145,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 	case TCP_FIN_WAIT1: {
 		int tmo;
 
-		/* If we enter the TCP_FIN_WAIT1 state and we are a
-		 * Fast Open socket and this is the first acceptable
-		 * ACK we have received, this would have acknowledged
-		 * our SYNACK so stop the SYNACK timer.
-		 */
-		if (req) {
-			tcp_try_undo_loss(sk, false);
-			inet_csk(sk)->icsk_retransmits = 0;
-			/* We no longer need the request sock. */
-			reqsk_fastopen_remove(sk, req, false);
-			tcp_rearm_rto(sk);
-		}
+		if (req)
+			tcp_rcv_synrecv_state_fastopen(sk);
+
 		if (tp->snd_una != tp->write_seq)
 			break;
 
-- 
cgit 


From 98fa6271cfcb1de873b3fe0caf48d9daa1bcc0ac Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 29 Apr 2019 15:46:20 -0700
Subject: tcp: refactor setting the initial congestion window

Relocate the congestion window initialization from tcp_init_metrics()
to tcp_init_transfer() to improve code readability.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c         | 12 ------------
 net/ipv4/tcp_input.c   | 26 ++++++++++++++++++++++++++
 net/ipv4/tcp_metrics.c | 10 ----------
 3 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f7567a3698eb..1fa15beb8380 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -457,18 +457,6 @@ void tcp_init_sock(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
-void tcp_init_transfer(struct sock *sk, int bpf_op)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
-	tcp_mtup_init(sk);
-	icsk->icsk_af_ops->rebuild_header(sk);
-	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op, 0, NULL);
-	tcp_init_congestion_control(sk);
-	tcp_init_buffer_space(sk);
-}
-
 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
 {
 	struct sk_buff *skb = tcp_write_queue_tail(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 706a99ec73f6..077d9abdfcf5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5647,6 +5647,32 @@ discard:
 }
 EXPORT_SYMBOL(tcp_rcv_established);
 
+void tcp_init_transfer(struct sock *sk, int bpf_op)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_mtup_init(sk);
+	icsk->icsk_af_ops->rebuild_header(sk);
+	tcp_init_metrics(sk);
+
+	/* Initialize the congestion window to start the transfer.
+	 * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+	 * retransmitted. In light of RFC6298 more aggressive 1sec
+	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+	 * retransmission has occurred.
+	 */
+	if (tp->total_retrans > 1 && tp->undo_marker)
+		tp->snd_cwnd = 1;
+	else
+		tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
+	tp->snd_cwnd_stamp = tcp_jiffies32;
+
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
+	tcp_init_congestion_control(sk);
+	tcp_init_buffer_space(sk);
+}
+
 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index d4d687330e2b..c4848e7a0aad 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -512,16 +512,6 @@ reset:
 
 		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
 	}
-	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
-	 * retransmitted. In light of RFC6298 more aggressive 1sec
-	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
-	 * retransmission has occurred.
-	 */
-	if (tp->total_retrans > 1 && tp->undo_marker)
-		tp->snd_cwnd = 1;
-	else
-		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
-	tp->snd_cwnd_stamp = tcp_jiffies32;
 }
 
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
-- 
cgit 


From 7fcd1e033dacedd520abebc943c960dcf5add3ae Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 2 May 2019 15:14:15 -0700
Subject: ipmr_base: Do not reset index in mr_table_dump

e is the counter used to save the location of a dump when an
skb is filled. Once the walk of the table is complete, mr_table_dump
needs to return without resetting that index to 0. Dump of a specific
table is looping because of the reset because there is no way to
indicate the walk of the table is done.

Move the reset to the caller so the dump of each table starts at 0,
but the loop counter is maintained if a dump fills an skb.

Fixes: e1cedae1ba6b0 ("ipmr: Refactor mr_rtm_dumproute")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr_base.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 3e614cc824f7..3a1af50bd0a5 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -335,8 +335,6 @@ next_entry2:
 	}
 	spin_unlock_bh(lock);
 	err = 0;
-	e = 0;
-
 out:
 	cb->args[1] = e;
 	return err;
@@ -374,6 +372,7 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
 		if (err < 0)
 			break;
+		cb->args[1] = 0;
 next_table:
 		t++;
 	}
-- 
cgit 


From 0f457a36626fa94026e483836fbf29e451434567 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 30 Apr 2019 07:45:48 -0700
Subject: ipv4: Move cached routes to fib_nh_common

While the cached routes, nh_pcpu_rth_output and nh_rth_input, are IPv4
specific, a later patch wants to make them accessible for IPv6 nexthops
with IPv4 routes using a fib6_nh. Move the cached routes from fib_nh to
fib_nh_common and update references.

Initialization of the cached entries is moved to fib_nh_common_init,
and free is moved to fib_nh_common_release.

Change in location only, from fib_nh up to fib_nh_common; no functional
change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 36 +++++++++++++++++++-----------------
 net/ipv4/route.c         | 18 +++++++++---------
 2 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 71c2165a2ce3..4402ec6dc426 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -212,6 +212,8 @@ void fib_nh_common_release(struct fib_nh_common *nhc)
 		dev_put(nhc->nhc_dev);
 
 	lwtstate_put(nhc->nhc_lwtstate);
+	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
+	rt_fibinfo_free(&nhc->nhc_rth_input);
 }
 EXPORT_SYMBOL_GPL(fib_nh_common_release);
 
@@ -223,8 +225,6 @@ void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
 #endif
 	fib_nh_common_release(&fib_nh->nh_common);
 	free_nh_exceptions(fib_nh);
-	rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output);
-	rt_fibinfo_free(&fib_nh->nh_rth_input);
 }
 
 /* Release a nexthop info record */
@@ -491,23 +491,35 @@ int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap,
 		       u16 encap_type, void *cfg, gfp_t gfp_flags,
 		       struct netlink_ext_ack *extack)
 {
+	int err;
+
+	nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
+						    gfp_flags);
+	if (!nhc->nhc_pcpu_rth_output)
+		return -ENOMEM;
+
 	if (encap) {
 		struct lwtunnel_state *lwtstate;
-		int err;
 
 		if (encap_type == LWTUNNEL_ENCAP_NONE) {
 			NL_SET_ERR_MSG(extack, "LWT encap type not specified");
-			return -EINVAL;
+			err = -EINVAL;
+			goto lwt_failure;
 		}
 		err = lwtunnel_build_state(encap_type, encap, nhc->nhc_family,
 					   cfg, &lwtstate, extack);
 		if (err)
-			return err;
+			goto lwt_failure;
 
 		nhc->nhc_lwtstate = lwtstate_get(lwtstate);
 	}
 
 	return 0;
+
+lwt_failure:
+	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
+	nhc->nhc_pcpu_rth_output = NULL;
+	return err;
 }
 EXPORT_SYMBOL_GPL(fib_nh_common_init);
 
@@ -515,18 +527,14 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 		struct fib_config *cfg, int nh_weight,
 		struct netlink_ext_ack *extack)
 {
-	int err = -ENOMEM;
+	int err;
 
 	nh->fib_nh_family = AF_INET;
 
-	nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
-	if (!nh->nh_pcpu_rth_output)
-		goto err_out;
-
 	err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap,
 				 cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
 	if (err)
-		goto init_failure;
+		return err;
 
 	nh->fib_nh_oif = cfg->fc_oif;
 	nh->fib_nh_gw_family = cfg->fc_gw_family;
@@ -546,12 +554,6 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
 	nh->fib_nh_weight = nh_weight;
 #endif
 	return 0;
-
-init_failure:
-	rt_fibinfo_free_cpus(nh->nh_pcpu_rth_output);
-	nh->nh_pcpu_rth_output = NULL;
-err_out:
-	return err;
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 795aed6e4720..662ac9bd956e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -646,6 +646,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 				  u32 pmtu, bool lock, unsigned long expires)
 {
+	struct fib_nh_common *nhc = &nh->nh_common;
 	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe;
 	struct rtable *rt;
@@ -715,13 +716,13 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 		 * stale, so anyone caching it rechecks if this exception
 		 * applies to them.
 		 */
-		rt = rcu_dereference(nh->nh_rth_input);
+		rt = rcu_dereference(nhc->nhc_rth_input);
 		if (rt)
 			rt->dst.obsolete = DST_OBSOLETE_KILL;
 
 		for_each_possible_cpu(i) {
 			struct rtable __rcu **prt;
-			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
+			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 			rt = rcu_dereference(*prt);
 			if (rt)
 				rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -1471,13 +1472,14 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 
 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 {
+	struct fib_nh_common *nhc = &nh->nh_common;
 	struct rtable *orig, *prev, **p;
 	bool ret = true;
 
 	if (rt_is_input_route(rt)) {
-		p = (struct rtable **)&nh->nh_rth_input;
+		p = (struct rtable **)&nhc->nhc_rth_input;
 	} else {
-		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
+		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
 	}
 	orig = *p;
 
@@ -1810,7 +1812,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		if (fnhe)
 			rth = rcu_dereference(fnhe->fnhe_rth_input);
 		else
-			rth = rcu_dereference(nh->nh_rth_input);
+			rth = rcu_dereference(nhc->nhc_rth_input);
 		if (rt_cache_valid(rth)) {
 			skb_dst_set_noref(skb, &rth->dst);
 			goto out;
@@ -2105,10 +2107,8 @@ local_input:
 	if (res->fi) {
 		if (!itag) {
 			struct fib_nh_common *nhc = FIB_RES_NHC(*res);
-			struct fib_nh *nh;
 
-			nh = container_of(nhc, struct fib_nh, nh_common);
-			rth = rcu_dereference(nh->nh_rth_input);
+			rth = rcu_dereference(nhc->nhc_rth_input);
 			if (rt_cache_valid(rth)) {
 				skb_dst_set_noref(skb, &rth->dst);
 				err = 0;
@@ -2337,7 +2337,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 				do_cache = false;
 				goto add;
 			}
-			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
+			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
 		}
 		rth = rcu_dereference(*prth);
 		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
-- 
cgit 


From 87063a1fa66740302f08add95ad3d4d316376bef Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 30 Apr 2019 07:45:49 -0700
Subject: ipv4: Pass fib_nh_common to rt_cache_route

Now that the cached routes are in fib_nh_common, pass it to
rt_cache_route and simplify its callers. For rt_set_nexthop,
the tclassid becomes the last user of fib_nh so move the
container_of under the #ifdef CONFIG_IP_ROUTE_CLASSID.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 662ac9bd956e..9b50d0440940 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1470,9 +1470,8 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 	return ret;
 }
 
-static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
 {
-	struct fib_nh_common *nhc = &nh->nh_common;
 	struct rtable *orig, *prev, **p;
 	bool ret = true;
 
@@ -1576,7 +1575,6 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 
 	if (fi) {
 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
-		struct fib_nh *nh;
 
 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
 			rt->rt_gw_family = nhc->nhc_gw_family;
@@ -1589,15 +1587,19 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 
 		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
 
-		nh = container_of(nhc, struct fib_nh, nh_common);
 #ifdef CONFIG_IP_ROUTE_CLASSID
-		rt->dst.tclassid = nh->nh_tclassid;
+		{
+			struct fib_nh *nh;
+
+			nh = container_of(nhc, struct fib_nh, nh_common);
+			rt->dst.tclassid = nh->nh_tclassid;
+		}
 #endif
-		rt->dst.lwtstate = lwtstate_get(nh->fib_nh_lws);
+		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
 		if (unlikely(fnhe))
 			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
 		else if (do_cache)
-			cached = rt_cache_route(nh, rt);
+			cached = rt_cache_route(nhc, rt);
 		if (unlikely(!cached)) {
 			/* Routes we intend to cache in nexthop exception or
 			 * FIB nexthop have the DST_NOCACHE bit clear.
@@ -2139,7 +2141,6 @@ local_input:
 
 	if (do_cache) {
 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
-		struct fib_nh *nh;
 
 		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
 		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
@@ -2148,8 +2149,7 @@ local_input:
 			rth->dst.input = lwtunnel_input;
 		}
 
-		nh = container_of(nhc, struct fib_nh, nh_common);
-		if (unlikely(!rt_cache_route(nh, rth)))
+		if (unlikely(!rt_cache_route(nhc, rth)))
 			rt_add_uncached_list(rth);
 	}
 	skb_dst_set(skb, &rth->dst);
-- 
cgit 


From a5995e7107eb3d9c44744d3cf47d49fabfef01f5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 30 Apr 2019 07:45:50 -0700
Subject: ipv4: Move exception bucket to nh_common

Similar to the cached routes, make IPv4 exceptions accessible when
using an IPv6 nexthop struct with IPv4 routes. Simplify the exception
functions by passing in fib_nh_common since that is all it needs,
and then cleanup the call sites that have extraneous fib_nh conversions.

As with the cached routes this is a change in location only, from fib_nh
up to fib_nh_common; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 12 ++++++------
 net/ipv4/route.c         | 41 +++++++++++++++++------------------------
 2 files changed, 23 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4402ec6dc426..d3da6a10f86f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -159,12 +159,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp)
 	dst_release_immediate(&rt->dst);
 }
 
-static void free_nh_exceptions(struct fib_nh *nh)
+static void free_nh_exceptions(struct fib_nh_common *nhc)
 {
 	struct fnhe_hash_bucket *hash;
 	int i;
 
-	hash = rcu_dereference_protected(nh->nh_exceptions, 1);
+	hash = rcu_dereference_protected(nhc->nhc_exceptions, 1);
 	if (!hash)
 		return;
 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
@@ -214,6 +214,7 @@ void fib_nh_common_release(struct fib_nh_common *nhc)
 	lwtstate_put(nhc->nhc_lwtstate);
 	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
 	rt_fibinfo_free(&nhc->nhc_rth_input);
+	free_nh_exceptions(nhc);
 }
 EXPORT_SYMBOL_GPL(fib_nh_common_release);
 
@@ -224,7 +225,6 @@ void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
 		net->ipv4.fib_num_tclassid_users--;
 #endif
 	fib_nh_common_release(&fib_nh->nh_common);
-	free_nh_exceptions(fib_nh);
 }
 
 /* Release a nexthop info record */
@@ -1713,12 +1713,12 @@ static int call_fib_nh_notifiers(struct fib_nh *nh,
  * - if the new MTU is greater than the PMTU, don't make any change
  * - otherwise, unlock and set PMTU
  */
-static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig)
+static void nh_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
 {
 	struct fnhe_hash_bucket *bucket;
 	int i;
 
-	bucket = rcu_dereference_protected(nh->nh_exceptions, 1);
+	bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1);
 	if (!bucket)
 		return;
 
@@ -1749,7 +1749,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
 
 	hlist_for_each_entry(nh, head, nh_hash) {
 		if (nh->fib_nh_dev == dev)
-			nh_update_mtu(nh, dev->mtu, orig_mtu);
+			nh_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
 	}
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9b50d0440940..11ddc276776e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -643,10 +643,10 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
 	}
 }
 
-static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
-				  u32 pmtu, bool lock, unsigned long expires)
+static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
+				  __be32 gw, u32 pmtu, bool lock,
+				  unsigned long expires)
 {
-	struct fib_nh_common *nhc = &nh->nh_common;
 	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe;
 	struct rtable *rt;
@@ -654,17 +654,17 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 	unsigned int i;
 	int depth;
 
-	genid = fnhe_genid(dev_net(nh->fib_nh_dev));
+	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 	hval = fnhe_hashfun(daddr);
 
 	spin_lock_bh(&fnhe_lock);
 
-	hash = rcu_dereference(nh->nh_exceptions);
+	hash = rcu_dereference(nhc->nhc_exceptions);
 	if (!hash) {
 		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 		if (!hash)
 			goto out_unlock;
-		rcu_assign_pointer(nh->nh_exceptions, hash);
+		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 	}
 
 	hash += hval;
@@ -789,10 +789,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 		} else {
 			if (fib_lookup(net, fl4, &res, 0) == 0) {
 				struct fib_nh_common *nhc = FIB_RES_NHC(res);
-				struct fib_nh *nh;
 
-				nh = container_of(nhc, struct fib_nh, nh_common);
-				update_or_create_fnhe(nh, fl4->daddr, new_gw,
+				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 						0, false,
 						jiffies + ip_rt_gc_timeout);
 			}
@@ -1040,10 +1038,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	rcu_read_lock();
 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 		struct fib_nh_common *nhc = FIB_RES_NHC(res);
-		struct fib_nh *nh;
 
-		nh = container_of(nhc, struct fib_nh, nh_common);
-		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
+		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
 				      jiffies + ip_rt_mtu_expires);
 	}
 	rcu_read_unlock();
@@ -1329,7 +1325,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 }
 
-static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
 {
 	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
@@ -1337,7 +1333,7 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
 
 	spin_lock_bh(&fnhe_lock);
 
-	hash = rcu_dereference_protected(nh->nh_exceptions,
+	hash = rcu_dereference_protected(nhc->nhc_exceptions,
 					 lockdep_is_held(&fnhe_lock));
 	hash += hval;
 
@@ -1363,9 +1359,10 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
 	spin_unlock_bh(&fnhe_lock);
 }
 
-static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
+static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
+					       __be32 daddr)
 {
-	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
+	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
 	struct fib_nh_exception *fnhe;
 	u32 hval;
 
@@ -1379,7 +1376,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 		if (fnhe->fnhe_daddr == daddr) {
 			if (fnhe->fnhe_expires &&
 			    time_after(jiffies, fnhe->fnhe_expires)) {
-				ip_del_fnhe(nh, daddr);
+				ip_del_fnhe(nhc, daddr);
 				break;
 			}
 			return fnhe;
@@ -1406,10 +1403,9 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
 		mtu = fi->fib_mtu;
 
 	if (likely(!mtu)) {
-		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
 		struct fib_nh_exception *fnhe;
 
-		fnhe = find_exception(nh, daddr);
+		fnhe = find_exception(nhc, daddr);
 		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
 			mtu = fnhe->fnhe_pmtu;
 	}
@@ -1760,7 +1756,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	struct net_device *dev = nhc->nhc_dev;
 	struct fib_nh_exception *fnhe;
 	struct rtable *rth;
-	struct fib_nh *nh;
 	int err;
 	struct in_device *out_dev;
 	bool do_cache;
@@ -1808,8 +1803,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
-	nh = container_of(nhc, struct fib_nh, nh_common);
-	fnhe = find_exception(nh, daddr);
+	fnhe = find_exception(nhc, daddr);
 	if (do_cache) {
 		if (fnhe)
 			rth = rcu_dereference(fnhe->fnhe_rth_input);
@@ -2321,10 +2315,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	do_cache &= fi != NULL;
 	if (fi) {
 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
-		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
 		struct rtable __rcu **prth;
 
-		fnhe = find_exception(nh, fl4->daddr);
+		fnhe = find_exception(nhc, fl4->daddr);
 		if (!do_cache)
 			goto add;
 		if (fnhe) {
-- 
cgit 


From 0e219ae48c3bbf382ef96adf3825457315728c03 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 3 May 2019 17:01:37 +0200
Subject: net: use indirect calls helpers for L3 handler hooks

So that we avoid another indirect call per RX packet in the common
case.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_input.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 1132d6d1796a..8d78de4b0304 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -130,6 +130,7 @@
 #include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/indirect_call_wrapper.h>
 
 #include <net/snmp.h>
 #include <net/ip.h>
@@ -188,6 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 	return false;
 }
 
+INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *));
 void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
 {
 	const struct net_protocol *ipprot;
@@ -205,7 +208,8 @@ resubmit:
 			}
 			nf_reset(skb);
 		}
-		ret = ipprot->handler(skb);
+		ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
+				      skb);
 		if (ret < 0) {
 			protocol = -ret;
 			goto resubmit;
-- 
cgit 


From 97ff7ffb11fe7a859a490771e7ce23f1f335176b Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 3 May 2019 17:01:38 +0200
Subject: net: use indirect calls helpers at early demux stage

So that we avoid another indirect call per RX packet, if
early demux is enabled.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_input.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8d78de4b0304..ed97724c5e33 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -309,6 +309,8 @@ drop:
 	return true;
 }
 
+INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *));
 static int ip_rcv_finish_core(struct net *net, struct sock *sk,
 			      struct sk_buff *skb, struct net_device *dev)
 {
@@ -326,7 +328,8 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
 
 		ipprot = rcu_dereference(inet_protos[protocol]);
 		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
-			err = edemux(skb);
+			err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux,
+					      udp_v4_early_demux, skb);
 			if (unlikely(err))
 				goto drop_error;
 			/* must reload iph, skb->head might have changed */
-- 
cgit