From c492d4c74dd3f87559883ffa0f94a8f1ae3fe5f5 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 20 Jun 2019 18:39:28 +0800
Subject: tipc: change to use register_pernet_device

This patch is to fix a dst defcnt leak, which can be reproduced by doing:

  # ip net a c; ip net a s; modprobe tipc
  # ip net e s ip l a n eth1 type veth peer n eth1 netns c
  # ip net e c ip l s lo up; ip net e c ip l s eth1 up
  # ip net e s ip l s lo up; ip net e s ip l s eth1 up
  # ip net e c ip a a 1.1.1.2/8 dev eth1
  # ip net e s ip a a 1.1.1.1/8 dev eth1
  # ip net e c tipc b e m udp n u1 localip 1.1.1.2
  # ip net e s tipc b e m udp n u1 localip 1.1.1.1
  # ip net d c; ip net d s; rmmod tipc

and it will get stuck and keep logging the error:

  unregister_netdevice: waiting for lo to become free. Usage count = 1

The cause is that a dst is held by the udp sock's sk_rx_dst set on udp rx
path with udp_early_demux == 1, and this dst (eventually holding lo dev)
can't be released as bearer's removal in tipc pernet .exit happens after
lo dev's removal, default_device pernet .exit.

 "There are two distinct types of pernet_operations recognized: subsys and
  device.  At creation all subsys init functions are called before device
  init functions, and at destruction all device exit functions are called
  before subsys exit function."

So by calling register_pernet_device instead to register tipc_net_ops, the
pernet .exit() will be invoked earlier than loopback dev's removal when a
netns is being destroyed, as fou/gue does.

Note that vxlan and geneve udp tunnels don't have this issue, as the udp
sock is released in their device ndo_stop().

This fix is also necessary for tipc dst_cache, which will hold dsts on tx
path and I will introduce in my next patch.

Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/tipc/core.c b/net/tipc/core.c
index ed536c05252a..c8370722f0bb 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -134,7 +134,7 @@ static int __init tipc_init(void)
 	if (err)
 		goto out_sysctl;
 
-	err = register_pernet_subsys(&tipc_net_ops);
+	err = register_pernet_device(&tipc_net_ops);
 	if (err)
 		goto out_pernet;
 
@@ -142,7 +142,7 @@ static int __init tipc_init(void)
 	if (err)
 		goto out_socket;
 
-	err = register_pernet_subsys(&tipc_topsrv_net_ops);
+	err = register_pernet_device(&tipc_topsrv_net_ops);
 	if (err)
 		goto out_pernet_topsrv;
 
@@ -153,11 +153,11 @@ static int __init tipc_init(void)
 	pr_info("Started in single node mode\n");
 	return 0;
 out_bearer:
-	unregister_pernet_subsys(&tipc_topsrv_net_ops);
+	unregister_pernet_device(&tipc_topsrv_net_ops);
 out_pernet_topsrv:
 	tipc_socket_stop();
 out_socket:
-	unregister_pernet_subsys(&tipc_net_ops);
+	unregister_pernet_device(&tipc_net_ops);
 out_pernet:
 	tipc_unregister_sysctl();
 out_sysctl:
@@ -172,9 +172,9 @@ out_netlink:
 static void __exit tipc_exit(void)
 {
 	tipc_bearer_cleanup();
-	unregister_pernet_subsys(&tipc_topsrv_net_ops);
+	unregister_pernet_device(&tipc_topsrv_net_ops);
 	tipc_socket_stop();
-	unregister_pernet_subsys(&tipc_net_ops);
+	unregister_pernet_device(&tipc_net_ops);
 	tipc_netlink_stop();
 	tipc_netlink_compat_stop();
 	tipc_unregister_sysctl();
-- 
cgit 


From 45d5cb137c3638b3a310f41b31d8e79daf647f14 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Fri, 21 Jun 2019 21:44:37 +0800
Subject: net/sched: cbs: Fix error path of cbs_module_init

If register_qdisc fails, we should unregister
netdevice notifier.

Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: e0a7683d30e9 ("net/sched: cbs: fix port_rate miscalculation")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_cbs.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index e16a3d37d2bc..732e109c3055 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -549,12 +549,17 @@ static struct notifier_block cbs_device_notifier = {
 
 static int __init cbs_module_init(void)
 {
-	int err = register_netdevice_notifier(&cbs_device_notifier);
+	int err;
 
+	err = register_netdevice_notifier(&cbs_device_notifier);
 	if (err)
 		return err;
 
-	return register_qdisc(&cbs_qdisc_ops);
+	err = register_qdisc(&cbs_qdisc_ops);
+	if (err)
+		unregister_netdevice_notifier(&cbs_device_notifier);
+
+	return err;
 }
 
 static void __exit cbs_module_exit(void)
-- 
cgit 


From 9354544cbccf68da1b047f8fb7b47630e3c8a59d Mon Sep 17 00:00:00 2001
From: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Date: Sun, 23 Jun 2019 21:26:58 -0700
Subject: net/tls: fix page double free on TX cleanup

With commit 94850257cf0f ("tls: Fix tls_device handling of partial records")
a new path was introduced to cleanup partial records during sk_proto_close.
This path does not handle the SW KTLS tx_list cleanup.

This is unnecessary though since the free_resources calls for both
SW and offload paths will cleanup a partial record.

The visible effect is the following warning, but this bug also causes
a page double free.

    WARNING: CPU: 7 PID: 4000 at net/core/stream.c:206 sk_stream_kill_queues+0x103/0x110
    RIP: 0010:sk_stream_kill_queues+0x103/0x110
    RSP: 0018:ffffb6df87e07bd0 EFLAGS: 00010206
    RAX: 0000000000000000 RBX: ffff8c21db4971c0 RCX: 0000000000000007
    RDX: ffffffffffffffa0 RSI: 000000000000001d RDI: ffff8c21db497270
    RBP: ffff8c21db497270 R08: ffff8c29f4748600 R09: 000000010020001a
    R10: ffffb6df87e07aa0 R11: ffffffff9a445600 R12: 0000000000000007
    R13: 0000000000000000 R14: ffff8c21f03f2900 R15: ffff8c21f03b8df0
    Call Trace:
     inet_csk_destroy_sock+0x55/0x100
     tcp_close+0x25d/0x400
     ? tcp_check_oom+0x120/0x120
     tls_sk_proto_close+0x127/0x1c0
     inet_release+0x3c/0x60
     __sock_release+0x3d/0xb0
     sock_close+0x11/0x20
     __fput+0xd8/0x210
     task_work_run+0x84/0xa0
     do_exit+0x2dc/0xb90
     ? release_sock+0x43/0x90
     do_group_exit+0x3a/0xa0
     get_signal+0x295/0x720
     do_signal+0x36/0x610
     ? SYSC_recvfrom+0x11d/0x130
     exit_to_usermode_loop+0x69/0xb0
     do_syscall_64+0x173/0x180
     entry_SYSCALL_64_after_hwframe+0x3d/0xa2
    RIP: 0033:0x7fe9b9abc10d
    RSP: 002b:00007fe9b19a1d48 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
    RAX: fffffffffffffe00 RBX: 0000000000000006 RCX: 00007fe9b9abc10d
    RDX: 0000000000000002 RSI: 0000000000000080 RDI: 00007fe948003430
    RBP: 00007fe948003410 R08: 00007fe948003430 R09: 0000000000000000
    R10: 0000000000000000 R11: 0000000000000246 R12: 00005603739d9080
    R13: 00007fe9b9ab9f90 R14: 00007fe948003430 R15: 0000000000000000

Fixes: 94850257cf0f ("tls: Fix tls_device handling of partial records")
Signed-off-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index fc81ae18cc44..e2b69e805d46 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -279,7 +279,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 		goto skip_tx_cleanup;
 	}
 
-	if (!tls_complete_pending_work(sk, ctx, 0, &timeo))
+	if (unlikely(sk->sk_write_pending) &&
+	    !wait_on_pending_writer(sk, &timeo))
 		tls_handle_open_record(sk, 0);
 
 	/* We need these for tls_sw_fallback handling of other packets */
-- 
cgit 


From 55655e3d1197fff16a7a05088fb0e5eba50eac55 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Jun 2019 02:38:20 -0700
Subject: net/packet: fix memory leak in packet_set_ring()

syzbot found we can leak memory in packet_set_ring(), if user application
provides buggy parameters.

Fixes: 7f953ab2ba46 ("af_packet: TX_RING support for TPACKET_V3")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a29d66da7394..0b4cf94f0233 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4314,7 +4314,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 				    req3->tp_sizeof_priv ||
 				    req3->tp_feature_req_word) {
 					err = -EINVAL;
-					goto out;
+					goto out_free_pg_vec;
 				}
 			}
 			break;
@@ -4378,6 +4378,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			prb_shutdown_retire_blk_timer(po, rb_queue);
 	}
 
+out_free_pg_vec:
 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->tp_block_nr);
 out:
-- 
cgit 


From 4f07b80c973348a99b5d2a32476a2e7877e94a05 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 25 Jun 2019 00:28:19 +0800
Subject: tipc: check msg->req data len in tipc_nl_compat_bearer_disable

This patch is to fix an uninit-value issue, reported by syzbot:

  BUG: KMSAN: uninit-value in memchr+0xce/0x110 lib/string.c:981
  Call Trace:
    __dump_stack lib/dump_stack.c:77 [inline]
    dump_stack+0x191/0x1f0 lib/dump_stack.c:113
    kmsan_report+0x130/0x2a0 mm/kmsan/kmsan.c:622
    __msan_warning+0x75/0xe0 mm/kmsan/kmsan_instr.c:310
    memchr+0xce/0x110 lib/string.c:981
    string_is_valid net/tipc/netlink_compat.c:176 [inline]
    tipc_nl_compat_bearer_disable+0x2a1/0x480 net/tipc/netlink_compat.c:449
    __tipc_nl_compat_doit net/tipc/netlink_compat.c:327 [inline]
    tipc_nl_compat_doit+0x3ac/0xb00 net/tipc/netlink_compat.c:360
    tipc_nl_compat_handle net/tipc/netlink_compat.c:1178 [inline]
    tipc_nl_compat_recv+0x1b1b/0x27b0 net/tipc/netlink_compat.c:1281

TLV_GET_DATA_LEN() may return a negtive int value, which will be
used as size_t (becoming a big unsigned long) passed into memchr,
cause this issue.

Similar to what it does in tipc_nl_compat_bearer_enable(), this
fix is to return -EINVAL when TLV_GET_DATA_LEN() is negtive in
tipc_nl_compat_bearer_disable(), as well as in
tipc_nl_compat_link_stat_dump() and tipc_nl_compat_link_reset_stats().

v1->v2:
  - add the missing Fixes tags per Eric's request.

Fixes: 0762216c0ad2 ("tipc: fix uninit-value in tipc_nl_compat_bearer_enable")
Fixes: 8b66fee7f8ee ("tipc: fix uninit-value in tipc_nl_compat_link_reset_stats")
Reported-by: syzbot+30eaa8bf392f7fafffaf@syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/netlink_compat.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index c6a04c09d075..cf155061c472 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -445,7 +445,11 @@ static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd,
 	if (!bearer)
 		return -EMSGSIZE;
 
-	len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_BEARER_NAME);
+	len = TLV_GET_DATA_LEN(msg->req);
+	if (len <= 0)
+		return -EINVAL;
+
+	len = min_t(int, len, TIPC_MAX_BEARER_NAME);
 	if (!string_is_valid(name, len))
 		return -EINVAL;
 
@@ -539,7 +543,11 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
 
 	name = (char *)TLV_DATA(msg->req);
 
-	len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_LINK_NAME);
+	len = TLV_GET_DATA_LEN(msg->req);
+	if (len <= 0)
+		return -EINVAL;
+
+	len = min_t(int, len, TIPC_MAX_BEARER_NAME);
 	if (!string_is_valid(name, len))
 		return -EINVAL;
 
@@ -817,7 +825,11 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd,
 	if (!link)
 		return -EMSGSIZE;
 
-	len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_LINK_NAME);
+	len = TLV_GET_DATA_LEN(msg->req);
+	if (len <= 0)
+		return -EINVAL;
+
+	len = min_t(int, len, TIPC_MAX_BEARER_NAME);
 	if (!string_is_valid(name, len))
 		return -EINVAL;
 
-- 
cgit 


From 38c73529de13e1e10914de7030b659a2f8b01c3b Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Mon, 24 Jun 2019 20:14:06 -0400
Subject: ipv4: Use return value of inet_iif() for __raw_v4_lookup in the while
 loop

In commit 19e4e768064a8 ("ipv4: Fix raw socket lookup for local
traffic"), the dif argument to __raw_v4_lookup() is coming from the
returned value of inet_iif() but the change was done only for the first
lookup. Subsequent lookups in the while loop still use skb->dev->ifIndex.

Fixes: 19e4e768064a8 ("ipv4: Fix raw socket lookup for local traffic")
Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 0b8e06ca75d6..40a6abbc9cf6 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -197,7 +197,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
 		}
 		sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
 				     iph->saddr, iph->daddr,
-				     skb->dev->ifindex, sdif);
+				     dif, sdif);
 	}
 out:
 	read_unlock(&raw_v4_hashinfo.lock);
-- 
cgit 


From b8e8a86337c25941cb06e9a1c8ee01ab9aab0cc2 Mon Sep 17 00:00:00 2001
From: Eiichi Tsukata <devel@etsukata.com>
Date: Tue, 25 Jun 2019 12:08:01 +0900
Subject: net/ipv6: Fix misuse of proc_dointvec "skip_notify_on_dev_down"

/proc/sys/net/ipv6/route/skip_notify_on_dev_down assumes given value to be
0 or 1. Use proc_dointvec_minmax instead of proc_dointvec.

Fixes: 7c6bb7d2faaf ("net/ipv6: Add knob to skip DELROUTE message ondevice down")
Signed-off-by: Eiichi Tsukata <devel@etsukata.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 11ad62effd56..aade636c6be6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5281,7 +5281,7 @@ static struct ctl_table ipv6_route_table_template[] = {
 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec,
+		.proc_handler	=	proc_dointvec_minmax,
 		.extra1		=	&zero,
 		.extra2		=	&one,
 	},
-- 
cgit 


From 4480879251fb89b1c5585112b1ccc8c3333b41af Mon Sep 17 00:00:00 2001
From: Huaping Zhou <zhp@smail.nju.edu.cn>
Date: Wed, 26 Jun 2019 17:47:49 +0200
Subject: net/smc: hold conns_lock before calling smc_lgr_register_conn()

After smc_lgr_create(), the newly created link group is added
to smc_lgr_list, thus is accessible from other context.
Although link group creation is serialized by
smc_create_lgr_pending, the new link group may still be accessed
concurrently. For example, if ib_device is no longer active,
smc_ib_port_event_work() will call smc_port_terminate(), which
in turn will call __smc_lgr_terminate() on every link group of
this device. So conns_lock is required here.

Signed-off-by: Huaping Zhou <zhp@smail.nju.edu.cn>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2d2850adc2a3..4ca50ddf8d16 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -652,7 +652,10 @@ create:
 		rc = smc_lgr_create(smc, ini);
 		if (rc)
 			goto out;
+		lgr = conn->lgr;
+		write_lock_bh(&lgr->conns_lock);
 		smc_lgr_register_conn(conn); /* add smc conn to lgr */
+		write_unlock_bh(&lgr->conns_lock);
 	}
 	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
 	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
-- 
cgit 


From 8c33bf1b0a9663d1742cb19ee71da46a1d8670dd Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 26 Jun 2019 17:47:50 +0200
Subject: net/smc: Fix error path in smc_init

If register_pernet_subsys success in smc_init,
we should cleanup it in case any other error.

Fixes: 64e28b52c7a6 (net/smc: add pnet table namespace support")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 0c874e996f85..7621ec2f539c 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -2029,7 +2029,7 @@ static int __init smc_init(void)
 
 	rc = smc_pnet_init();
 	if (rc)
-		return rc;
+		goto out_pernet_subsys;
 
 	rc = smc_llc_init();
 	if (rc) {
@@ -2080,6 +2080,9 @@ out_proto:
 	proto_unregister(&smc_proto);
 out_pnet:
 	smc_pnet_exit();
+out_pernet_subsys:
+	unregister_pernet_subsys(&smc_net_ops);
+
 	return rc;
 }
 
-- 
cgit 


From 5b18f1289808fee5d04a7e6ecf200189f41a4db6 Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Wed, 26 Jun 2019 02:21:16 -0400
Subject: ipv4: reset rt_iif for recirculated mcast/bcast out pkts

Multicast or broadcast egress packets have rt_iif set to the oif. These
packets might be recirculated back as input and lookup to the raw
sockets may fail because they are bound to the incoming interface
(skb_iif). If rt_iif is not zero, during the lookup, inet_iif() function
returns rt_iif instead of skb_iif. Hence, the lookup fails.

v2: Make it non vrf specific (David Ahern). Reword the changelog to
    reflect it.
Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 12 ++++++++++++
 net/ipv4/route.c     | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 16f9159234a2..8c2ec35b6512 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -318,6 +318,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
 static int ip_mc_finish_output(struct net *net, struct sock *sk,
 			       struct sk_buff *skb)
 {
+	struct rtable *new_rt;
 	int ret;
 
 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
@@ -326,6 +327,17 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk,
 		return ret;
 	}
 
+	/* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
+	 * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
+	 * see ipv4_pktinfo_prepare().
+	 */
+	new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
+	if (new_rt) {
+		new_rt->rt_iif = 0;
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &new_rt->dst);
+	}
+
 	return dev_loopback_xmit(net, sk, skb);
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6cb7cff22db9..8ea0735a6754 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1647,6 +1647,39 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 }
 EXPORT_SYMBOL(rt_dst_alloc);
 
+struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
+{
+	struct rtable *new_rt;
+
+	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+			   rt->dst.flags);
+
+	if (new_rt) {
+		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+		new_rt->rt_flags = rt->rt_flags;
+		new_rt->rt_type = rt->rt_type;
+		new_rt->rt_is_input = rt->rt_is_input;
+		new_rt->rt_iif = rt->rt_iif;
+		new_rt->rt_pmtu = rt->rt_pmtu;
+		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
+		new_rt->rt_gw_family = rt->rt_gw_family;
+		if (rt->rt_gw_family == AF_INET)
+			new_rt->rt_gw4 = rt->rt_gw4;
+		else if (rt->rt_gw_family == AF_INET6)
+			new_rt->rt_gw6 = rt->rt_gw6;
+		INIT_LIST_HEAD(&new_rt->rt_uncached);
+
+		new_rt->dst.flags |= DST_HOST;
+		new_rt->dst.input = rt->dst.input;
+		new_rt->dst.output = rt->dst.output;
+		new_rt->dst.error = rt->dst.error;
+		new_rt->dst.lastuse = jiffies;
+		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
+	}
+	return new_rt;
+}
+EXPORT_SYMBOL(rt_dst_clone);
+
 /* called in rcu_read_lock() section */
 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			  u8 tos, struct net_device *dev,
-- 
cgit 


From 9b1c1ef13b35fa35051b635ca9fbda39fe6bbc70 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 24 Jun 2019 16:01:08 +0200
Subject: ipv6: constify rt6_nexthop()

There is no functional change in this patch, it only prepares the next one.

rt6_nexthop() will be used by ip6_dst_lookup_neigh(), which uses const
variables.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reported-by: kbuild test robot <lkp@intel.com>
Acked-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bluetooth/6lowpan.c          | 4 ++--
 net/ipv6/ip6_output.c            | 2 +-
 net/netfilter/nf_flow_table_ip.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 19d27bee285e..1555b0c6f7ec 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -160,10 +160,10 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
 						  struct in6_addr *daddr,
 						  struct sk_buff *skb)
 {
-	struct lowpan_peer *peer;
-	struct in6_addr *nexthop;
 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 	int count = atomic_read(&dev->peer_count);
+	const struct in6_addr *nexthop;
+	struct lowpan_peer *peer;
 
 	BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt);
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 834475717110..21efcd02f337 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -59,8 +59,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *dev = dst->dev;
+	const struct in6_addr *nexthop;
 	struct neighbour *neigh;
-	struct in6_addr *nexthop;
 	int ret;
 
 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 241317473114..cdfc33517e85 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -439,9 +439,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	struct nf_flowtable *flow_table = priv;
 	struct flow_offload_tuple tuple = {};
 	enum flow_offload_tuple_dir dir;
+	const struct in6_addr *nexthop;
 	struct flow_offload *flow;
 	struct net_device *outdev;
-	struct in6_addr *nexthop;
 	struct ipv6hdr *ip6h;
 	struct rt6_info *rt;
 
-- 
cgit 


From 2c6b55f45d53420d8310d41310e0e2cd41fe073f Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 24 Jun 2019 16:01:09 +0200
Subject: ipv6: fix neighbour resolution with raw socket

The scenario is the following: the user uses a raw socket to send an ipv6
packet, destinated to a not-connected network, and specify a connected nh.
Here is the corresponding python script to reproduce this scenario:

 import socket
 IPPROTO_RAW = 255
 send_s = socket.socket(socket.AF_INET6, socket.SOCK_RAW, IPPROTO_RAW)
 # scapy
 # p = IPv6(src='fd00:100::1', dst='fd00:200::fa')/ICMPv6EchoRequest()
 # str(p)
 req = b'`\x00\x00\x00\x00\x08:@\xfd\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xfd\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfa\x80\x00\x81\xc0\x00\x00\x00\x00'
 send_s.sendto(req, ('fd00:175::2', 0, 0, 0))

fd00:175::/64 is a connected route and fd00:200::fa is not a connected
host.

With this scenario, the kernel starts by sending a NS to resolve
fd00:175::2. When it receives the NA, it flushes its queue and try to send
the initial packet. But instead of sending it, it sends another NS to
resolve fd00:200::fa, which obvioulsy fails, thus the packet is dropped. If
the user sends again the packet, it now uses the right nh (fd00:175::2).

The problem is that ip6_dst_lookup_neigh() uses the rt6i_gateway, which is
:: because the associated route is a connected route, thus it uses the dst
addr of the packet. Let's use rt6_nexthop() to choose the right nh.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index aade636c6be6..97a843cf164c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -218,7 +218,8 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
 {
 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
 
-	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
+	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
+				dst->dev, skb, daddr);
 }
 
 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
-- 
cgit 


From 25bff6d5478b2a02368097015b7d8eb727c87e16 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 25 Jun 2019 00:21:45 +0800
Subject: sctp: change to hold sk after auth shkey is created successfully

Now in sctp_endpoint_init(), it holds the sk then creates auth
shkey. But when the creation fails, it doesn't release the sk,
which causes a sk defcnf leak,

Here to fix it by only holding the sk when auth shkey is created
successfully.

Fixes: a29a5bd4f5c3 ("[SCTP]: Implement SCTP-AUTH initializations.")
Reported-by: syzbot+afabda3890cc2f765041@syzkaller.appspotmail.com
Reported-by: syzbot+276ca1c77a19977c0130@syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/endpointola.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index e358437ba29b..69cebb2c998b 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -118,10 +118,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 	/* Initialize the bind addr area */
 	sctp_bind_addr_init(&ep->base.bind_addr, 0);
 
-	/* Remember who we are attached to.  */
-	ep->base.sk = sk;
-	sock_hold(ep->base.sk);
-
 	/* Create the lists of associations.  */
 	INIT_LIST_HEAD(&ep->asocs);
 
@@ -154,6 +150,10 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 	ep->prsctp_enable = net->sctp.prsctp_enable;
 	ep->reconf_enable = net->sctp.reconf_enable;
 
+	/* Remember who we are attached to.  */
+	ep->base.sk = sk;
+	sock_hold(ep->base.sk);
+
 	return ep;
 
 nomem_shkey:
-- 
cgit 


From 89ed5b519004a7706f50b70f611edbd3aaacff2c Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Tue, 25 Jun 2019 17:57:49 -0400
Subject: af_packet: Block execution of tasks waiting for transmit to complete
 in AF_PACKET

When an application is run that:
a) Sets its scheduler to be SCHED_FIFO
and
b) Opens a memory mapped AF_PACKET socket, and sends frames with the
MSG_DONTWAIT flag cleared, its possible for the application to hang
forever in the kernel.  This occurs because when waiting, the code in
tpacket_snd calls schedule, which under normal circumstances allows
other tasks to run, including ksoftirqd, which in some cases is
responsible for freeing the transmitted skb (which in AF_PACKET calls a
destructor that flips the status bit of the transmitted frame back to
available, allowing the transmitting task to complete).

However, when the calling application is SCHED_FIFO, its priority is
such that the schedule call immediately places the task back on the cpu,
preventing ksoftirqd from freeing the skb, which in turn prevents the
transmitting task from detecting that the transmission is complete.

We can fix this by converting the schedule call to a completion
mechanism.  By using a completion queue, we force the calling task, when
it detects there are no more frames to send, to schedule itself off the
cpu until such time as the last transmitted skb is freed, allowing
forward progress to be made.

Tested by myself and the reporter, with good results

Change Notes:

V1->V2:
	Enhance the sleep logic to support being interruptible and
allowing for honoring to SK_SNDTIMEO (Willem de Bruijn)

V2->V3:
	Rearrage the point at which we wait for the completion queue, to
avoid needing to check for ph/skb being null at the end of the loop.
Also move the complete call to the skb destructor to avoid needing to
modify __packet_set_status.  Also gate calling complete on
packet_read_pending returning zero to avoid multiple calls to complete.
(Willem de Bruijn)

	Move timeo computation within loop, to re-fetch the socket
timeout since we also use the timeo variable to record the return code
from the wait_for_complete call (Neil Horman)

V3->V4:
	Willem has requested that the control flow be restored to the
previous state.  Doing so lets us eliminate the need for the
po->wait_on_complete flag variable, and lets us get rid of the
packet_next_frame function, but introduces another complexity.
Specifically, but using the packet pending count, we can, if an
applications calls sendmsg multiple times with MSG_DONTWAIT set, each
set of transmitted frames, when complete, will cause
tpacket_destruct_skb to issue a complete call, for which there will
never be a wait_on_completion call.  This imbalance will lead to any
future call to wait_for_completion here to return early, when the frames
they sent may not have completed.  To correct this, we need to re-init
the completion queue on every call to tpacket_snd before we enter the
loop so as to ensure we wait properly for the frames we send in this
iteration.

	Change the timeout and interrupted gotos to out_put rather than
out_status so that we don't try to free a non-existant skb
	Clean up some extra newlines (Willem de Bruijn)

Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 20 +++++++++++++++++---
 net/packet/internal.h  |  1 +
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0b4cf94f0233..5f78df080573 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2401,6 +2401,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 
 		ts = __packet_set_timestamp(po, ph, skb);
 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
+
+		if (!packet_read_pending(&po->tx_ring))
+			complete(&po->skb_completion);
 	}
 
 	sock_wfree(skb);
@@ -2585,7 +2588,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct net_device *dev;
 	struct virtio_net_hdr *vnet_hdr = NULL;
 	struct sockcm_cookie sockc;
@@ -2600,6 +2603,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	int len_sum = 0;
 	int status = TP_STATUS_AVAILABLE;
 	int hlen, tlen, copylen = 0;
+	long timeo = 0;
 
 	mutex_lock(&po->pg_vec_lock);
 
@@ -2646,12 +2650,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
 		size_max = dev->mtu + reserve + VLAN_HLEN;
 
+	reinit_completion(&po->skb_completion);
+
 	do {
 		ph = packet_current_frame(po, &po->tx_ring,
 					  TP_STATUS_SEND_REQUEST);
 		if (unlikely(ph == NULL)) {
-			if (need_wait && need_resched())
-				schedule();
+			if (need_wait && skb) {
+				timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
+				timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
+				if (timeo <= 0) {
+					err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
+					goto out_put;
+				}
+			}
+			/* check for additional frames */
 			continue;
 		}
 
@@ -3207,6 +3220,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock_init_data(sock, sk);
 
 	po = pkt_sk(sk);
+	init_completion(&po->skb_completion);
 	sk->sk_family = PF_PACKET;
 	po->num = proto;
 	po->xmit = dev_queue_xmit;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 3bb7c5fb3bff..c70a2794456f 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -128,6 +128,7 @@ struct packet_sock {
 	unsigned int		tp_hdrlen;
 	unsigned int		tp_reserve;
 	unsigned int		tp_tstamp;
+	struct completion	skb_completion;
 	struct net_device __rcu	*cached_dev;
 	int			(*xmit)(struct sk_buff *skb);
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
-- 
cgit