From e009f95b1543e26606dca2f7e6e9f0f9174538e5 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 7 Oct 2020 23:18:21 -0700
Subject: can: j1935: j1939_tp_tx_dat_new(): fix missing initialization of
 skbcnt

This fixes an uninit-value warning:
BUG: KMSAN: uninit-value in can_receive+0x26b/0x630 net/can/af_can.c:650

Reported-and-tested-by: syzbot+3f3837e61a48d32b495f@syzkaller.appspotmail.com
Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol")
Cc: Robin van der Gracht <robin@protonic.nl>
Cc: Oleksij Rempel <linux@rempel-privat.de>
Cc: Pengutronix Kernel Team <kernel@pengutronix.de>
Cc: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Link: https://lore.kernel.org/r/20201008061821.24663-1-xiyou.wangcong@gmail.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 net/can/j1939/transport.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index 0cec4152f979..88cf1062e1e9 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -580,6 +580,7 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv,
 	skb->dev = priv->ndev;
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 	/* reserve CAN header */
 	skb_reserve(skb, offsetof(struct can_frame, data));
 
-- 
cgit 


From 13ba4c434422837d7c8c163f9c8d854e67bf3c99 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Thu, 8 Oct 2020 23:23:10 +0200
Subject: net: j1939: j1939_session_fresh_new(): fix missing initialization of
 skbcnt

This patch add the initialization of skbcnt, similar to:

    e009f95b1543 can: j1935: j1939_tp_tx_dat_new(): fix missing initialization of skbcnt

Let's play save and initialize this skbcnt as well.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol")
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 net/can/j1939/transport.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index 88cf1062e1e9..e09d087ba240 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -1488,6 +1488,7 @@ j1939_session *j1939_session_fresh_new(struct j1939_priv *priv,
 	skb->dev = priv->ndev;
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 	skcb = j1939_skb_to_cb(skb);
 	memcpy(skcb, rel_skcb, sizeof(*skcb));
 
-- 
cgit 


From 37198e93ced70733f0b993dff28b7c33857e254f Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 6 Oct 2020 18:26:17 +0200
Subject: net: mptcp: make DACK4/DACK8 usage consistent among all subflows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

using packetdrill it's possible to observe the same MPTCP DSN being acked
by different subflows with DACK4 and DACK8. This is in contrast with what
specified in RFC8684 §3.3.2: if an MPTCP endpoint transmits a 64-bit wide
DSN, it MUST be acknowledged with a 64-bit wide DACK. Fix 'use_64bit_ack'
variable to make it a property of MPTCP sockets, not TCP subflows.

Fixes: a0c1d0eafd1e ("mptcp: Use 32-bit DATA_ACK when possible")
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/mptcp/options.c  | 2 +-
 net/mptcp/protocol.h | 2 +-
 net/mptcp/subflow.c  | 3 +--
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 888bbbbb3e8a..9d7fa93fe0cf 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -516,7 +516,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 		return ret;
 	}
 
-	if (subflow->use_64bit_ack) {
+	if (READ_ONCE(msk->use_64bit_ack)) {
 		ack_size = TCPOLEN_MPTCP_DSS_ACK64;
 		opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq);
 		opts->ext_copy.ack64 = 1;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 20f04ac85409..285dd8b2b43a 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -202,6 +202,7 @@ struct mptcp_sock {
 	bool		fully_established;
 	bool		rcv_data_fin;
 	bool		snd_data_fin_enable;
+	bool		use_64bit_ack; /* Set when we received a 64-bit DSN */
 	spinlock_t	join_list_lock;
 	struct work_struct work;
 	struct list_head conn_list;
@@ -294,7 +295,6 @@ struct mptcp_subflow_context {
 		backup : 1,
 		data_avail : 1,
 		rx_eof : 1,
-		use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
 		can_ack : 1;	    /* only after processing the remote a key */
 	u32	remote_nonce;
 	u64	thmac;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 6f035af1c9d2..91bef7bfffa6 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -769,12 +769,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
 	if (!mpext->dsn64) {
 		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
 				     mpext->data_seq);
-		subflow->use_64bit_ack = 0;
 		pr_debug("expanded seq=%llu", subflow->map_seq);
 	} else {
 		map_seq = mpext->data_seq;
-		subflow->use_64bit_ack = 1;
 	}
+	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);
 
 	if (subflow->map_valid) {
 		/* Allow replacing only with an identical map */
-- 
cgit 


From 4a65dff81a04f874fa6915c7f069b4dc2c4010e4 Mon Sep 17 00:00:00 2001
From: Georg Kohmann <geokohma@cisco.com>
Date: Wed, 7 Oct 2020 14:53:02 +0200
Subject: net: ipv6: Discard next-hop MTU less than minimum link MTU

When a ICMPV6_PKT_TOOBIG report a next-hop MTU that is less than the IPv6
minimum link MTU, the estimated path MTU is reduced to the minimum link
MTU. This behaviour breaks TAHI IPv6 Core Conformance Test v6LC4.1.6:
Packet Too Big Less than IPv6 MTU.

Referring to RFC 8201 section 4: "If a node receives a Packet Too Big
message reporting a next-hop MTU that is less than the IPv6 minimum link
MTU, it must discard it. A node must not reduce its estimate of the Path
MTU below the IPv6 minimum link MTU on receipt of a Packet Too Big
message."

Drop the path MTU update if reported MTU is less than the minimum link MTU.

Signed-off-by: Georg Kohmann <geokohma@cisco.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv6/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fb075d9545b9..27430d6e1a99 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2745,7 +2745,8 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 	if (confirm_neigh)
 		dst_confirm_neigh(dst, daddr);
 
-	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+	if (mtu < IPV6_MIN_MTU)
+		return;
 	if (mtu >= dst_mtu(dst))
 		return;
 
-- 
cgit 


From ea1dd3e9d080c961b9a451130b61c72dc9a5397b Mon Sep 17 00:00:00 2001
From: Rohit Maheshwari <rohitm@chelsio.com>
Date: Thu, 8 Oct 2020 00:10:21 +0530
Subject: net/tls: sendfile fails with ktls offload

At first when sendpage gets called, if there is more data, 'more' in
tls_push_data() gets set which later sets pending_open_record_frags, but
when there is no more data in file left, and last time tls_push_data()
gets called, pending_open_record_frags doesn't get reset. And later when
2 bytes of encrypted alert comes as sendmsg, it first checks for
pending_open_record_frags, and since this is set, it creates a record with
0 data bytes to encrypt, meaning record length is prepend_size + tag_size
only, which causes problem.
 We should set/reset pending_open_record_frags based on more bit.

Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure")
Signed-off-by: Rohit Maheshwari <rohitm@chelsio.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/tls/tls_device.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index b74e2741f74f..cec86229a6a0 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -418,14 +418,14 @@ static int tls_push_data(struct sock *sk,
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_prot_info *prot = &tls_ctx->prot_info;
 	struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
-	int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
 	struct tls_record_info *record = ctx->open_record;
 	int tls_push_record_flags;
 	struct page_frag *pfrag;
 	size_t orig_size = size;
 	u32 max_open_record_len;
-	int copy, rc = 0;
+	bool more = false;
 	bool done = false;
+	int copy, rc = 0;
 	long timeo;
 
 	if (flags &
@@ -492,9 +492,8 @@ handle_error:
 		if (!size) {
 last_record:
 			tls_push_record_flags = flags;
-			if (more) {
-				tls_ctx->pending_open_record_frags =
-						!!record->num_frags;
+			if (flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE)) {
+				more = true;
 				break;
 			}
 
@@ -526,6 +525,8 @@ last_record:
 		}
 	} while (!done);
 
+	tls_ctx->pending_open_record_frags = more;
+
 	if (orig_size - size > 0)
 		rc = orig_size - size;
 
-- 
cgit 


From ed42989eab57d619667d7e87dfbd8fe207db54fe Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 7 Oct 2020 21:12:50 -0700
Subject: tipc: fix the skb_unshare() in tipc_buf_append()

skb_unshare() drops a reference count on the old skb unconditionally,
so in the failure case, we end up freeing the skb twice here.
And because the skb is allocated in fclone and cloned by caller
tipc_msg_reassemble(), the consequence is actually freeing the
original skb too, thus triggered the UAF by syzbot.

Fix this by replacing this skb_unshare() with skb_cloned()+skb_copy().

Fixes: ff48b6222e65 ("tipc: use skb_unshare() instead in tipc_buf_append()")
Reported-and-tested-by: syzbot+e96a7ba46281824cc46a@syzkaller.appspotmail.com
Cc: Jon Maloy <jmaloy@redhat.com>
Cc: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/tipc/msg.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 52e93ba4d8e2..681224401871 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -150,7 +150,8 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
 	if (fragid == FIRST_FRAGMENT) {
 		if (unlikely(head))
 			goto err;
-		frag = skb_unshare(frag, GFP_ATOMIC);
+		if (skb_cloned(frag))
+			frag = skb_copy(frag, GFP_ATOMIC);
 		if (unlikely(!frag))
 			goto err;
 		head = *headbuf = frag;
-- 
cgit 


From 7b50ee3dad2581dc022b4e32e55964d4fcdccf20 Mon Sep 17 00:00:00 2001
From: Hoang Huu Le <hoang.h.le@dektech.com.au>
Date: Thu, 8 Oct 2020 14:31:56 +0700
Subject: tipc: fix NULL pointer dereference in tipc_named_rcv

In the function node_lost_contact(), we call __skb_queue_purge() without
grabbing the list->lock. This can cause to a race-condition why processing
the list 'namedq' in calling path tipc_named_rcv()->tipc_named_dequeue().

    [] BUG: kernel NULL pointer dereference, address: 0000000000000000
    [] #PF: supervisor read access in kernel mode
    [] #PF: error_code(0x0000) - not-present page
    [] PGD 7ca63067 P4D 7ca63067 PUD 6c553067 PMD 0
    [] Oops: 0000 [#1] SMP NOPTI
    [] CPU: 1 PID: 15 Comm: ksoftirqd/1 Tainted: G  O  5.9.0-rc6+ #2
    [] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS [...]
    [] RIP: 0010:tipc_named_rcv+0x103/0x320 [tipc]
    [] Code: 41 89 44 24 10 49 8b 16 49 8b 46 08 49 c7 06 00 00 00 [...]
    [] RSP: 0018:ffffc900000a7c58 EFLAGS: 00000282
    [] RAX: 00000000000012ec RBX: 0000000000000000 RCX: ffff88807bde1270
    [] RDX: 0000000000002c7c RSI: 0000000000002c7c RDI: ffff88807b38f1a8
    [] RBP: ffff88807b006288 R08: ffff88806a367800 R09: ffff88806a367900
    [] R10: ffff88806a367a00 R11: ffff88806a367b00 R12: ffff88807b006258
    [] R13: ffff88807b00628a R14: ffff888069334d00 R15: ffff88806a434600
    [] FS:  0000000000000000(0000) GS:ffff888079480000(0000) knlGS:0[...]
    [] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    [] CR2: 0000000000000000 CR3: 0000000077320000 CR4: 00000000000006e0
    [] Call Trace:
    []  ? tipc_bcast_rcv+0x9a/0x1a0 [tipc]
    []  tipc_rcv+0x40d/0x670 [tipc]
    []  ? _raw_spin_unlock+0xa/0x20
    []  tipc_l2_rcv_msg+0x55/0x80 [tipc]
    []  __netif_receive_skb_one_core+0x8c/0xa0
    []  process_backlog+0x98/0x140
    []  net_rx_action+0x13a/0x420
    []  __do_softirq+0xdb/0x316
    []  ? smpboot_thread_fn+0x2f/0x1e0
    []  ? smpboot_thread_fn+0x74/0x1e0
    []  ? smpboot_thread_fn+0x14e/0x1e0
    []  run_ksoftirqd+0x1a/0x40
    []  smpboot_thread_fn+0x149/0x1e0
    []  ? sort_range+0x20/0x20
    []  kthread+0x131/0x150
    []  ? kthread_unuse_mm+0xa0/0xa0
    []  ret_from_fork+0x22/0x30
    [] Modules linked in: veth tipc(O) ip6_udp_tunnel udp_tunnel [...]
    [] CR2: 0000000000000000
    [] ---[ end trace 65c276a8e2e2f310 ]---

To fix this, we need to grab the lock of the 'namedq' list on both
path calling.

Fixes: cad2929dc432 ("tipc: update a binding service via broadcast")
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Hoang Huu Le <hoang.h.le@dektech.com.au>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/tipc/name_distr.c | 10 +++++++++-
 net/tipc/node.c       |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 2f9c148f17e2..fe4edce459ad 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -327,8 +327,13 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq,
 	struct tipc_msg *hdr;
 	u16 seqno;
 
+	spin_lock_bh(&namedq->lock);
 	skb_queue_walk_safe(namedq, skb, tmp) {
-		skb_linearize(skb);
+		if (unlikely(skb_linearize(skb))) {
+			__skb_unlink(skb, namedq);
+			kfree_skb(skb);
+			continue;
+		}
 		hdr = buf_msg(skb);
 		seqno = msg_named_seqno(hdr);
 		if (msg_is_last_bulk(hdr)) {
@@ -338,12 +343,14 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq,
 
 		if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) {
 			__skb_unlink(skb, namedq);
+			spin_unlock_bh(&namedq->lock);
 			return skb;
 		}
 
 		if (*open && (*rcv_nxt == seqno)) {
 			(*rcv_nxt)++;
 			__skb_unlink(skb, namedq);
+			spin_unlock_bh(&namedq->lock);
 			return skb;
 		}
 
@@ -353,6 +360,7 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq,
 			continue;
 		}
 	}
+	spin_unlock_bh(&namedq->lock);
 	return NULL;
 }
 
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 4edcee3088da..e4cf515e323f 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1485,7 +1485,7 @@ static void node_lost_contact(struct tipc_node *n,
 
 	/* Clean up broadcast state */
 	tipc_bcast_remove_peer(n->net, n->bc_entry.link);
-	__skb_queue_purge(&n->bc_entry.namedq);
+	skb_queue_purge(&n->bc_entry.namedq);
 
 	/* Abort any ongoing link failover */
 	for (i = 0; i < MAX_BEARERS; i++) {
-- 
cgit 


From 7e94e46c16d060fe272b658ad5e7c2374eeec1bb Mon Sep 17 00:00:00 2001
From: Pujin Shi <shipujin.t@gmail.com>
Date: Thu, 8 Oct 2020 20:19:28 +0800
Subject: net: smc: fix missing brace warning for old compilers

For older versions of gcc, the array = {0}; will cause warnings:

net/smc/smc_llc.c: In function 'smc_llc_send_link_delete_all':
net/smc/smc_llc.c:1317:9: warning: missing braces around initializer [-Wmissing-braces]
  struct smc_llc_msg_del_link delllc = {0};
         ^
net/smc/smc_llc.c:1317:9: warning: (near initialization for 'delllc.hd') [-Wmissing-braces]

1 warnings generated

Fixes: f3811fd7bc97 ("net/smc: send DELETE_LINK, ALL message and wait for send to complete")
Signed-off-by: Pujin Shi <shipujin.t@gmail.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/smc/smc_llc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 3ea33466ebe9..19c8ff7a3609 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -1314,7 +1314,7 @@ out:
  */
 void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn)
 {
-	struct smc_llc_msg_del_link delllc = {0};
+	struct smc_llc_msg_del_link delllc = {};
 	int i;
 
 	delllc.hd.common.type = SMC_LLC_DELETE_LINK;
-- 
cgit 


From 16cb3653803dee243547f95cb51f01bec1323cdf Mon Sep 17 00:00:00 2001
From: Pujin Shi <shipujin.t@gmail.com>
Date: Thu, 8 Oct 2020 20:19:29 +0800
Subject: net: smc: fix missing brace warning for old compilers

For older versions of gcc, the array = {0}; will cause warnings:

net/smc/smc_llc.c: In function 'smc_llc_add_link_local':
net/smc/smc_llc.c:1212:9: warning: missing braces around initializer [-Wmissing-braces]
  struct smc_llc_msg_add_link add_llc = {0};
         ^
net/smc/smc_llc.c:1212:9: warning: (near initialization for 'add_llc.hd') [-Wmissing-braces]
net/smc/smc_llc.c: In function 'smc_llc_srv_delete_link_local':
net/smc/smc_llc.c:1245:9: warning: missing braces around initializer [-Wmissing-braces]
  struct smc_llc_msg_del_link del_llc = {0};
         ^
net/smc/smc_llc.c:1245:9: warning: (near initialization for 'del_llc.hd') [-Wmissing-braces]

2 warnings generated

Fixes: 4dadd151b265 ("net/smc: enqueue local LLC messages")
Signed-off-by: Pujin Shi <shipujin.t@gmail.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/smc/smc_llc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 19c8ff7a3609..f5f6487bb847 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -1209,7 +1209,7 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr)
 /* enqueue a local add_link req to trigger a new add_link flow */
 void smc_llc_add_link_local(struct smc_link *link)
 {
-	struct smc_llc_msg_add_link add_llc = {0};
+	struct smc_llc_msg_add_link add_llc = {};
 
 	add_llc.hd.length = sizeof(add_llc);
 	add_llc.hd.common.type = SMC_LLC_ADD_LINK;
@@ -1242,7 +1242,7 @@ out:
  */
 void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id)
 {
-	struct smc_llc_msg_del_link del_llc = {0};
+	struct smc_llc_msg_del_link del_llc = {};
 
 	del_llc.hd.length = sizeof(del_llc);
 	del_llc.hd.common.type = SMC_LLC_DELETE_LINK;
-- 
cgit 


From d582484726c4c46c8580923e855665fb91e3463e Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 9 Oct 2020 19:00:00 +0200
Subject: mptcp: fix fallback for MP_JOIN subflows

Additional/MP_JOIN subflows that do not pass some initial handshake
tests currently causes fallback to TCP. That is an RFC violation:
we should instead reset the subflow and leave the the msk untouched.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/91
Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests")
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/mptcp/options.c  | 32 +++++++++++++++++++++++++-------
 net/mptcp/protocol.h |  1 +
 net/mptcp/subflow.c  | 10 ++++++++--
 3 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 9d7fa93fe0cf..b2f018d32840 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -626,6 +626,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 	if (unlikely(mptcp_check_fallback(sk)))
 		return false;
 
+	/* prevent adding of any MPTCP related options on reset packet
+	 * until we support MP_TCPRST/MP_FASTCLOSE
+	 */
+	if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
+		return false;
+
 	if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
 		ret = true;
 	else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -676,7 +682,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
 	return false;
 }
 
-static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
+static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
 				    struct mptcp_subflow_context *subflow,
 				    struct sk_buff *skb,
 				    struct mptcp_options_received *mp_opt)
@@ -693,15 +699,20 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
 		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
 		    subflow->mp_join && mp_opt->mp_join &&
 		    READ_ONCE(msk->pm.server_side))
-			tcp_send_ack(sk);
+			tcp_send_ack(ssk);
 		goto fully_established;
 	}
 
-	/* we should process OoO packets before the first subflow is fully
-	 * established, but not expected for MP_JOIN subflows
+	/* we must process OoO packets before the first subflow is fully
+	 * established. OoO packets are instead a protocol violation
+	 * for MP_JOIN subflows as the peer must not send any data
+	 * before receiving the forth ack - cfr. RFC 8684 section 3.2.
 	 */
-	if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)
+	if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
+		if (subflow->mp_join)
+			goto reset;
 		return subflow->mp_capable;
+	}
 
 	if (mp_opt->dss && mp_opt->use_ack) {
 		/* subflows are fully established as soon as we get any
@@ -713,9 +724,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
 	}
 
 	/* If the first established packet does not contain MP_CAPABLE + data
-	 * then fallback to TCP
+	 * then fallback to TCP. Fallback scenarios requires a reset for
+	 * MP_JOIN subflows.
 	 */
 	if (!mp_opt->mp_capable) {
+		if (subflow->mp_join)
+			goto reset;
 		subflow->mp_capable = 0;
 		pr_fallback(msk);
 		__mptcp_do_fallback(msk);
@@ -732,12 +746,16 @@ fully_established:
 
 	subflow->pm_notified = 1;
 	if (subflow->mp_join) {
-		clear_3rdack_retransmission(sk);
+		clear_3rdack_retransmission(ssk);
 		mptcp_pm_subflow_established(msk, subflow);
 	} else {
 		mptcp_pm_fully_established(msk);
 	}
 	return true;
+
+reset:
+	mptcp_subflow_reset(ssk);
+	return false;
 }
 
 static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 285dd8b2b43a..a26b33556e9b 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -348,6 +348,7 @@ void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
 				     struct mptcp_options_received *mp_opt);
 bool mptcp_subflow_data_available(struct sock *sk);
 void __init mptcp_subflow_init(void);
+void mptcp_subflow_reset(struct sock *ssk);
 
 /* called with sk socket lock held */
 int __mptcp_subflow_connect(struct sock *sk, int ifindex,
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 91bef7bfffa6..298bcfdd05f1 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -270,6 +270,13 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
 	return thmac == subflow->thmac;
 }
 
+void mptcp_subflow_reset(struct sock *ssk)
+{
+	tcp_set_state(ssk, TCP_CLOSE);
+	tcp_send_active_reset(ssk, GFP_ATOMIC);
+	tcp_done(ssk);
+}
+
 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -342,8 +349,7 @@ fallback:
 	return;
 
 do_reset:
-	tcp_send_active_reset(sk, GFP_ATOMIC);
-	tcp_done(sk);
+	mptcp_subflow_reset(sk);
 }
 
 struct request_sock_ops mptcp_subflow_request_sock_ops;
-- 
cgit 


From 0e4f35d7880157ceccf0a58377d778b02762af82 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 9 Oct 2020 19:00:01 +0200
Subject: mptcp: subflows garbage collection

The msk can close MP_JOIN subflows if the initial handshake
fails. Currently such subflows are kept alive in the
conn_list until the msk itself is closed.

Beyond the wasted memory, we could end-up sending the
DATA_FIN and the DATA_FIN ack on such socket, even after a
reset.

Fixes: 43b54c6ee382 ("mptcp: Use full MPTCP-level disconnect state machine")
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/mptcp/protocol.c | 17 +++++++++++++++++
 net/mptcp/protocol.h |  1 +
 net/mptcp/subflow.c  |  6 ++++++
 3 files changed, 24 insertions(+)

(limited to 'net')

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5d747c6a610e..b295eb6e9580 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1383,6 +1383,20 @@ static void pm_work(struct mptcp_sock *msk)
 	spin_unlock_bh(&msk->pm.lock);
 }
 
+static void __mptcp_close_subflow(struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow, *tmp;
+
+	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+		if (inet_sk_state_load(ssk) != TCP_CLOSE)
+			continue;
+
+		__mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0);
+	}
+}
+
 static void mptcp_worker(struct work_struct *work)
 {
 	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -1400,6 +1414,9 @@ static void mptcp_worker(struct work_struct *work)
 	mptcp_clean_una(sk);
 	mptcp_check_data_fin_ack(sk);
 	__mptcp_flush_join_list(msk);
+	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+		__mptcp_close_subflow(msk);
+
 	__mptcp_move_skbs(msk);
 
 	if (msk->pm.status)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index a26b33556e9b..972463642690 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -90,6 +90,7 @@
 #define MPTCP_WORK_RTX		2
 #define MPTCP_WORK_EOF		3
 #define MPTCP_FALLBACK_DONE	4
+#define MPTCP_WORK_CLOSE_SUBFLOW 5
 
 struct mptcp_options_received {
 	u64	sndr_key;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 298bcfdd05f1..559f5bbd9622 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -272,9 +272,15 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
 
 void mptcp_subflow_reset(struct sock *ssk)
 {
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+	struct sock *sk = subflow->conn;
+
 	tcp_set_state(ssk, TCP_CLOSE);
 	tcp_send_active_reset(ssk, GFP_ATOMIC);
 	tcp_done(ssk);
+	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
+	    schedule_work(&mptcp_sk(sk)->work))
+		sock_hold(sk);
 }
 
 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
-- 
cgit 


From 874fb9e2ca949b443cc419a4f2227cafd4381d39 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Fri, 9 Oct 2020 11:01:01 -0700
Subject: ipv4: Restore flowi4_oif update before call to xfrm_lookup_route

Tobias reported regressions in IPsec tests following the patch
referenced by the Fixes tag below. The root cause is dropping the
reset of the flowi4_oif after the fib_lookup. Apparently it is
needed for xfrm cases, so restore the oif update to ip_route_output_flow
right before the call to xfrm_lookup_route.

Fixes: 2fbc6e89b2f1 ("ipv4: Update exception handling for multipath routes via same device")
Reported-by: Tobias Brunner <tobias@strongswan.org>
Signed-off-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 58642b29a499..9bd30fd4de4b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2769,10 +2769,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 	if (IS_ERR(rt))
 		return rt;
 
-	if (flp4->flowi4_proto)
+	if (flp4->flowi4_proto) {
+		flp4->flowi4_oif = rt->dst.dev->ifindex;
 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
 							flowi4_to_flowi(flp4),
 							sk, 0);
+	}
 
 	return rt;
 }
-- 
cgit 


From 7980d2eabde82be86c5be18aa3d07e88ec13c6a1 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 9 Oct 2020 21:24:25 +0300
Subject: ipvs: clear skb->tstamp in forwarding path

fq qdisc requires tstamp to be cleared in forwarding path

Reported-by: Evgeny B <abt-admin@mail.ru>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=209427
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Fixes: 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths")
Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC")
Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Reviewed-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index b00866d777fe..d2e5a8f644b8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -609,6 +609,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
 	if (ret == NF_ACCEPT) {
 		nf_reset_ct(skb);
 		skb_forward_csum(skb);
+		if (skb->dev)
+			skb->tstamp = 0;
 	}
 	return ret;
 }
@@ -649,6 +651,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
 
 	if (!local) {
 		skb_forward_csum(skb);
+		if (skb->dev)
+			skb->tstamp = 0;
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output);
 	} else
@@ -669,6 +673,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
 	if (!local) {
 		ip_vs_drop_early_demux_sk(skb);
 		skb_forward_csum(skb);
+		if (skb->dev)
+			skb->tstamp = 0;
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output);
 	} else
-- 
cgit 


From 98a381a7d4892dd9969d24433a4bca2f45092643 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 12 Oct 2020 16:54:30 +0200
Subject: netfilter: nftables: extend error reporting for chain updates

The initial support for netlink extended ACK is missing the chain update
path, which results in misleading error reporting in case of EEXIST.

Fixes 36dd1bcc07e5 ("netfilter: nf_tables: initial support for extended ACK reporting")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4603b667973a..0e43063767d6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2103,7 +2103,8 @@ static bool nft_hook_list_equal(struct list_head *hook_list1,
 }
 
 static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
-			      u32 flags)
+			      u32 flags, const struct nlattr *attr,
+			      struct netlink_ext_ack *extack)
 {
 	const struct nlattr * const *nla = ctx->nla;
 	struct nft_table *table = ctx->table;
@@ -2119,9 +2120,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 		return -EOPNOTSUPP;
 
 	if (nla[NFTA_CHAIN_HOOK]) {
-		if (!nft_is_base_chain(chain))
+		if (!nft_is_base_chain(chain)) {
+			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
-
+		}
 		err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
 					   false);
 		if (err < 0)
@@ -2130,6 +2132,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 		basechain = nft_base_chain(chain);
 		if (basechain->type != hook.type) {
 			nft_chain_release_hook(&hook);
+			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
 		}
 
@@ -2137,6 +2140,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			if (!nft_hook_list_equal(&basechain->hook_list,
 						 &hook.list)) {
 				nft_chain_release_hook(&hook);
+				NL_SET_BAD_ATTR(extack, attr);
 				return -EEXIST;
 			}
 		} else {
@@ -2144,6 +2148,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			if (ops->hooknum != hook.num ||
 			    ops->priority != hook.priority) {
 				nft_chain_release_hook(&hook);
+				NL_SET_BAD_ATTR(extack, attr);
 				return -EEXIST;
 			}
 		}
@@ -2156,8 +2161,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 
 		chain2 = nft_chain_lookup(ctx->net, table,
 					  nla[NFTA_CHAIN_NAME], genmask);
-		if (!IS_ERR(chain2))
+		if (!IS_ERR(chain2)) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
 			return -EEXIST;
+		}
 	}
 
 	if (nla[NFTA_CHAIN_COUNTERS]) {
@@ -2200,6 +2207,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			    nft_trans_chain_update(tmp) &&
 			    nft_trans_chain_name(tmp) &&
 			    strcmp(name, nft_trans_chain_name(tmp)) == 0) {
+				NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
 				kfree(name);
 				goto err;
 			}
@@ -2322,7 +2330,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 			return -EOPNOTSUPP;
 
 		flags |= chain->flags & NFT_CHAIN_BASE;
-		return nf_tables_updchain(&ctx, genmask, policy, flags);
+		return nf_tables_updchain(&ctx, genmask, policy, flags, attr,
+					  extack);
 	}
 
 	return nf_tables_addchain(&ctx, family, genmask, policy, flags);
-- 
cgit 


From 0d9826bc18ce356e8909919ad681ad65d0a6061e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 12 Oct 2020 17:06:06 +0200
Subject: netfilter: nf_log: missing vlan offload tag and proto

Dump vlan tag and proto for the usual vlan offload case if the
NF_LOG_MACDECODE flag is set on. Without this information the logging is
misleading as there is no reference to the VLAN header.

[12716.993704] test: IN=veth0 OUT= MACSRC=86:6c:92:ea:d6:73 MACDST=0e:3b:eb:86:73:76 VPROTO=8100 VID=10 MACPROTO=0800 SRC=192.168.10.2 DST=172.217.168.163 LEN=52 TOS=0x00 PREC=0x00 TTL=64 ID=2548 DF PROTO=TCP SPT=55848 DPT=80 WINDOW=501 RES=0x00 ACK FIN URGP=0
[12721.157643] test: IN=veth0 OUT= MACSRC=86:6c:92:ea:d6:73 MACDST=0e:3b:eb:86:73:76 VPROTO=8100 VID=10 MACPROTO=0806 ARP HTYPE=1 PTYPE=0x0800 OPCODE=2 MACSRC=86:6c:92:ea:d6:73 IPSRC=192.168.10.2 MACDST=0e:3b:eb:86:73:76 IPDST=192.168.10.1

Fixes: 83e96d443b37 ("netfilter: log: split family specific code to nf_log_{ip,ip6,common}.c files")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_log_arp.c  | 19 +++++++++++++++++--
 net/ipv4/netfilter/nf_log_ipv4.c |  6 ++++--
 net/ipv6/netfilter/nf_log_ipv6.c |  8 +++++---
 net/netfilter/nf_log_common.c    | 12 ++++++++++++
 4 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index 7a83f881efa9..136030ad2e54 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -43,16 +43,31 @@ static void dump_arp_packet(struct nf_log_buf *m,
 			    const struct nf_loginfo *info,
 			    const struct sk_buff *skb, unsigned int nhoff)
 {
-	const struct arphdr *ah;
-	struct arphdr _arph;
 	const struct arppayload *ap;
 	struct arppayload _arpp;
+	const struct arphdr *ah;
+	unsigned int logflags;
+	struct arphdr _arph;
 
 	ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
 	if (ah == NULL) {
 		nf_log_buf_add(m, "TRUNCATED");
 		return;
 	}
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_DEFAULT_MASK;
+
+	if (logflags & NF_LOG_MACDECODE) {
+		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+			       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+		nf_log_dump_vlan(m, skb);
+		nf_log_buf_add(m, "MACPROTO=%04x ",
+			       ntohs(eth_hdr(skb)->h_proto));
+	}
+
 	nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
 		       ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
 
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 0c72156130b6..d07583fac8f8 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -284,8 +284,10 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m,
 
 	switch (dev->type) {
 	case ARPHRD_ETHER:
-		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
-			       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+			       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+		nf_log_dump_vlan(m, skb);
+		nf_log_buf_add(m, "MACPROTO=%04x ",
 			       ntohs(eth_hdr(skb)->h_proto));
 		return;
 	default:
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index da64550a5707..8210ff34ed9b 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -297,9 +297,11 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m,
 
 	switch (dev->type) {
 	case ARPHRD_ETHER:
-		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
-		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
-		       ntohs(eth_hdr(skb)->h_proto));
+		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+			       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+		nf_log_dump_vlan(m, skb);
+		nf_log_buf_add(m, "MACPROTO=%04x ",
+			       ntohs(eth_hdr(skb)->h_proto));
 		return;
 	default:
 		break;
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index ae5628ddbe6d..fd7c5f0f5c25 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -171,6 +171,18 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
 }
 EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
 
+void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb)
+{
+	u16 vid;
+
+	if (!skb_vlan_tag_present(skb))
+		return;
+
+	vid = skb_vlan_tag_get(skb);
+	nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid);
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_vlan);
+
 /* bridge and netdev logging families share this code. */
 void nf_log_l2packet(struct net *net, u_int8_t pf,
 		     __be16 protocol,
-- 
cgit 


From 59e611a566e7cd48cf54b6777a11fe3f9c2f9db5 Mon Sep 17 00:00:00 2001
From: Christian Eggers <ceggers@arri.de>
Date: Mon, 12 Oct 2020 11:35:41 +0200
Subject: socket: fix option SO_TIMESTAMPING_NEW

The comparison of optname with SO_TIMESTAMPING_NEW is wrong way around,
so SOCK_TSTAMP_NEW will first be set and than reset again. Additionally
move it out of the test for SOF_TIMESTAMPING_RX_SOFTWARE as this seems
unrelated.

This problem happens on 32 bit platforms were the libc has already
switched to struct timespec64 (from SO_TIMExxx_OLD to SO_TIMExxx_NEW
socket options). ptp4l complains with "missing timestamp on transmitted
peer delay request" because the wrong format is received (and
discarded).

Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW")
Signed-off-by: Christian Eggers <ceggers@arri.de>
Reviewed-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Reviewed-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/core/sock.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 6c5c6b18eff4..dfe5d8047850 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1007,8 +1007,6 @@ set_sndbuf:
 		__sock_set_timestamps(sk, valbool, true, true);
 		break;
 	case SO_TIMESTAMPING_NEW:
-		sock_set_flag(sk, SOCK_TSTAMP_NEW);
-		fallthrough;
 	case SO_TIMESTAMPING_OLD:
 		if (val & ~SOF_TIMESTAMPING_MASK) {
 			ret = -EINVAL;
@@ -1037,16 +1035,14 @@ set_sndbuf:
 		}
 
 		sk->sk_tsflags = val;
+		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+
 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 			sock_enable_timestamp(sk,
 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
-		else {
-			if (optname == SO_TIMESTAMPING_NEW)
-				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-
+		else
 			sock_disable_timestamp(sk,
 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
-		}
 		break;
 
 	case SO_RCVLOWAT:
-- 
cgit 


From 4e3bbb33e6f36e4b05be1b1b9b02e3dd5aaa3e69 Mon Sep 17 00:00:00 2001
From: Christian Eggers <ceggers@arri.de>
Date: Mon, 12 Oct 2020 11:35:42 +0200
Subject: socket: don't clear SOCK_TSTAMP_NEW when SO_TIMESTAMPNS is disabled

SOCK_TSTAMP_NEW (timespec64 instead of timespec) is also used for
hardware time stamps (configured via SO_TIMESTAMPING_NEW).

User space (ptp4l) first configures hardware time stamping via
SO_TIMESTAMPING_NEW which sets SOCK_TSTAMP_NEW. In the next step, ptp4l
disables SO_TIMESTAMPNS(_NEW) (software time stamps), but this must not
switch hardware time stamps back to "32 bit mode".

This problem happens on 32 bit platforms were the libc has already
switched to struct timespec64 (from SO_TIMExxx_OLD to SO_TIMExxx_NEW
socket options). ptp4l complains with "missing timestamp on transmitted
peer delay request" because the wrong format is received (and
discarded).

Fixes: 887feae36aee ("socket: Add SO_TIMESTAMP[NS]_NEW")
Fixes: 783da70e8396 ("net: add sock_enable_timestamps")
Signed-off-by: Christian Eggers <ceggers@arri.de>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/core/sock.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index dfe5d8047850..40a95e0c4bad 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -769,7 +769,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 	} else {
 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
-		sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 	}
 }
 
-- 
cgit 


From fdafed459998e2be0e877e6189b24cb7a0183224 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 12 Oct 2020 16:17:21 -0700
Subject: ip_gre: set dev->hard_header_len and dev->needed_headroom properly

GRE tunnel has its own header_ops, ipgre_header_ops, and sets it
conditionally. When it is set, it assumes the outer IP header is
already created before ipgre_xmit().

This is not true when we send packets through a raw packet socket,
where L2 headers are supposed to be constructed by user. Packet
socket calls dev_validate_header() to validate the header. But
GRE tunnel does not set dev->hard_header_len, so that check can
be simply bypassed, therefore uninit memory could be passed down
to ipgre_xmit(). Similar for dev->needed_headroom.

dev->hard_header_len is supposed to be the length of the header
created by dev->header_ops->create(), so it should be used whenever
header_ops is set, and dev->needed_headroom should be used when it
is not set.

Reported-and-tested-by: syzbot+4a2c52677a8a1aa283cb@syzkaller.appspotmail.com
Cc: William Tu <u9012063@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Xie He <xie.he.0141@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/ip_gre.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4e31f23e4117..e70291748889 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -625,9 +625,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 	}
 
 	if (dev->header_ops) {
-		/* Need space for new headers */
-		if (skb_cow_head(skb, dev->needed_headroom -
-				      (tunnel->hlen + sizeof(struct iphdr))))
+		if (skb_cow_head(skb, 0))
 			goto free_skb;
 
 		tnl_params = (const struct iphdr *)skb->data;
@@ -748,7 +746,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
 	len = tunnel->tun_hlen - len;
 	tunnel->hlen = tunnel->hlen + len;
 
-	dev->needed_headroom = dev->needed_headroom + len;
+	if (dev->header_ops)
+		dev->hard_header_len += len;
+	else
+		dev->needed_headroom += len;
+
 	if (set_mtu)
 		dev->mtu = max_t(int, dev->mtu - len, 68);
 
@@ -944,6 +946,7 @@ static void __gre_tunnel_init(struct net_device *dev)
 	tunnel->parms.iph.protocol = IPPROTO_GRE;
 
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+	dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
 
 	dev->features		|= GRE_FEATURES;
 	dev->hw_features	|= GRE_FEATURES;
@@ -987,10 +990,14 @@ static int ipgre_tunnel_init(struct net_device *dev)
 				return -EINVAL;
 			dev->flags = IFF_BROADCAST;
 			dev->header_ops = &ipgre_header_ops;
+			dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+			dev->needed_headroom = 0;
 		}
 #endif
 	} else if (!tunnel->collect_md) {
 		dev->header_ops = &ipgre_header_ops;
+		dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+		dev->needed_headroom = 0;
 	}
 
 	return ip_tunnel_init(dev);
-- 
cgit 


From e1e84eb58eb494b77c8389fc6308b5042dcce791 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 12 Oct 2020 10:50:14 -0400
Subject: ipv4/icmp: l3mdev: Perform icmp error route lookup on source device
 routing table (v2)

As per RFC792, ICMP errors should be sent to the source host.

However, in configurations with Virtual Routing and Forwarding tables,
looking up which routing table to use is currently done by using the
destination net_device.

commit 9d1a6c4ea43e ("net: icmp_route_lookup should use rt dev to
determine L3 domain") changes the interface passed to
l3mdev_master_ifindex() and inet_addr_type_dev_table() from skb_in->dev
to skb_dst(skb_in)->dev. This effectively uses the destination device
rather than the source device for choosing which routing table should be
used to lookup where to send the ICMP error.

Therefore, if the source and destination interfaces are within separate
VRFs, or one in the global routing table and the other in a VRF, looking
up the source host in the destination interface's routing table will
fail if the destination interface's routing table contains no route to
the source host.

One observable effect of this issue is that traceroute does not work in
the following cases:

- Route leaking between global routing table and VRF
- Route leaking between VRFs

Preferably use the source device routing table when sending ICMP error
messages. If no source device is set, fall-back on the destination
device routing table. Else, use the main routing table (index 0).

[ It has been pointed out that a similar issue may exist with ICMP
  errors triggered when forwarding between network namespaces. It would
  be worthwhile to investigate, but is outside of the scope of this
  investigation. ]

[ It has also been pointed out that a similar issue exists with
  unreachable / fragmentation needed messages, which can be triggered by
  changing the MTU of eth1 in r1 to 1400 and running:

  ip netns exec h1 ping -s 1450 -Mdo -c1 172.16.2.2

  Some investigation points to raw_icmp_error() and raw_err() as being
  involved in this last scenario. The focus of this patch is TTL expired
  ICMP messages, which go through icmp_route_lookup.
  Investigation of failure modes related to raw_icmp_error() is beyond
  this investigation's scope. ]

Fixes: 9d1a6c4ea43e ("net: icmp_route_lookup should use rt dev to determine L3 domain")
Link: https://tools.ietf.org/html/rfc792
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/icmp.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cf36f955bfe6..9ea66d903c41 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -457,6 +457,23 @@ out_bh_enable:
 	local_bh_enable();
 }
 
+/*
+ * The device used for looking up which routing table to use for sending an ICMP
+ * error is preferably the source whenever it is set, which should ensure the
+ * icmp error can be sent to the source host, else lookup using the routing
+ * table of the destination device, else use the main routing table (index 0).
+ */
+static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb)
+{
+	struct net_device *route_lookup_dev = NULL;
+
+	if (skb->dev)
+		route_lookup_dev = skb->dev;
+	else if (skb_dst(skb))
+		route_lookup_dev = skb_dst(skb)->dev;
+	return route_lookup_dev;
+}
+
 static struct rtable *icmp_route_lookup(struct net *net,
 					struct flowi4 *fl4,
 					struct sk_buff *skb_in,
@@ -465,6 +482,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 					int type, int code,
 					struct icmp_bxm *param)
 {
+	struct net_device *route_lookup_dev;
 	struct rtable *rt, *rt2;
 	struct flowi4 fl4_dec;
 	int err;
@@ -479,7 +497,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->flowi4_proto = IPPROTO_ICMP;
 	fl4->fl4_icmp_type = type;
 	fl4->fl4_icmp_code = code;
-	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
+	route_lookup_dev = icmp_get_route_lookup_dev(skb_in);
+	fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev);
 
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
 	rt = ip_route_output_key_hash(net, fl4, skb_in);
@@ -503,7 +522,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	if (err)
 		goto relookup_failed;
 
-	if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev,
+	if (inet_addr_type_dev_table(net, route_lookup_dev,
 				     fl4_dec.saddr) == RTN_LOCAL) {
 		rt2 = __ip_route_output_key(net, &fl4_dec);
 		if (IS_ERR(rt2))
-- 
cgit 


From 272928d1cdacfc3b55f605cb0e9115832ecfb20c Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 12 Oct 2020 10:50:15 -0400
Subject: ipv6/icmp: l3mdev: Perform icmp error route lookup on source device
 routing table (v2)

As per RFC4443, the destination address field for ICMPv6 error messages
is copied from the source address field of the invoking packet.

In configurations with Virtual Routing and Forwarding tables, looking up
which routing table to use for sending ICMPv6 error messages is
currently done by using the destination net_device.

If the source and destination interfaces are within separate VRFs, or
one in the global routing table and the other in a VRF, looking up the
source address of the invoking packet in the destination interface's
routing table will fail if the destination interface's routing table
contains no route to the invoking packet's source address.

One observable effect of this issue is that traceroute6 does not work in
the following cases:

- Route leaking between global routing table and VRF
- Route leaking between VRFs

Use the source device routing table when sending ICMPv6 error
messages.

[ In the context of ipv4, it has been pointed out that a similar issue
  may exist with ICMP errors triggered when forwarding between network
  namespaces. It would be worthwhile to investigate whether ipv6 has
  similar issues, but is outside of the scope of this investigation. ]

[ Testing shows that similar issues exist with ipv6 unreachable /
  fragmentation needed messages.  However, investigation of this
  additional failure mode is beyond this investigation's scope. ]

Link: https://tools.ietf.org/html/rfc4443
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv6/icmp.c       | 7 +++++--
 net/ipv6/ip6_output.c | 2 --
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index a4e4912ad607..91209a2760aa 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -501,8 +501,11 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	if (__ipv6_addr_needs_scope_id(addr_type)) {
 		iif = icmp6_iif(skb);
 	} else {
-		dst = skb_dst(skb);
-		iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
+		/*
+		 * The source device is used for looking up which routing table
+		 * to use for sending an ICMP error.
+		 */
+		iif = l3mdev_master_ifindex(skb->dev);
 	}
 
 	/*
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c78e67d7747f..cd623068de53 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -468,8 +468,6 @@ int ip6_forward(struct sk_buff *skb)
 	 *	check and decrement ttl
 	 */
 	if (hdr->hop_limit <= 1) {
-		/* Force OUTPUT device used as source address */
-		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 
-- 
cgit 


From 1d273fcc2c29343e59658276b77b02e5897a3123 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 14 Oct 2020 17:17:49 +0800
Subject: bpfilter: Fix build error with CONFIG_BPFILTER_UMH

IF CONFIG_BPFILTER_UMH is set, building fails:

In file included from /usr/include/sys/socket.h:33:0,
                 from net/bpfilter/main.c:6:
/usr/include/bits/socket.h:390:10: fatal error: asm/socket.h: No such file or directory
 #include <asm/socket.h>
          ^~~~~~~~~~~~~~
compilation terminated.
scripts/Makefile.userprogs:43: recipe for target 'net/bpfilter/main.o' failed
make[2]: *** [net/bpfilter/main.o] Error 1

Add missing include path to fix this.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/bpfilter/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index cdac82b8c53a..389ea76ccc0b 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -5,7 +5,7 @@
 
 userprogs := bpfilter_umh
 bpfilter_umh-objs := main.o
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi
+userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi -I usr/include/
 
 ifeq ($(CONFIG_BPFILTER_UMH), y)
 # builtin bpfilter_umh should be linked with -static
-- 
cgit 


From d535ca1367787ddc8bff22d679a11f864c8228bc Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 14 Oct 2020 19:43:27 +0200
Subject: net/smc: fix use-after-free of delayed events

When a delayed event is enqueued then the event worker will send this
event the next time it is running and no other flow is currently
active. The event handler is called for the delayed event, and the
pointer to the event keeps set in lgr->delayed_event. This pointer is
cleared later in the processing by smc_llc_flow_start().
This can lead to a use-after-free condition when the processing does not
reach smc_llc_flow_start(), but frees the event because of an error
situation. Then the delayed_event pointer is still set but the event is
freed.
Fix this by always clearing the delayed event pointer when the event is
provided to the event handler for processing, and remove the code to
clear it in smc_llc_flow_start().

Fixes: 555da9af827d ("net/smc: add event-based llc_flow framework")
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/smc/smc_llc.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index f5f6487bb847..5e86926c83a1 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -233,8 +233,6 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow,
 	default:
 		flow->type = SMC_LLC_FLOW_NONE;
 	}
-	if (qentry == lgr->delayed_event)
-		lgr->delayed_event = NULL;
 	smc_llc_flow_qentry_set(flow, qentry);
 	spin_unlock_bh(&lgr->llc_flow_lock);
 	return true;
@@ -1603,13 +1601,12 @@ static void smc_llc_event_work(struct work_struct *work)
 	struct smc_llc_qentry *qentry;
 
 	if (!lgr->llc_flow_lcl.type && lgr->delayed_event) {
-		if (smc_link_usable(lgr->delayed_event->link)) {
-			smc_llc_event_handler(lgr->delayed_event);
-		} else {
-			qentry = lgr->delayed_event;
-			lgr->delayed_event = NULL;
+		qentry = lgr->delayed_event;
+		lgr->delayed_event = NULL;
+		if (smc_link_usable(qentry->link))
+			smc_llc_event_handler(qentry);
+		else
 			kfree(qentry);
-		}
 	}
 
 again:
-- 
cgit 


From ef12ad45880b696eb993d86c481ca891836ab593 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 14 Oct 2020 19:43:28 +0200
Subject: net/smc: fix valid DMBE buffer sizes

The SMCD_DMBE_SIZES should include all valid DMBE buffer sizes, so the
correct value is 6 which means 1MB. With 7 the registration of an ISM
buffer would always fail because of the invalid size requested.
Fix that and set the value to 6.

Fixes: c6ba7c9ba43d ("net/smc: add base infrastructure for SMC-D and ISM")
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/smc/smc_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index a406627b1d55..7c0e4fac9748 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1597,7 +1597,7 @@ out:
 	return rc;
 }
 
-#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
+#define SMCD_DMBE_SIZES		6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
 
 static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
 						bool is_dmb, int bufsize)
-- 
cgit 


From 6b1bbf94ab369d97ed3bdaa561521a52c27ef619 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 14 Oct 2020 19:43:29 +0200
Subject: net/smc: fix invalid return code in smcd_new_buf_create()

smc_ism_register_dmb() returns error codes set by the ISM driver which
are not guaranteed to be negative or in the errno range. Such values
would not be handled by ERR_PTR() and finally the return code will be
used as a memory address.
Fix that by using a valid negative errno value with ERR_PTR().

Fixes: 72b7f6c48708 ("net/smc: unique reason code for exceeded max dmb count")
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/smc/smc_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 7c0e4fac9748..59cc99fec2a2 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1616,7 +1616,8 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
 		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
 		if (rc) {
 			kfree(buf_desc);
-			return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : ERR_PTR(rc);
+			return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) :
+						 ERR_PTR(-EIO);
 		}
 		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
 		/* CDC header stored in buf. So, pretend it was smaller */
-- 
cgit 


From 6617dfd440149e42ce4d2be615eb31a4755f4d30 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 14 Oct 2020 07:46:12 -0700
Subject: net: fix pos incrementment in ipv6_route_seq_next

Commit 4fc427e05158 ("ipv6_route_seq_next should increase position index")
tried to fix the issue where seq_file pos is not increased
if a NULL element is returned with seq_ops->next(). See bug
  https://bugzilla.kernel.org/show_bug.cgi?id=206283
The commit effectively does:
  - increase pos for all seq_ops->start()
  - increase pos for all seq_ops->next()

For ipv6_route, increasing pos for all seq_ops->next() is correct.
But increasing pos for seq_ops->start() is not correct
since pos is used to determine how many items to skip during
seq_ops->start():
  iter->skip = *pos;
seq_ops->start() just fetches the *current* pos item.
The item can be skipped only after seq_ops->show() which essentially
is the beginning of seq_ops->next().

For example, I have 7 ipv6 route entries,
  root@arch-fb-vm1:~/net-next dd if=/proc/net/ipv6_route bs=4096
  00000000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000400 00000001 00000000 00000001     eth0
  fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
  00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001       lo
  fe800000000000002050e3fffebd3be8 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
  ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000004 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
  0+1 records in
  0+1 records out
  1050 bytes (1.0 kB, 1.0 KiB) copied, 0.00707908 s, 148 kB/s
  root@arch-fb-vm1:~/net-next

In the above, I specify buffer size 4096, so all records can be returned
to user space with a single trip to the kernel.

If I use buffer size 128, since each record size is 149, internally
kernel seq_read() will read 149 into its internal buffer and return the data
to user space in two read() syscalls. Then user read() syscall will trigger
next seq_ops->start(). Since the current implementation increased pos even
for seq_ops->start(), it will skip record #2, #4 and #6, assuming the first
record is #1.

  root@arch-fb-vm1:~/net-next dd if=/proc/net/ipv6_route bs=128
  00000000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000400 00000001 00000000 00000001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
  fe800000000000002050e3fffebd3be8 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001     eth0
  00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200       lo
4+1 records in
4+1 records out
600 bytes copied, 0.00127758 s, 470 kB/s

To fix the problem, create a fake pos pointer so seq_ops->start()
won't actually increase seq_file pos. With this fix, the
above `dd` command with `bs=128` will show correct result.

Fixes: 4fc427e05158 ("ipv6_route_seq_next should increase position index")
Cc: Alexei Starovoitov <ast@kernel.org>
Suggested-by: Vasily Averin <vvs@virtuozzo.com>
Reviewed-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv6/ip6_fib.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4a664ad4f4d4..f88693929e8d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2618,8 +2618,10 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
 	iter->skip = *pos;
 
 	if (iter->tbl) {
+		loff_t p = 0;
+
 		ipv6_route_seq_setup_walk(iter, net);
-		return ipv6_route_seq_next(seq, NULL, pos);
+		return ipv6_route_seq_next(seq, NULL, &p);
 	} else {
 		return NULL;
 	}
-- 
cgit 


From 2ecbc1f684482b4ed52447a39903bd9b0f222898 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 15 Oct 2020 12:33:24 -0700
Subject: Revert "bpfilter: Fix build error with CONFIG_BPFILTER_UMH"

This reverts commit 1d273fcc2c29343e59658276b77b02e5897a3123.

Alexei points out there's nothing implying headers will be built
and therefore exist under usr/include, so this fix doesn't make
much sense.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/bpfilter/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index 389ea76ccc0b..cdac82b8c53a 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -5,7 +5,7 @@
 
 userprogs := bpfilter_umh
 bpfilter_umh-objs := main.o
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi -I usr/include/
+userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi
 
 ifeq ($(CONFIG_BPFILTER_UMH), y)
 # builtin bpfilter_umh should be linked with -static
-- 
cgit