From 303216e76dcab6049c9d42390b1032f0649a8206 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Wed, 22 Jul 2020 20:36:43 +0200
Subject: batman-adv: Avoid uninitialized chaddr when handling DHCP

The gateway client code can try to optimize the delivery of DHCP packets to
avoid broadcasting them through the whole mesh. But also transmissions to
the client can be optimized by looking up the destination via the chaddr of
the DHCP packet.

But the chaddr is currently only done when chaddr is fully inside the
non-paged area of the skbuff. Otherwise it will not be initialized and the
unoptimized path should have been taken.

But the implementation didn't handle this correctly. It didn't retrieve the
correct chaddr but still tried to perform the TT lookup with this
uninitialized memory.

Reported-by: syzbot+ab16e463b903f5a37036@syzkaller.appspotmail.com
Fixes: 6c413b1c22a2 ("batman-adv: send every DHCP packet as bat-unicast")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Acked-by: Antonio Quartulli <a@unstable.cc>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/gateway_client.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index a18dcc686dc3..ef3f85b576c4 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -703,8 +703,10 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
 
 	chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET;
 	/* store the client address if the message is going to a client */
-	if (ret == BATADV_DHCP_TO_CLIENT &&
-	    pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) {
+	if (ret == BATADV_DHCP_TO_CLIENT) {
+		if (!pskb_may_pull(skb, chaddr_offset + ETH_ALEN))
+			return BATADV_DHCP_NO;
+
 		/* check if the DHCP packet carries an Ethernet DHCP */
 		p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET;
 		if (*p != BATADV_DHCP_HTYPE_ETHERNET)
-- 
cgit 


From d8bf0c01642275c7dca1e5d02c34e4199c200b1f Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Thu, 23 Jul 2020 14:28:08 +0200
Subject: batman-adv: Fix own OGM check in aggregated OGMs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The own OGM check is currently misplaced and can lead to the following
issues:

For one thing we might receive an aggregated OGM from a neighbor node
which has our own OGM in the first place. We would then not only skip
our own OGM but erroneously also any other, following OGM in the
aggregate.

For another, we might receive an OGM aggregate which has our own OGM in
a place other then the first one. Then we would wrongly not skip this
OGM, leading to populating the orginator and gateway table with ourself.

Fixes: 9323158ef9f4 ("batman-adv: OGMv2 - implement originators logic")
Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_v_ogm.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 0f8495b9eeb1..717fe657561d 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -881,6 +881,12 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
 		   ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl,
 		   ogm_packet->version, ntohs(ogm_packet->tvlv_len));
 
+	if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) {
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+			   "Drop packet: originator packet from ourself\n");
+		return;
+	}
+
 	/* If the throughput metric is 0, immediately drop the packet. No need
 	 * to create orig_node / neigh_node for an unusable route.
 	 */
@@ -1008,11 +1014,6 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
 	if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
 		goto free_skb;
 
-	ogm_packet = (struct batadv_ogm2_packet *)skb->data;
-
-	if (batadv_is_my_mac(bat_priv, ogm_packet->orig))
-		goto free_skb;
-
 	batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
 	batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
 			   skb->len + ETH_HLEN);
-- 
cgit 


From 279e89b2281af3b1a9f04906e157992c19c9f163 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@haltian.com>
Date: Tue, 18 Aug 2020 17:46:10 +0300
Subject: batman-adv: bla: use netif_rx_ni when not in interrupt context

batadv_bla_send_claim() gets called from worker thread context through
batadv_bla_periodic_work(), thus netif_rx_ni needs to be used in that
case. This fixes "NOHZ: local_softirq_pending 08" log messages seen
when batman-adv is enabled.

Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code")
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@haltian.com>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bridge_loop_avoidance.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 91a04ca373dc..8500f56cbd10 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -437,7 +437,10 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
 	batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
 			   skb->len + ETH_HLEN);
 
-	netif_rx(skb);
+	if (in_interrupt())
+		netif_rx(skb);
+	else
+		netif_rx_ni(skb);
 out:
 	if (primary_if)
 		batadv_hardif_put(primary_if);
-- 
cgit 


From cc5453a5b7e90c39f713091a7ebc53c1f87d1700 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 18 Aug 2020 16:15:58 +0200
Subject: netfilter: conntrack: allow sctp hearbeat after connection re-use

If an sctp connection gets re-used, heartbeats are flagged as invalid
because their vtag doesn't match.

Handle this in a similar way as TCP conntrack when it suspects that the
endpoints and conntrack are out-of-sync.

When a HEARTBEAT request fails its vtag validation, flag this in the
conntrack state and accept the packet.

When a HEARTBEAT_ACK is received with an invalid vtag in the reverse
direction after we allowed such a HEARTBEAT through, assume we are
out-of-sync and re-set the vtag info.

v2: remove left-over snippet from an older incarnation that moved
    new_state/old_state assignments, thats not needed so keep that
    as-is.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_sctp.c | 39 +++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 4f897b14b606..810cca24b399 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -62,6 +62,8 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
 	[SCTP_CONNTRACK_HEARTBEAT_ACKED]	= 210 SECS,
 };
 
+#define	SCTP_FLAG_HEARTBEAT_VTAG_FAILED	1
+
 #define sNO SCTP_CONNTRACK_NONE
 #define	sCL SCTP_CONNTRACK_CLOSED
 #define	sCW SCTP_CONNTRACK_COOKIE_WAIT
@@ -369,6 +371,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
 	u_int32_t offset, count;
 	unsigned int *timeouts;
 	unsigned long map[256 / sizeof(unsigned long)] = { 0 };
+	bool ignore = false;
 
 	if (sctp_error(skb, dataoff, state))
 		return -NF_ACCEPT;
@@ -427,15 +430,39 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
 			/* Sec 8.5.1 (D) */
 			if (sh->vtag != ct->proto.sctp.vtag[dir])
 				goto out_unlock;
-		} else if (sch->type == SCTP_CID_HEARTBEAT ||
-			   sch->type == SCTP_CID_HEARTBEAT_ACK) {
+		} else if (sch->type == SCTP_CID_HEARTBEAT) {
+			if (ct->proto.sctp.vtag[dir] == 0) {
+				pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir);
+				ct->proto.sctp.vtag[dir] = sh->vtag;
+			} else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
+				if (test_bit(SCTP_CID_DATA, map) || ignore)
+					goto out_unlock;
+
+				ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+				ct->proto.sctp.last_dir = dir;
+				ignore = true;
+				continue;
+			} else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+				ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+			}
+		} else if (sch->type == SCTP_CID_HEARTBEAT_ACK) {
 			if (ct->proto.sctp.vtag[dir] == 0) {
 				pr_debug("Setting vtag %x for dir %d\n",
 					 sh->vtag, dir);
 				ct->proto.sctp.vtag[dir] = sh->vtag;
 			} else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
-				pr_debug("Verification tag check failed\n");
-				goto out_unlock;
+				if (test_bit(SCTP_CID_DATA, map) || ignore)
+					goto out_unlock;
+
+				if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 ||
+				    ct->proto.sctp.last_dir == dir)
+					goto out_unlock;
+
+				ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+				ct->proto.sctp.vtag[dir] = sh->vtag;
+				ct->proto.sctp.vtag[!dir] = 0;
+			} else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+				ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
 			}
 		}
 
@@ -470,6 +497,10 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
 	}
 	spin_unlock_bh(&ct->lock);
 
+	/* allow but do not refresh timeout */
+	if (ignore)
+		return NF_ACCEPT;
+
 	timeouts = nf_ct_timeout_lookup(ct);
 	if (!timeouts)
 		timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts;
-- 
cgit 


From fce2ff728f95b8894db14f51c9274dc56c37616f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 5 Aug 2020 15:35:18 +0200
Subject: nl80211: fix NL80211_ATTR_HE_6GHZ_CAPABILITY usage

In nl80211_set_station(), we check NL80211_ATTR_HE_6GHZ_CAPABILITY
and then use NL80211_ATTR_HE_CAPABILITY, which is clearly wrong.
Fix this to use NL80211_ATTR_HE_6GHZ_CAPABILITY as well.

Cc: stable@vger.kernel.org
Fixes: 43e64bf301fd ("cfg80211: handle 6 GHz capability of new station")
Link: https://lore.kernel.org/r/20200805153516.310cef625955.I0abc04dc8abb2c7c005c88ef8fa2d0e3c9fb95c4@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c04fc6cf6583..19dc0ee807f6 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6011,7 +6011,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 
 	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
 		params.he_6ghz_capa =
-			nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
 
 	if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
 		params.airtime_weight =
-- 
cgit 


From 68528d937dcd675e79973061c1a314db598162d1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 20 Aug 2020 14:12:33 +0100
Subject: rxrpc: Keep the ACK serial in a var in rxrpc_input_ack()

Keep the ACK serial number in a variable in rxrpc_input_ack() as it's used
frequently.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/input.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 767579328a06..a7699e56eac8 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -843,7 +843,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 		struct rxrpc_ackinfo info;
 		u8 acks[RXRPC_MAXACKS];
 	} buf;
-	rxrpc_serial_t acked_serial;
+	rxrpc_serial_t ack_serial, acked_serial;
 	rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt;
 	int nr_acks, offset, ioffset;
 
@@ -856,6 +856,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 	}
 	offset += sizeof(buf.ack);
 
+	ack_serial = sp->hdr.serial;
 	acked_serial = ntohl(buf.ack.serial);
 	first_soft_ack = ntohl(buf.ack.firstPacket);
 	prev_pkt = ntohl(buf.ack.previousPacket);
@@ -864,31 +865,31 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 	summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
 			      buf.ack.reason : RXRPC_ACK__INVALID);
 
-	trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial,
+	trace_rxrpc_rx_ack(call, ack_serial, acked_serial,
 			   first_soft_ack, prev_pkt,
 			   summary.ack_reason, nr_acks);
 
 	if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
 		rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
-					  sp->hdr.serial);
+					  ack_serial);
 	if (buf.ack.reason == RXRPC_ACK_REQUESTED)
 		rxrpc_input_requested_ack(call, skb->tstamp, acked_serial,
-					  sp->hdr.serial);
+					  ack_serial);
 
 	if (buf.ack.reason == RXRPC_ACK_PING) {
-		_proto("Rx ACK %%%u PING Request", sp->hdr.serial);
+		_proto("Rx ACK %%%u PING Request", ack_serial);
 		rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
-				  sp->hdr.serial, true, true,
+				  ack_serial, true, true,
 				  rxrpc_propose_ack_respond_to_ping);
 	} else if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
 		rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED,
-				  sp->hdr.serial, true, true,
+				  ack_serial, true, true,
 				  rxrpc_propose_ack_respond_to_ack);
 	}
 
 	/* Discard any out-of-order or duplicate ACKs (outside lock). */
 	if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
-		trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
+		trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial,
 					   first_soft_ack, call->ackr_first_seq,
 					   prev_pkt, call->ackr_prev_seq);
 		return;
@@ -904,7 +905,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 
 	/* Discard any out-of-order or duplicate ACKs (inside lock). */
 	if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
-		trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
+		trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial,
 					   first_soft_ack, call->ackr_first_seq,
 					   prev_pkt, call->ackr_prev_seq);
 		goto out;
@@ -964,7 +965,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 	    RXRPC_TX_ANNO_LAST &&
 	    summary.nr_acks == call->tx_top - hard_ack &&
 	    rxrpc_is_client_call(call))
-		rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial,
+		rxrpc_propose_ACK(call, RXRPC_ACK_PING, ack_serial,
 				  false, true,
 				  rxrpc_propose_ack_ping_for_lost_reply);
 
-- 
cgit 


From 4700c4d80b7bb171f6996016ef121e1508860b42 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 19 Aug 2020 23:29:16 +0100
Subject: rxrpc: Fix loss of RTT samples due to interposed ACK

The Rx protocol has a mechanism to help generate RTT samples that works by
a client transmitting a REQUESTED-type ACK when it receives a DATA packet
that has the REQUEST_ACK flag set.

The peer, however, may interpose other ACKs before transmitting the
REQUESTED-ACK, as can be seen in the following trace excerpt:

 rxrpc_tx_data: c=00000044 DATA d0b5ece8:00000001 00000001 q=00000001 fl=07
 rxrpc_rx_ack: c=00000044 00000001 PNG r=00000000 f=00000002 p=00000000 n=0
 rxrpc_rx_ack: c=00000044 00000002 REQ r=00000001 f=00000002 p=00000001 n=0
 ...

DATA packet 1 (q=xx) has REQUEST_ACK set (bit 1 of fl=xx).  The incoming
ping (labelled PNG) hard-acks the request DATA packet (f=xx exceeds the
sequence number of the DATA packet), causing it to be discarded from the Tx
ring.  The ACK that was requested (labelled REQ, r=xx references the serial
of the DATA packet) comes after the ping, but the sk_buff holding the
timestamp has gone and the RTT sample is lost.

This is particularly noticeable on RPC calls used to probe the service
offered by the peer.  A lot of peers end up with an unknown RTT because we
only ever sent a single RPC.  This confuses the server rotation algorithm.

Fix this by caching the information about the outgoing packet in RTT
calculations in the rxrpc_call struct rather than looking in the Tx ring.

A four-deep buffer is maintained and both REQUEST_ACK-flagged DATA and
PING-ACK transmissions are recorded in there.  When the appropriate
response ACK is received, the buffer is checked for a match and, if found,
an RTT sample is recorded.

If a received ACK refers to a packet with a later serial number than an
entry in the cache, that entry is presumed lost and the entry is made
available to record a new transmission.

ACKs types other than REQUESTED-type and PING-type cause any matching
sample to be cancelled as they don't necessarily represent a useful
measurement.

If there's no space in the buffer on ping/data transmission, the sample
base is discarded.

Fixes: 50235c4b5a2f ("rxrpc: Obtain RTT data by requesting ACKs on DATA packets")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h |  13 +++---
 net/rxrpc/call_object.c |   1 +
 net/rxrpc/input.c       | 104 ++++++++++++++++++++++++++++--------------------
 net/rxrpc/output.c      |  82 ++++++++++++++++++++++++++++----------
 net/rxrpc/rtt.c         |   3 +-
 5 files changed, 132 insertions(+), 71 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 6d29a3603a3e..884cff7bb169 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -488,7 +488,6 @@ enum rxrpc_call_flag {
 	RXRPC_CALL_RX_LAST,		/* Received the last packet (at rxtx_top) */
 	RXRPC_CALL_TX_LAST,		/* Last packet in Tx buffer (at rxtx_top) */
 	RXRPC_CALL_SEND_PING,		/* A ping will need to be sent */
-	RXRPC_CALL_PINGING,		/* Ping in process */
 	RXRPC_CALL_RETRANS_TIMEOUT,	/* Retransmission due to timeout occurred */
 	RXRPC_CALL_BEGAN_RX_TIMER,	/* We began the expect_rx_by timer */
 	RXRPC_CALL_RX_HEARD,		/* The peer responded at least once to this call */
@@ -673,9 +672,13 @@ struct rxrpc_call {
 	rxrpc_seq_t		ackr_consumed;	/* Highest packet shown consumed */
 	rxrpc_seq_t		ackr_seen;	/* Highest packet shown seen */
 
-	/* ping management */
-	rxrpc_serial_t		ping_serial;	/* Last ping sent */
-	ktime_t			ping_time;	/* Time last ping sent */
+	/* RTT management */
+	rxrpc_serial_t		rtt_serial[4];	/* Serial number of DATA or PING sent */
+	ktime_t			rtt_sent_at[4];	/* Time packet sent */
+	unsigned long		rtt_avail;	/* Mask of available slots in bits 0-3,
+						 * Mask of pending samples in 8-11 */
+#define RXRPC_CALL_RTT_AVAIL_MASK	0xf
+#define RXRPC_CALL_RTT_PEND_SHIFT	8
 
 	/* transmission-phase ACK management */
 	ktime_t			acks_latest_ts;	/* Timestamp of latest ACK received */
@@ -1037,7 +1040,7 @@ static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call,
 /*
  * rtt.c
  */
-void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
+void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int,
 			rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
 unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool);
 void rxrpc_peer_init_rtt(struct rxrpc_peer *);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 38a46167523f..a40fae013942 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -153,6 +153,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
 	call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1;
 
 	call->rxnet = rxnet;
+	call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK;
 	atomic_inc(&rxnet->nr_calls);
 	return call;
 
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index a7699e56eac8..19ddfc9807e8 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -608,36 +608,57 @@ unlock:
 }
 
 /*
- * Process a requested ACK.
+ * See if there's a cached RTT probe to complete.
  */
-static void rxrpc_input_requested_ack(struct rxrpc_call *call,
-				      ktime_t resp_time,
-				      rxrpc_serial_t orig_serial,
-				      rxrpc_serial_t ack_serial)
+static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
+				     ktime_t resp_time,
+				     rxrpc_serial_t acked_serial,
+				     rxrpc_serial_t ack_serial,
+				     enum rxrpc_rtt_rx_trace type)
 {
-	struct rxrpc_skb_priv *sp;
-	struct sk_buff *skb;
+	rxrpc_serial_t orig_serial;
+	unsigned long avail;
 	ktime_t sent_at;
-	int ix;
+	bool matched = false;
+	int i;
 
-	for (ix = 0; ix < RXRPC_RXTX_BUFF_SIZE; ix++) {
-		skb = call->rxtx_buffer[ix];
-		if (!skb)
-			continue;
+	avail = READ_ONCE(call->rtt_avail);
+	smp_rmb(); /* Read avail bits before accessing data. */
 
-		sent_at = skb->tstamp;
-		smp_rmb(); /* Read timestamp before serial. */
-		sp = rxrpc_skb(skb);
-		if (sp->hdr.serial != orig_serial)
+	for (i = 0; i < ARRAY_SIZE(call->rtt_serial); i++) {
+		if (!test_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &avail))
 			continue;
-		goto found;
-	}
 
-	return;
+		sent_at = call->rtt_sent_at[i];
+		orig_serial = call->rtt_serial[i];
+
+		if (orig_serial == acked_serial) {
+			clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+			smp_mb(); /* Read data before setting avail bit */
+			set_bit(i, &call->rtt_avail);
+			if (type != rxrpc_rtt_rx_cancel)
+				rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial,
+						   sent_at, resp_time);
+			else
+				trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_cancel, i,
+						   orig_serial, acked_serial, 0, 0);
+			matched = true;
+		}
+
+		/* If a later serial is being acked, then mark this slot as
+		 * being available.
+		 */
+		if (after(acked_serial, orig_serial)) {
+			trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i,
+					   orig_serial, acked_serial, 0, 0);
+			clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+			smp_wmb();
+			set_bit(i, &call->rtt_avail);
+		}
+	}
 
-found:
-	rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_requested_ack,
-			   orig_serial, ack_serial, sent_at, resp_time);
+	if (!matched)
+		trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0);
 }
 
 /*
@@ -682,27 +703,11 @@ static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call)
  */
 static void rxrpc_input_ping_response(struct rxrpc_call *call,
 				      ktime_t resp_time,
-				      rxrpc_serial_t orig_serial,
+				      rxrpc_serial_t acked_serial,
 				      rxrpc_serial_t ack_serial)
 {
-	rxrpc_serial_t ping_serial;
-	ktime_t ping_time;
-
-	ping_time = call->ping_time;
-	smp_rmb();
-	ping_serial = READ_ONCE(call->ping_serial);
-
-	if (orig_serial == call->acks_lost_ping)
+	if (acked_serial == call->acks_lost_ping)
 		rxrpc_input_check_for_lost_ack(call);
-
-	if (before(orig_serial, ping_serial) ||
-	    !test_and_clear_bit(RXRPC_CALL_PINGING, &call->flags))
-		return;
-	if (after(orig_serial, ping_serial))
-		return;
-
-	rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_ping_response,
-			   orig_serial, ack_serial, ping_time, resp_time);
 }
 
 /*
@@ -869,12 +874,23 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 			   first_soft_ack, prev_pkt,
 			   summary.ack_reason, nr_acks);
 
-	if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
+	switch (buf.ack.reason) {
+	case RXRPC_ACK_PING_RESPONSE:
 		rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
 					  ack_serial);
-	if (buf.ack.reason == RXRPC_ACK_REQUESTED)
-		rxrpc_input_requested_ack(call, skb->tstamp, acked_serial,
-					  ack_serial);
+		rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
+					 rxrpc_rtt_rx_ping_response);
+		break;
+	case RXRPC_ACK_REQUESTED:
+		rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
+					 rxrpc_rtt_rx_requested_ack);
+		break;
+	default:
+		if (acked_serial != 0)
+			rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
+						 rxrpc_rtt_rx_cancel);
+		break;
+	}
 
 	if (buf.ack.reason == RXRPC_ACK_PING) {
 		_proto("Rx ACK %%%u PING Request", ack_serial);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 1ba43c3df4ad..3cfff7922ba8 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -123,6 +123,49 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
 	return top - hard_ack + 3;
 }
 
+/*
+ * Record the beginning of an RTT probe.
+ */
+static int rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial,
+				 enum rxrpc_rtt_tx_trace why)
+{
+	unsigned long avail = call->rtt_avail;
+	int rtt_slot = 9;
+
+	if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK))
+		goto no_slot;
+
+	rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK);
+	if (!test_and_clear_bit(rtt_slot, &call->rtt_avail))
+		goto no_slot;
+
+	call->rtt_serial[rtt_slot] = serial;
+	call->rtt_sent_at[rtt_slot] = ktime_get_real();
+	smp_wmb(); /* Write data before avail bit */
+	set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+
+	trace_rxrpc_rtt_tx(call, why, rtt_slot, serial);
+	return rtt_slot;
+
+no_slot:
+	trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial);
+	return -1;
+}
+
+/*
+ * Cancel an RTT probe.
+ */
+static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call,
+				   rxrpc_serial_t serial, int rtt_slot)
+{
+	if (rtt_slot != -1) {
+		clear_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+		smp_wmb(); /* Clear pending bit before setting slot */
+		set_bit(rtt_slot, &call->rtt_avail);
+		trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_cancel, rtt_slot, serial);
+	}
+}
+
 /*
  * Send an ACK call packet.
  */
@@ -136,7 +179,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
 	rxrpc_serial_t serial;
 	rxrpc_seq_t hard_ack, top;
 	size_t len, n;
-	int ret;
+	int ret, rtt_slot = -1;
 	u8 reason;
 
 	if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
@@ -196,18 +239,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
 	if (_serial)
 		*_serial = serial;
 
-	if (ping) {
-		call->ping_serial = serial;
-		smp_wmb();
-		/* We need to stick a time in before we send the packet in case
-		 * the reply gets back before kernel_sendmsg() completes - but
-		 * asking UDP to send the packet can take a relatively long
-		 * time.
-		 */
-		call->ping_time = ktime_get_real();
-		set_bit(RXRPC_CALL_PINGING, &call->flags);
-		trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial);
-	}
+	if (ping)
+		rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping);
 
 	ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
 	conn->params.peer->last_tx_at = ktime_get_seconds();
@@ -221,8 +254,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
 
 	if (call->state < RXRPC_CALL_COMPLETE) {
 		if (ret < 0) {
-			if (ping)
-				clear_bit(RXRPC_CALL_PINGING, &call->flags);
+			rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
 			rxrpc_propose_ACK(call, pkt->ack.reason,
 					  ntohl(pkt->ack.serial),
 					  false, true,
@@ -321,7 +353,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	struct kvec iov[2];
 	rxrpc_serial_t serial;
 	size_t len;
-	int ret;
+	int ret, rtt_slot = -1;
 
 	_enter(",{%d}", skb->len);
 
@@ -397,6 +429,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	sp->hdr.serial = serial;
 	smp_wmb(); /* Set serial before timestamp */
 	skb->tstamp = ktime_get_real();
+	if (whdr.flags & RXRPC_REQUEST_ACK)
+		rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data);
 
 	/* send the packet by UDP
 	 * - returns -EMSGSIZE if UDP would have to fragment the packet
@@ -408,12 +442,15 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	conn->params.peer->last_tx_at = ktime_get_seconds();
 
 	up_read(&conn->params.local->defrag_sem);
-	if (ret < 0)
+	if (ret < 0) {
+		rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
 		trace_rxrpc_tx_fail(call->debug_id, serial, ret,
 				    rxrpc_tx_point_call_data_nofrag);
-	else
+	} else {
 		trace_rxrpc_tx_packet(call->debug_id, &whdr,
 				      rxrpc_tx_point_call_data_nofrag);
+	}
+
 	rxrpc_tx_backoff(call, ret);
 	if (ret == -EMSGSIZE)
 		goto send_fragmentable;
@@ -422,7 +459,6 @@ done:
 	if (ret >= 0) {
 		if (whdr.flags & RXRPC_REQUEST_ACK) {
 			call->peer->rtt_last_req = skb->tstamp;
-			trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial);
 			if (call->peer->rtt_count > 1) {
 				unsigned long nowj = jiffies, ack_lost_at;
 
@@ -469,6 +505,8 @@ send_fragmentable:
 	sp->hdr.serial = serial;
 	smp_wmb(); /* Set serial before timestamp */
 	skb->tstamp = ktime_get_real();
+	if (whdr.flags & RXRPC_REQUEST_ACK)
+		rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data);
 
 	switch (conn->params.local->srx.transport.family) {
 	case AF_INET6:
@@ -487,12 +525,14 @@ send_fragmentable:
 		BUG();
 	}
 
-	if (ret < 0)
+	if (ret < 0) {
+		rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
 		trace_rxrpc_tx_fail(call->debug_id, serial, ret,
 				    rxrpc_tx_point_call_data_frag);
-	else
+	} else {
 		trace_rxrpc_tx_packet(call->debug_id, &whdr,
 				      rxrpc_tx_point_call_data_frag);
+	}
 	rxrpc_tx_backoff(call, ret);
 
 	up_write(&conn->params.local->defrag_sem);
diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c
index 928d8b34a3ee..1221b0637a7e 100644
--- a/net/rxrpc/rtt.c
+++ b/net/rxrpc/rtt.c
@@ -146,6 +146,7 @@ static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us)
  * exclusive access to the peer RTT data.
  */
 void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+			int rtt_slot,
 			rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
 			ktime_t send_time, ktime_t resp_time)
 {
@@ -162,7 +163,7 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
 		peer->rtt_count++;
 	spin_unlock(&peer->rtt_input_lock);
 
-	trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial,
+	trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial,
 			   peer->srtt_us >> 3, peer->rto_j);
 }
 
-- 
cgit 


From 1d4adfaf65746203861c72d9d78de349eb97d528 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 20 Aug 2020 15:13:00 +0100
Subject: rxrpc: Make rxrpc_kernel_get_srtt() indicate validity

Fix rxrpc_kernel_get_srtt() to indicate the validity of the returned
smoothed RTT.  If we haven't had any valid samples yet, the SRTT isn't
useful.

Fixes: c410bf01933e ("rxrpc: Fix the excessive initial retransmission timeout")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/peer_object.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index ca29976bb193..68396d052052 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -502,11 +502,21 @@ EXPORT_SYMBOL(rxrpc_kernel_get_peer);
  * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT
  * @sock: The socket on which the call is in progress.
  * @call: The call to query
+ * @_srtt: Where to store the SRTT value.
  *
- * Get the call's peer smoothed RTT.
+ * Get the call's peer smoothed RTT in uS.
  */
-u32 rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call)
+bool rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call,
+			   u32 *_srtt)
 {
-	return call->peer->srtt_us >> 3;
+	struct rxrpc_peer *peer = call->peer;
+
+	if (peer->rtt_count == 0) {
+		*_srtt = 1000000; /* 1S */
+		return false;
+	}
+
+	*_srtt = call->peer->srtt_us >> 3;
+	return true;
 }
 EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
-- 
cgit 


From 226a88de473e475cb9f993682a1c7d0c2b451ad8 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 19 Aug 2020 23:59:14 +0200
Subject: netfilter: nft_set_rbtree: Handle outcomes of tree rotations in
 overlap detection

Checks for partial overlaps on insertion assume that end elements
are always descendant nodes of their corresponding start, because
they are inserted later. However, this is not the case if a
previous delete operation caused a tree rotation as part of
rebalancing.

Taking the issue reported by Andreas Fischer as an example, if we
omit delete operations, the existing procedure works because,
equivalently, we are inserting a start item with value 40 in the
this region of the red-black tree with single-sized intervals:

                                  overlap flag
                   10 (start)
                  /  \            false
                      20 (start)
                     /  \         false
                         30 (start)
                        /  \      false
                            60 (start)
                           /  \   false
                         50 (end)
                        /  \      false
                      20 (end)
                     /  \         false
                         40 (start)

if we now delete interval 30 - 30, the tree can be rearranged in
a way similar to this (note the rotation involving 50 - 50):

                                  overlap flag
                   10 (start)
                  /  \            false
                      20 (start)
                     /  \         false
                         25 (start)
                        /  \      false
                            70 (start)
                           /  \   false
                         50 (end)
                        /  \      true (from rule a1.)
                      50 (start)
                     /  \         true
                   40 (start)

and we traverse interval 50 - 50 from the opposite direction
compared to what was expected.

To deal with those cases, add a start-before-start rule, b4.,
that covers traversal of existing intervals from the right.

We now need to restrict start-after-end rule b3. to cases
where there are no occurring nodes between existing start and
end elements, because addition of rule b4. isn't sufficient to
ensure that the pre-existing end element we encounter while
descending the tree corresponds to a start element of an
interval that we already traversed entirely.

Different types of overlap detection on trees with rotations
resulting from re-balancing will be covered by nft test case
sets/0044interval_overlap_1.

Reported-by: Andreas Fischer <netfilter@d9c.eu>
Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1449
Cc: <stable@vger.kernel.org> # 5.6.x
Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_rbtree.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 4b2834fd17b2..27668f4e44ea 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -238,21 +238,27 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 	 *
 	 * b1. _ _ __>|  !_ _ __|  (insert end before existing start)
 	 * b2. _ _ ___|  !_ _ _>|  (insert end after existing start)
-	 * b3. _ _ ___! >|_ _ __|  (insert start after existing end)
+	 * b3. _ _ ___! >|_ _ __|  (insert start after existing end, as a leaf)
+	 *            '--' no nodes falling in this range
+	 * b4.          >|_ _   !  (insert start before existing start)
 	 *
 	 * Case a3. resolves to b3.:
 	 * - if the inserted start element is the leftmost, because the '0'
 	 *   element in the tree serves as end element
-	 * - otherwise, if an existing end is found. Note that end elements are
-	 *   always inserted after corresponding start elements.
+	 * - otherwise, if an existing end is found immediately to the left. If
+	 *   there are existing nodes in between, we need to further descend the
+	 *   tree before we can conclude the new start isn't causing an overlap
+	 *
+	 * or to b4., which, preceded by a3., means we already traversed one or
+	 * more existing intervals entirely, from the right.
 	 *
 	 * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
 	 * in that order.
 	 *
 	 * The flag is also cleared in two special cases:
 	 *
-	 * b4. |__ _ _!|<_ _ _   (insert start right before existing end)
-	 * b5. |__ _ >|!__ _ _   (insert end right after existing start)
+	 * b5. |__ _ _!|<_ _ _   (insert start right before existing end)
+	 * b6. |__ _ >|!__ _ _   (insert end right after existing start)
 	 *
 	 * which always happen as last step and imply that no further
 	 * overlapping is possible.
@@ -272,7 +278,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			if (nft_rbtree_interval_start(new)) {
 				if (nft_rbtree_interval_end(rbe) &&
 				    nft_set_elem_active(&rbe->ext, genmask) &&
-				    !nft_set_elem_expired(&rbe->ext))
+				    !nft_set_elem_expired(&rbe->ext) && !*p)
 					overlap = false;
 			} else {
 				overlap = nft_rbtree_interval_end(rbe) &&
@@ -288,10 +294,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 					  nft_set_elem_active(&rbe->ext,
 							      genmask) &&
 					  !nft_set_elem_expired(&rbe->ext);
-			} else if (nft_rbtree_interval_end(rbe) &&
-				   nft_set_elem_active(&rbe->ext, genmask) &&
+			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
 				   !nft_set_elem_expired(&rbe->ext)) {
-				overlap = true;
+				overlap = nft_rbtree_interval_end(rbe);
 			}
 		} else {
 			if (nft_rbtree_interval_end(rbe) &&
-- 
cgit 


From 0726763043dc10dd4c12481f050b1a5ef8f15410 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 19 Aug 2020 23:59:15 +0200
Subject: netfilter: nft_set_rbtree: Detect partial overlap with start endpoint
 match

Getting creative with nft and omitting the interval_overlap()
check from the set_overlap() function, without omitting
set_overlap() altogether, led to the observation of a partial
overlap that wasn't detected, and would actually result in
replacement of the end element of an existing interval.

This is due to the fact that we'll return -EEXIST on a matching,
pre-existing start element, instead of -ENOTEMPTY, and the error
is cleared by API if NLM_F_EXCL is not given. At this point, we
can insert a matching start, and duplicate the end element as long
as we don't end up into other intervals.

For instance, inserting interval 0 - 2 with an existing 0 - 3
interval would result in a single 0 - 2 interval, and a dangling
'3' end element. This is because nft will proceed after inserting
the '0' start element as no error is reported, and no further
conflicting intervals are detected on insertion of the end element.

This needs a different approach as it's a local condition that can
be detected by looking for duplicate ends coming from left and
right, separately. Track those and directly report -ENOTEMPTY on
duplicated end elements for a matching start.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_rbtree.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 27668f4e44ea..217ab3644c25 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -218,11 +218,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			       struct nft_rbtree_elem *new,
 			       struct nft_set_ext **ext)
 {
+	bool overlap = false, dup_end_left = false, dup_end_right = false;
 	struct nft_rbtree *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_next(net);
 	struct nft_rbtree_elem *rbe;
 	struct rb_node *parent, **p;
-	bool overlap = false;
 	int d;
 
 	/* Detect overlaps as we descend the tree. Set the flag in these cases:
@@ -262,6 +262,20 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 	 *
 	 * which always happen as last step and imply that no further
 	 * overlapping is possible.
+	 *
+	 * Another special case comes from the fact that start elements matching
+	 * an already existing start element are allowed: insertion is not
+	 * performed but we return -EEXIST in that case, and the error will be
+	 * cleared by the caller if NLM_F_EXCL is not present in the request.
+	 * This way, request for insertion of an exact overlap isn't reported as
+	 * error to userspace if not desired.
+	 *
+	 * However, if the existing start matches a pre-existing start, but the
+	 * end element doesn't match the corresponding pre-existing end element,
+	 * we need to report a partial overlap. This is a local condition that
+	 * can be noticed without need for a tracking flag, by checking for a
+	 * local duplicated end for a corresponding start, from left and right,
+	 * separately.
 	 */
 
 	parent = NULL;
@@ -281,19 +295,35 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 				    !nft_set_elem_expired(&rbe->ext) && !*p)
 					overlap = false;
 			} else {
+				if (dup_end_left && !*p)
+					return -ENOTEMPTY;
+
 				overlap = nft_rbtree_interval_end(rbe) &&
 					  nft_set_elem_active(&rbe->ext,
 							      genmask) &&
 					  !nft_set_elem_expired(&rbe->ext);
+
+				if (overlap) {
+					dup_end_right = true;
+					continue;
+				}
 			}
 		} else if (d > 0) {
 			p = &parent->rb_right;
 
 			if (nft_rbtree_interval_end(new)) {
+				if (dup_end_right && !*p)
+					return -ENOTEMPTY;
+
 				overlap = nft_rbtree_interval_end(rbe) &&
 					  nft_set_elem_active(&rbe->ext,
 							      genmask) &&
 					  !nft_set_elem_expired(&rbe->ext);
+
+				if (overlap) {
+					dup_end_left = true;
+					continue;
+				}
 			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
 				   !nft_set_elem_expired(&rbe->ext)) {
 				overlap = nft_rbtree_interval_end(rbe);
@@ -321,6 +351,8 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 				p = &parent->rb_left;
 			}
 		}
+
+		dup_end_left = dup_end_right = false;
 	}
 
 	if (overlap)
-- 
cgit 


From 6f03bf43ee05b31d3822def2a80f11b3591c55b3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 20 Aug 2020 14:12:54 +0200
Subject: netfilter: nf_tables: add NFTA_SET_USERDATA if not null

Kernel sends an empty NFTA_SET_USERDATA attribute with no value if
userspace adds a set with no NFTA_SET_USERDATA attribute.

Fixes: e6d8ecac9e68 ("netfilter: nf_tables: Add new attributes into nft_set to store user data.")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fd814e514f94..71e501c5ad21 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3770,7 +3770,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 			goto nla_put_failure;
 	}
 
-	if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
+	if (set->udata &&
+	    nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
 		goto nla_put_failure;
 
 	nest = nla_nest_start_noflag(skb, NFTA_SET_DESC);
-- 
cgit 


From 1e105e6afa6c3d32bfb52c00ffa393894a525c27 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 20 Aug 2020 21:05:50 +0200
Subject: netfilter: nf_tables: fix destination register zeroing

Following bug was reported via irc:
nft list ruleset
   set knock_candidates_ipv4 {
      type ipv4_addr . inet_service
      size 65535
      elements = { 127.0.0.1 . 123,
                   127.0.0.1 . 123 }
      }
 ..
   udp dport 123 add @knock_candidates_ipv4 { ip saddr . 123 }
   udp dport 123 add @knock_candidates_ipv4 { ip saddr . udp dport }

It should not have been possible to add a duplicate set entry.

After some debugging it turned out that the problem is the immediate
value (123) in the second-to-last rule.

Concatenations use 32bit registers, i.e. the elements are 8 bytes each,
not 6 and it turns out the kernel inserted

inet firewall @knock_candidates_ipv4
        element 0100007f ffff7b00  : 0 [end]
        element 0100007f 00007b00  : 0 [end]

Note the non-zero upper bits of the first element.  It turns out that
nft_immediate doesn't zero the destination register, but this is needed
when the length isn't a multiple of 4.

Furthermore, the zeroing in nft_payload is broken.  We can't use
[len / 4] = 0 -- if len is a multiple of 4, index is off by one.

Skip zeroing in this case and use a conditional instead of (len -1) / 4.

Fixes: 49499c3e6e18 ("netfilter: nf_tables: switch registers to 32 bit addressing")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_payload.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index ed7cb9f747f6..7a2e59638499 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -87,7 +87,9 @@ void nft_payload_eval(const struct nft_expr *expr,
 	u32 *dest = &regs->data[priv->dreg];
 	int offset;
 
-	dest[priv->len / NFT_REG32_SIZE] = 0;
+	if (priv->len % NFT_REG32_SIZE)
+		dest[priv->len / NFT_REG32_SIZE] = 0;
+
 	switch (priv->base) {
 	case NFT_PAYLOAD_LL_HEADER:
 		if (!skb_mac_header_was_set(skb))
-- 
cgit 


From 3106ecb43a05dc3e009779764b9da245a5d082de Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 21 Aug 2020 14:59:38 +0800
Subject: sctp: not disable bh in the whole sctp_get_port_local()

With disabling bh in the whole sctp_get_port_local(), when
snum == 0 and too many ports have been used, the do-while
loop will take the cpu for a long time and cause cpu stuck:

  [ ] watchdog: BUG: soft lockup - CPU#11 stuck for 22s!
  [ ] RIP: 0010:native_queued_spin_lock_slowpath+0x4de/0x940
  [ ] Call Trace:
  [ ]  _raw_spin_lock+0xc1/0xd0
  [ ]  sctp_get_port_local+0x527/0x650 [sctp]
  [ ]  sctp_do_bind+0x208/0x5e0 [sctp]
  [ ]  sctp_autobind+0x165/0x1e0 [sctp]
  [ ]  sctp_connect_new_asoc+0x355/0x480 [sctp]
  [ ]  __sctp_connect+0x360/0xb10 [sctp]

There's no need to disable bh in the whole function of
sctp_get_port_local. So fix this cpu stuck by removing
local_bh_disable() called at the beginning, and using
spin_lock_bh() instead.

The same thing was actually done for inet_csk_get_port() in
Commit ea8add2b1903 ("tcp/dccp: better use of ephemeral
ports in bind()").

Thanks to Marcelo for pointing the buggy code out.

v1->v2:
  - use cond_resched() to yield cpu to other tasks if needed,
    as Eric noticed.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Ying Xu <yinxu@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ec1fba1fbe71..836615f71a7d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -8060,8 +8060,6 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
 
 	pr_debug("%s: begins, snum:%d\n", __func__, snum);
 
-	local_bh_disable();
-
 	if (snum == 0) {
 		/* Search for an available port. */
 		int low, high, remaining, index;
@@ -8079,20 +8077,21 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
 				continue;
 			index = sctp_phashfn(net, rover);
 			head = &sctp_port_hashtable[index];
-			spin_lock(&head->lock);
+			spin_lock_bh(&head->lock);
 			sctp_for_each_hentry(pp, &head->chain)
 				if ((pp->port == rover) &&
 				    net_eq(net, pp->net))
 					goto next;
 			break;
 		next:
-			spin_unlock(&head->lock);
+			spin_unlock_bh(&head->lock);
+			cond_resched();
 		} while (--remaining > 0);
 
 		/* Exhausted local port range during search? */
 		ret = 1;
 		if (remaining <= 0)
-			goto fail;
+			return ret;
 
 		/* OK, here is the one we will use.  HEAD (the port
 		 * hash table list entry) is non-NULL and we hold it's
@@ -8107,7 +8106,7 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
 		 * port iterator, pp being NULL.
 		 */
 		head = &sctp_port_hashtable[sctp_phashfn(net, snum)];
-		spin_lock(&head->lock);
+		spin_lock_bh(&head->lock);
 		sctp_for_each_hentry(pp, &head->chain) {
 			if ((pp->port == snum) && net_eq(pp->net, net))
 				goto pp_found;
@@ -8207,10 +8206,7 @@ success:
 	ret = 0;
 
 fail_unlock:
-	spin_unlock(&head->lock);
-
-fail:
-	local_bh_enable();
+	spin_unlock_bh(&head->lock);
 	return ret;
 }
 
-- 
cgit 


From d3b990b7f327e2afa98006e7666fb8ada8ed8683 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 21 Aug 2020 16:34:52 -0400
Subject: netlabel: fix problems with mapping removal

This patch fixes two main problems seen when removing NetLabel
mappings: memory leaks and potentially extra audit noise.

The memory leaks are caused by not properly free'ing the mapping's
address selector struct when free'ing the entire entry as well as
not properly cleaning up a temporary mapping entry when adding new
address selectors to an existing entry.  This patch fixes both these
problems such that kmemleak reports no NetLabel associated leaks
after running the SELinux test suite.

The potentially extra audit noise was caused by the auditing code in
netlbl_domhsh_remove_entry() being called regardless of the entry's
validity.  If another thread had already marked the entry as invalid,
but not removed/free'd it from the list of mappings, then it was
possible that an additional mapping removal audit record would be
generated.  This patch fixes this by returning early from the removal
function when the entry was previously marked invalid.  This change
also had the side benefit of improving the code by decreasing the
indentation level of large chunk of code by one (accounting for most
of the diffstat).

Fixes: 63c416887437 ("netlabel: Add network address selectors to the NetLabel/LSM domain mapping")
Reported-by: Stephen Smalley <stephen.smalley.work@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlabel/netlabel_domainhash.c | 59 +++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index d07de2c0fbc7..f73a8382c275 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -85,6 +85,7 @@ static void netlbl_domhsh_free_entry(struct rcu_head *entry)
 			kfree(netlbl_domhsh_addr6_entry(iter6));
 		}
 #endif /* IPv6 */
+		kfree(ptr->def.addrsel);
 	}
 	kfree(ptr->domain);
 	kfree(ptr);
@@ -537,6 +538,8 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
 				goto add_return;
 		}
 #endif /* IPv6 */
+		/* cleanup the new entry since we've moved everything over */
+		netlbl_domhsh_free_entry(&entry->rcu);
 	} else
 		ret_val = -EINVAL;
 
@@ -580,6 +583,12 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
 {
 	int ret_val = 0;
 	struct audit_buffer *audit_buf;
+	struct netlbl_af4list *iter4;
+	struct netlbl_domaddr4_map *map4;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct netlbl_af6list *iter6;
+	struct netlbl_domaddr6_map *map6;
+#endif /* IPv6 */
 
 	if (entry == NULL)
 		return -ENOENT;
@@ -597,6 +606,9 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
 		ret_val = -ENOENT;
 	spin_unlock(&netlbl_domhsh_lock);
 
+	if (ret_val)
+		return ret_val;
+
 	audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
 	if (audit_buf != NULL) {
 		audit_log_format(audit_buf,
@@ -606,40 +618,29 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
 		audit_log_end(audit_buf);
 	}
 
-	if (ret_val == 0) {
-		struct netlbl_af4list *iter4;
-		struct netlbl_domaddr4_map *map4;
-#if IS_ENABLED(CONFIG_IPV6)
-		struct netlbl_af6list *iter6;
-		struct netlbl_domaddr6_map *map6;
-#endif /* IPv6 */
-
-		switch (entry->def.type) {
-		case NETLBL_NLTYPE_ADDRSELECT:
-			netlbl_af4list_foreach_rcu(iter4,
-					     &entry->def.addrsel->list4) {
-				map4 = netlbl_domhsh_addr4_entry(iter4);
-				cipso_v4_doi_putdef(map4->def.cipso);
-			}
+	switch (entry->def.type) {
+	case NETLBL_NLTYPE_ADDRSELECT:
+		netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) {
+			map4 = netlbl_domhsh_addr4_entry(iter4);
+			cipso_v4_doi_putdef(map4->def.cipso);
+		}
 #if IS_ENABLED(CONFIG_IPV6)
-			netlbl_af6list_foreach_rcu(iter6,
-					     &entry->def.addrsel->list6) {
-				map6 = netlbl_domhsh_addr6_entry(iter6);
-				calipso_doi_putdef(map6->def.calipso);
-			}
+		netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) {
+			map6 = netlbl_domhsh_addr6_entry(iter6);
+			calipso_doi_putdef(map6->def.calipso);
+		}
 #endif /* IPv6 */
-			break;
-		case NETLBL_NLTYPE_CIPSOV4:
-			cipso_v4_doi_putdef(entry->def.cipso);
-			break;
+		break;
+	case NETLBL_NLTYPE_CIPSOV4:
+		cipso_v4_doi_putdef(entry->def.cipso);
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-		case NETLBL_NLTYPE_CALIPSO:
-			calipso_doi_putdef(entry->def.calipso);
-			break;
+	case NETLBL_NLTYPE_CALIPSO:
+		calipso_doi_putdef(entry->def.calipso);
+		break;
 #endif /* IPv6 */
-		}
-		call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
 	}
+	call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
 
 	return ret_val;
 }
-- 
cgit 


From be769db2f95861cc8c7c8fedcc71a8c39b803b10 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 22 Aug 2020 08:23:29 +1000
Subject: net: Get rid of consume_skb when tracing is off

The function consume_skb is only meaningful when tracing is enabled.
This patch makes it conditional on CONFIG_TRACEPOINTS.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e18184ffa9c3..6faf73d6a0f7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -820,6 +820,7 @@ void skb_tx_error(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(skb_tx_error);
 
+#ifdef CONFIG_TRACEPOINTS
 /**
  *	consume_skb - free an skbuff
  *	@skb: buffer to free
@@ -837,6 +838,7 @@ void consume_skb(struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(consume_skb);
+#endif
 
 /**
  *	consume_stateless_skb - free an skbuff, assuming it is stateless
-- 
cgit 


From e104684108fc8b846dbca889f28257a65a956874 Mon Sep 17 00:00:00 2001
From: Tong Zhang <ztong0001@gmail.com>
Date: Mon, 24 Aug 2020 18:08:06 -0400
Subject: net: caif: fix error code handling

cfpkt_peek_head return 0 and 1, caller is checking error using <0

Signed-off-by: Tong Zhang <ztong0001@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/cfrfml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index ce2767e9cec6..7b0af33bdb97 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -116,7 +116,7 @@ static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt)
 	if (segmented) {
 		if (rfml->incomplete_frm == NULL) {
 			/* Initial Segment */
-			if (cfpkt_peek_head(pkt, rfml->seghead, 6) < 0)
+			if (cfpkt_peek_head(pkt, rfml->seghead, 6) != 0)
 				goto out;
 
 			rfml->pdu_size = get_unaligned_le16(rfml->seghead+4);
@@ -233,7 +233,7 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
 	if (cfpkt_getlen(pkt) > rfml->fragment_size + RFM_HEAD_SIZE)
 		err = cfpkt_peek_head(pkt, head, 6);
 
-	if (err < 0)
+	if (err != 0)
 		goto out;
 
 	while (cfpkt_getlen(frontpkt) > rfml->fragment_size + RFM_HEAD_SIZE) {
-- 
cgit 


From 09e31cf0c528dac3358a081dc4e773d1b3de1bc9 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 25 Aug 2020 10:44:04 -0700
Subject: taprio: Fix using wrong queues in gate mask

Since commit 9c66d1564676 ("taprio: Add support for hardware
offloading") there's a bit of inconsistency when offloading schedules
to the hardware:

In software mode, the gate masks are specified in terms of traffic
classes, so if say "sched-entry S 03 20000", it means that the traffic
classes 0 and 1 are open for 20us; when taprio is offloaded to
hardware, the gate masks are specified in terms of hardware queues.

The idea here is to fix hardware offloading, so schedules in hardware
and software mode have the same behavior. What's needed to do is to
map traffic classes to queues when applying the offload to the driver.

Fixes: 9c66d1564676 ("taprio: Add support for hardware offloading")
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_taprio.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index e981992634dd..fe53c1e38c7d 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -1176,9 +1176,27 @@ static void taprio_offload_config_changed(struct taprio_sched *q)
 	spin_unlock(&q->current_entry_lock);
 }
 
-static void taprio_sched_to_offload(struct taprio_sched *q,
+static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask)
+{
+	u32 i, queue_mask = 0;
+
+	for (i = 0; i < dev->num_tc; i++) {
+		u32 offset, count;
+
+		if (!(tc_mask & BIT(i)))
+			continue;
+
+		offset = dev->tc_to_txq[i].offset;
+		count = dev->tc_to_txq[i].count;
+
+		queue_mask |= GENMASK(offset + count - 1, offset);
+	}
+
+	return queue_mask;
+}
+
+static void taprio_sched_to_offload(struct net_device *dev,
 				    struct sched_gate_list *sched,
-				    const struct tc_mqprio_qopt *mqprio,
 				    struct tc_taprio_qopt_offload *offload)
 {
 	struct sched_entry *entry;
@@ -1193,7 +1211,8 @@ static void taprio_sched_to_offload(struct taprio_sched *q,
 
 		e->command = entry->command;
 		e->interval = entry->interval;
-		e->gate_mask = entry->gate_mask;
+		e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask);
+
 		i++;
 	}
 
@@ -1201,7 +1220,6 @@ static void taprio_sched_to_offload(struct taprio_sched *q,
 }
 
 static int taprio_enable_offload(struct net_device *dev,
-				 struct tc_mqprio_qopt *mqprio,
 				 struct taprio_sched *q,
 				 struct sched_gate_list *sched,
 				 struct netlink_ext_ack *extack)
@@ -1223,7 +1241,7 @@ static int taprio_enable_offload(struct net_device *dev,
 		return -ENOMEM;
 	}
 	offload->enable = 1;
-	taprio_sched_to_offload(q, sched, mqprio, offload);
+	taprio_sched_to_offload(dev, sched, offload);
 
 	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
 	if (err < 0) {
@@ -1485,7 +1503,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 	}
 
 	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
-		err = taprio_enable_offload(dev, mqprio, q, new_admin, extack);
+		err = taprio_enable_offload(dev, q, new_admin, extack);
 	else
 		err = taprio_disable_offload(dev, q, extack);
 	if (err)
-- 
cgit 


From 1cec170d458b1d18f6f1654ca84c0804a701c5ef Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 26 Aug 2020 01:31:05 +0200
Subject: mptcp: free acked data before waiting for more memory

After subflow lock is dropped, more wmem might have been made available.

This fixes a deadlock in mptcp_connect.sh 'mmap' mode: wmem is exhausted.
But as the mptcp socket holds on to already-acked data (for retransmit)
no wakeup will occur.

Using 'goto restart' calls mptcp_clean_una(sk) which will free pages
that have been acked completely in the mean time.

Fixes: fb529e62d3f3 ("mptcp: break and restart in case mptcp sndbuf is full")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/protocol.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 1aad411a0e46..8ccd4a151dcb 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -892,7 +892,6 @@ restart:
 		goto out;
 	}
 
-wait_for_sndbuf:
 	__mptcp_flush_join_list(msk);
 	ssk = mptcp_subflow_get_send(msk);
 	while (!sk_stream_memory_free(sk) ||
@@ -982,7 +981,7 @@ wait_for_sndbuf:
 				 */
 				mptcp_set_timeout(sk, ssk);
 				release_sock(ssk);
-				goto wait_for_sndbuf;
+				goto restart;
 			}
 		}
 	}
-- 
cgit 


From 7f6f32bb7d3355cd78ebf1dece9a6ea7a0ca8158 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 26 Aug 2020 19:48:10 +0300
Subject: ipv4: Silence suspicious RCU usage warning

fib_info_notify_update() is always called with RTNL held, but not from
an RCU read-side critical section. This leads to the following warning
[1] when the FIB table list is traversed with
hlist_for_each_entry_rcu(), but without a proper lockdep expression.

Since modification of the list is protected by RTNL, silence the warning
by adding a lockdep expression which verifies RTNL is held.

[1]
 =============================
 WARNING: suspicious RCU usage
 5.9.0-rc1-custom-14233-g2f26e122d62f #129 Not tainted
 -----------------------------
 net/ipv4/fib_trie.c:2124 RCU-list traversed in non-reader section!!

 other info that might help us debug this:

 rcu_scheduler_active = 2, debug_locks = 1
 1 lock held by ip/834:
  #0: ffffffff85a3b6b0 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x49a/0xbd0

 stack backtrace:
 CPU: 0 PID: 834 Comm: ip Not tainted 5.9.0-rc1-custom-14233-g2f26e122d62f #129
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014
 Call Trace:
  dump_stack+0x100/0x184
  lockdep_rcu_suspicious+0x143/0x14d
  fib_info_notify_update+0x8d1/0xa60
  __nexthop_replace_notify+0xd2/0x290
  rtm_new_nexthop+0x35e2/0x5946
  rtnetlink_rcv_msg+0x4f7/0xbd0
  netlink_rcv_skb+0x17a/0x480
  rtnetlink_rcv+0x22/0x30
  netlink_unicast+0x5ae/0x890
  netlink_sendmsg+0x98a/0xf40
  ____sys_sendmsg+0x879/0xa00
  ___sys_sendmsg+0x122/0x190
  __sys_sendmsg+0x103/0x1d0
  __x64_sys_sendmsg+0x7d/0xb0
  do_syscall_64+0x32/0x50
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x7fde28c3be57
 Code: 0c 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51
c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10
RSP: 002b:00007ffc09330028 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fde28c3be57
RDX: 0000000000000000 RSI: 00007ffc09330090 RDI: 0000000000000003
RBP: 000000005f45f911 R08: 0000000000000001 R09: 00007ffc0933012c
R10: 0000000000000076 R11: 0000000000000246 R12: 0000000000000001
R13: 00007ffc09330290 R14: 00007ffc09330eee R15: 00005610e48ed020

Fixes: 1bff1a0c9bbd ("ipv4: Add function to send route updates")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index c89b46fec153..ffc5332f1390 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2121,7 +2121,8 @@ void fib_info_notify_update(struct net *net, struct nl_info *info)
 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 		struct fib_table *tb;
 
-		hlist_for_each_entry_rcu(tb, head, tb_hlist)
+		hlist_for_each_entry_rcu(tb, head, tb_hlist,
+					 lockdep_rtnl_is_held())
 			__fib_info_notify_update(net, tb, info);
 	}
 }
-- 
cgit 


From 96e97bc07e90f175a8980a22827faf702ca4cb30 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 26 Aug 2020 12:40:06 -0700
Subject: net: disable netpoll on fresh napis

napi_disable() makes sure to set the NAPI_STATE_NPSVC bit to prevent
netpoll from accessing rings before init is complete. However, the
same is not done for fresh napi instances in netif_napi_add(),
even though we expect NAPI instances to be added as disabled.

This causes crashes during driver reconfiguration (enabling XDP,
changing the channel count) - if there is any printk() after
netif_napi_add() but before napi_enable().

To ensure memory ordering is correct we need to use RCU accessors.

Reported-by: Rob Sherwood <rsher@fb.com>
Fixes: 2d8bff12699a ("netpoll: Close race condition between poll_one_napi and napi_disable")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c     | 3 ++-
 net/core/netpoll.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index d42c9ea0c3c0..95ac7568f693 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6612,12 +6612,13 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 				weight);
 	napi->weight = weight;
-	list_add(&napi->dev_list, &dev->napi_list);
 	napi->dev = dev;
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
 	set_bit(NAPI_STATE_SCHED, &napi->state);
+	set_bit(NAPI_STATE_NPSVC, &napi->state);
+	list_add_rcu(&napi->dev_list, &dev->napi_list);
 	napi_hash_add(napi);
 }
 EXPORT_SYMBOL(netif_napi_add);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 093e90e52bc2..2338753e936b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -162,7 +162,7 @@ static void poll_napi(struct net_device *dev)
 	struct napi_struct *napi;
 	int cpu = smp_processor_id();
 
-	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+	list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
 		if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
 			poll_one_napi(napi);
 			smp_store_release(&napi->poll_owner, -1);
-- 
cgit 


From 8ed37e79196033dbdb09a4eea9465af6a0e29fe9 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 21 Aug 2020 18:30:43 +0200
Subject: mac80211: use rate provided via status->rate on
 ieee80211_tx_status_ext for AQL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since ieee80211_tx_info does not have enough room to encode HE rates, HE
drivers use status->rate to provide rate info.
Store it in struct sta_info and use it for AQL.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20200821163045.62140-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/airtime.c  | 53 +++++++++++++++++++++++++++++++++++++++++++++++--
 net/mac80211/sta_info.h |  1 +
 net/mac80211/status.c   | 12 +++++++----
 3 files changed, 60 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
index 366f76c9003d..656c9a033714 100644
--- a/net/mac80211/airtime.c
+++ b/net/mac80211/airtime.c
@@ -487,14 +487,61 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime);
 
+static bool ieee80211_fill_rate_info(struct ieee80211_hw *hw,
+				     struct ieee80211_rx_status *stat, u8 band,
+				     struct rate_info *ri)
+{
+	struct ieee80211_supported_band *sband = hw->wiphy->bands[band];
+	int i;
+
+	if (!ri || !sband)
+	    return false;
+
+	stat->bw = ri->bw;
+	stat->nss = ri->nss;
+	stat->rate_idx = ri->mcs;
+
+	if (ri->flags & RATE_INFO_FLAGS_HE_MCS)
+		stat->encoding = RX_ENC_HE;
+	else if (ri->flags & RATE_INFO_FLAGS_VHT_MCS)
+		stat->encoding = RX_ENC_VHT;
+	else if (ri->flags & RATE_INFO_FLAGS_MCS)
+		stat->encoding = RX_ENC_HT;
+	else
+		stat->encoding = RX_ENC_LEGACY;
+
+	if (ri->flags & RATE_INFO_FLAGS_SHORT_GI)
+		stat->enc_flags |= RX_ENC_FLAG_SHORT_GI;
+
+	stat->he_gi = ri->he_gi;
+
+	if (stat->encoding != RX_ENC_LEGACY)
+		return true;
+
+	stat->rate_idx = 0;
+	for (i = 0; i < sband->n_bitrates; i++) {
+		if (ri->legacy != sband->bitrates[i].bitrate)
+			continue;
+
+		stat->rate_idx = i;
+		return true;
+	}
+
+	return false;
+}
+
 static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw,
 					  struct ieee80211_tx_rate *rate,
+					  struct rate_info *ri,
 					  u8 band, int len)
 {
 	struct ieee80211_rx_status stat = {
 		.band = band,
 	};
 
+	if (ieee80211_fill_rate_info(hw, &stat, band, ri))
+		goto out;
+
 	if (rate->idx < 0 || !rate->count)
 		return 0;
 
@@ -522,6 +569,7 @@ static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw,
 		stat.encoding = RX_ENC_LEGACY;
 	}
 
+out:
 	return ieee80211_calc_rx_airtime(hw, &stat, len);
 }
 
@@ -536,7 +584,7 @@ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
 		struct ieee80211_tx_rate *rate = &info->status.rates[i];
 		u32 cur_duration;
 
-		cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate,
+		cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate, NULL,
 							      info->band, len);
 		if (!cur_duration)
 			break;
@@ -573,6 +621,7 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
 		struct sta_info *sta = container_of(pubsta, struct sta_info,
 						    sta);
 		struct ieee80211_tx_rate *rate = &sta->tx_stats.last_rate;
+		struct rate_info *ri = &sta->tx_stats.last_rate_info;
 		u32 airtime;
 
 		if (!(rate->flags & (IEEE80211_TX_RC_VHT_MCS |
@@ -586,7 +635,7 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
 		 * This will not be very accurate, but much better than simply
 		 * assuming un-aggregated tx.
 		 */
-		airtime = ieee80211_calc_tx_airtime_rate(hw, rate, band,
+		airtime = ieee80211_calc_tx_airtime_rate(hw, rate, ri, band,
 							 ampdu ? len * 16 : len);
 		if (ampdu)
 			airtime /= 16;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 9d398c9daa4c..8060d142e4f1 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -611,6 +611,7 @@ struct sta_info {
 		u64 packets[IEEE80211_NUM_ACS];
 		u64 bytes[IEEE80211_NUM_ACS];
 		struct ieee80211_tx_rate last_rate;
+		struct rate_info last_rate_info;
 		u64 msdu[IEEE80211_NUM_TIDS + 1];
 	} tx_stats;
 	u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index adb1d30ce06e..6de63f1d8c7b 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -1137,9 +1137,17 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 	struct ieee80211_tx_info *info = status->info;
 	struct ieee80211_sta *pubsta = status->sta;
 	struct ieee80211_supported_band *sband;
+	struct sta_info *sta;
 	int retry_count;
 	bool acked, noack_success;
 
+	if (pubsta) {
+		sta = container_of(pubsta, struct sta_info, sta);
+
+		if (status->rate)
+			sta->tx_stats.last_rate_info = *status->rate;
+	}
+
 	if (status->skb)
 		return __ieee80211_tx_status(hw, status);
 
@@ -1154,10 +1162,6 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 	noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED);
 
 	if (pubsta) {
-		struct sta_info *sta;
-
-		sta = container_of(pubsta, struct sta_info, sta);
-
 		if (!acked && !noack_success)
 			sta->status_stats.retry_failed++;
 		sta->status_stats.retry_count += retry_count;
-- 
cgit 


From 43cd72c5892c46fe69b8a1682fb4905cf158c39c Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 21 Aug 2020 18:30:44 +0200
Subject: mac80211: factor out code to look up the average packet length
 duration for a rate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This will be used to enhance AQL estimated aggregation length

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20200821163045.62140-2-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/airtime.c | 121 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 74 insertions(+), 47 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
index 656c9a033714..ba95f4ee1f71 100644
--- a/net/mac80211/airtime.c
+++ b/net/mac80211/airtime.c
@@ -405,18 +405,14 @@ ieee80211_calc_legacy_rate_duration(u16 bitrate, bool short_pre,
 	return duration;
 }
 
-u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
-			      struct ieee80211_rx_status *status,
-			      int len)
+static u32 ieee80211_get_rate_duration(struct ieee80211_hw *hw,
+				       struct ieee80211_rx_status *status,
+				       u32 *overhead)
 {
-	struct ieee80211_supported_band *sband;
-	const struct ieee80211_rate *rate;
 	bool sgi = status->enc_flags & RX_ENC_FLAG_SHORT_GI;
-	bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE;
 	int bw, streams;
 	int group, idx;
 	u32 duration;
-	bool cck;
 
 	switch (status->bw) {
 	case RATE_INFO_BW_20:
@@ -437,20 +433,6 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
 	}
 
 	switch (status->encoding) {
-	case RX_ENC_LEGACY:
-		if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ))
-			return 0;
-
-		sband = hw->wiphy->bands[status->band];
-		if (!sband || status->rate_idx >= sband->n_bitrates)
-			return 0;
-
-		rate = &sband->bitrates[status->rate_idx];
-		cck = rate->flags & IEEE80211_RATE_MANDATORY_B;
-
-		return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp,
-							   cck, len);
-
 	case RX_ENC_VHT:
 		streams = status->nss;
 		idx = status->rate_idx;
@@ -477,13 +459,47 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
 
 	duration = airtime_mcs_groups[group].duration[idx];
 	duration <<= airtime_mcs_groups[group].shift;
+	*overhead = 36 + (streams << 2);
+
+	return duration;
+}
+
+
+u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
+			      struct ieee80211_rx_status *status,
+			      int len)
+{
+	struct ieee80211_supported_band *sband;
+	u32 duration, overhead = 0;
+
+	if (status->encoding == RX_ENC_LEGACY) {
+		const struct ieee80211_rate *rate;
+		bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE;
+		bool cck;
+
+		if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ))
+			return 0;
+
+		sband = hw->wiphy->bands[status->band];
+		if (!sband || status->rate_idx >= sband->n_bitrates)
+			return 0;
+
+		rate = &sband->bitrates[status->rate_idx];
+		cck = rate->flags & IEEE80211_RATE_MANDATORY_B;
+
+		return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp,
+							   cck, len);
+	}
+
+	duration = ieee80211_get_rate_duration(hw, status, &overhead);
+	if (!duration)
+		return 0;
+
 	duration *= len;
 	duration /= AVG_PKT_SIZE;
 	duration /= 1024;
 
-	duration += 36 + (streams << 2);
-
-	return duration;
+	return duration + overhead;
 }
 EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime);
 
@@ -530,46 +546,57 @@ static bool ieee80211_fill_rate_info(struct ieee80211_hw *hw,
 	return false;
 }
 
-static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw,
-					  struct ieee80211_tx_rate *rate,
-					  struct rate_info *ri,
-					  u8 band, int len)
+static int ieee80211_fill_rx_status(struct ieee80211_rx_status *stat,
+				    struct ieee80211_hw *hw,
+				    struct ieee80211_tx_rate *rate,
+				    struct rate_info *ri, u8 band, int len)
 {
-	struct ieee80211_rx_status stat = {
-		.band = band,
-	};
+	memset(stat, 0, sizeof(*stat));
+	stat->band = band;
 
-	if (ieee80211_fill_rate_info(hw, &stat, band, ri))
-		goto out;
+	if (ieee80211_fill_rate_info(hw, stat, band, ri))
+		return 0;
 
 	if (rate->idx < 0 || !rate->count)
-		return 0;
+		return -1;
 
 	if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
-		stat.bw = RATE_INFO_BW_80;
+		stat->bw = RATE_INFO_BW_80;
 	else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
-		stat.bw = RATE_INFO_BW_40;
+		stat->bw = RATE_INFO_BW_40;
 	else
-		stat.bw = RATE_INFO_BW_20;
+		stat->bw = RATE_INFO_BW_20;
 
-	stat.enc_flags = 0;
+	stat->enc_flags = 0;
 	if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)
-		stat.enc_flags |= RX_ENC_FLAG_SHORTPRE;
+		stat->enc_flags |= RX_ENC_FLAG_SHORTPRE;
 	if (rate->flags & IEEE80211_TX_RC_SHORT_GI)
-		stat.enc_flags |= RX_ENC_FLAG_SHORT_GI;
+		stat->enc_flags |= RX_ENC_FLAG_SHORT_GI;
 
-	stat.rate_idx = rate->idx;
+	stat->rate_idx = rate->idx;
 	if (rate->flags & IEEE80211_TX_RC_VHT_MCS) {
-		stat.encoding = RX_ENC_VHT;
-		stat.rate_idx = ieee80211_rate_get_vht_mcs(rate);
-		stat.nss = ieee80211_rate_get_vht_nss(rate);
+		stat->encoding = RX_ENC_VHT;
+		stat->rate_idx = ieee80211_rate_get_vht_mcs(rate);
+		stat->nss = ieee80211_rate_get_vht_nss(rate);
 	} else if (rate->flags & IEEE80211_TX_RC_MCS) {
-		stat.encoding = RX_ENC_HT;
+		stat->encoding = RX_ENC_HT;
 	} else {
-		stat.encoding = RX_ENC_LEGACY;
+		stat->encoding = RX_ENC_LEGACY;
 	}
 
-out:
+	return 0;
+}
+
+static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw,
+					  struct ieee80211_tx_rate *rate,
+					  struct rate_info *ri,
+					  u8 band, int len)
+{
+	struct ieee80211_rx_status stat;
+
+	if (ieee80211_fill_rx_status(&stat, hw, rate, ri, band, len))
+		return 0;
+
 	return ieee80211_calc_rx_airtime(hw, &stat, len);
 }
 
-- 
cgit 


From f01cfbaf9b2971126f094acfbda2589bd2394f4d Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 21 Aug 2020 18:30:45 +0200
Subject: mac80211: improve AQL aggregation estimation for low data rates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Links with low data rates use much smaller aggregates and are much more
sensitive to latency added by bufferbloat.
Tune the assumed aggregation length based on the tx rate duration.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20200821163045.62140-3-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/airtime.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
index ba95f4ee1f71..314973033d03 100644
--- a/net/mac80211/airtime.c
+++ b/net/mac80211/airtime.c
@@ -647,27 +647,41 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
 	if (pubsta) {
 		struct sta_info *sta = container_of(pubsta, struct sta_info,
 						    sta);
+		struct ieee80211_rx_status stat;
 		struct ieee80211_tx_rate *rate = &sta->tx_stats.last_rate;
 		struct rate_info *ri = &sta->tx_stats.last_rate_info;
-		u32 airtime;
+		u32 duration, overhead;
+		u8 agg_shift;
 
-		if (!(rate->flags & (IEEE80211_TX_RC_VHT_MCS |
-				     IEEE80211_TX_RC_MCS)))
-			ampdu = false;
+		if (ieee80211_fill_rx_status(&stat, hw, rate, ri, band, len))
+			return 0;
+
+		if (stat.encoding == RX_ENC_LEGACY || !ampdu)
+			return ieee80211_calc_rx_airtime(hw, &stat, len);
 
+		duration = ieee80211_get_rate_duration(hw, &stat, &overhead);
 		/*
 		 * Assume that HT/VHT transmission on any AC except VO will
 		 * use aggregation. Since we don't have reliable reporting
-		 * of aggregation length, assume an average of 16.
+		 * of aggregation length, assume an average size based on the
+		 * tx rate.
 		 * This will not be very accurate, but much better than simply
-		 * assuming un-aggregated tx.
+		 * assuming un-aggregated tx in all cases.
 		 */
-		airtime = ieee80211_calc_tx_airtime_rate(hw, rate, ri, band,
-							 ampdu ? len * 16 : len);
-		if (ampdu)
-			airtime /= 16;
-
-		return airtime;
+		if (duration > 400) /* <= VHT20 MCS2 1S */
+			agg_shift = 1;
+		else if (duration > 250) /* <= VHT20 MCS3 1S or MCS1 2S */
+			agg_shift = 2;
+		else if (duration > 150) /* <= VHT20 MCS5 1S or MCS3 2S */
+			agg_shift = 3;
+		else
+			agg_shift = 4;
+
+		duration *= len;
+		duration /= AVG_PKT_SIZE;
+		duration /= 1024;
+
+		return duration + (overhead >> agg_shift);
 	}
 
 	if (!conf)
-- 
cgit 


From 3579994476b65cb5e272ff0f720a1fd31322e53f Mon Sep 17 00:00:00 2001
From: Shay Bar <shay.bar@celeno.com>
Date: Wed, 26 Aug 2020 17:31:39 +0300
Subject: wireless: fix wrong 160/80+80 MHz setting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix cfg80211_chandef_usable():
consider IEEE80211_VHT_CAP_EXT_NSS_BW when verifying 160/80+80 MHz.

Based on:
"Table 9-272 — Setting of the Supported Channel Width Set subfield and Extended NSS BW
Support subfield at a STA transmitting the VHT Capabilities Information field"
From "Draft P802.11REVmd_D3.0.pdf"

Signed-off-by: Aviad Brikman <aviad.brikman@celeno.com>
Signed-off-by: Shay Bar <shay.bar@celeno.com>
Link: https://lore.kernel.org/r/20200826143139.25976-1-shay.bar@celeno.com
[reformat the code a bit and use u32_get_bits()]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/chan.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 90f0f82cd9ca..edee3a645c06 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/export.h>
+#include <linux/bitfield.h>
 #include <net/cfg80211.h>
 #include "core.h"
 #include "rdev-ops.h"
@@ -912,6 +913,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 	struct ieee80211_sta_vht_cap *vht_cap;
 	struct ieee80211_edmg *edmg_cap;
 	u32 width, control_freq, cap;
+	bool support_80_80 = false;
 
 	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
 		return false;
@@ -979,9 +981,13 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 			return false;
 		break;
 	case NL80211_CHAN_WIDTH_80P80:
-		cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
-		if (chandef->chan->band != NL80211_BAND_6GHZ &&
-		    cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
+		cap = vht_cap->cap;
+		support_80_80 =
+			(cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) ||
+			(cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ &&
+			 cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) ||
+			u32_get_bits(cap, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) > 1;
+		if (chandef->chan->band != NL80211_BAND_6GHZ && !support_80_80)
 			return false;
 		/* fall through */
 	case NL80211_CHAN_WIDTH_80:
@@ -1001,7 +1007,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 			return false;
 		cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
 		if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ &&
-		    cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
+		    cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ &&
+		    !(vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK))
 			return false;
 		break;
 	default:
-- 
cgit 


From 47caf685a6854593348f216e0b489b71c10cbe03 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Aug 2020 10:46:48 +0200
Subject: cfg80211: regulatory: reject invalid hints

Reject invalid hints early in order to not cause a kernel
WARN later if they're restored to or similar.

Reported-by: syzbot+d451401ffd00a60677ee@syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?extid=d451401ffd00a60677ee
Link: https://lore.kernel.org/r/20200819084648.13956-1-johannes@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 35b8847a2f6d..d8a90d397423 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2946,6 +2946,9 @@ int regulatory_hint_user(const char *alpha2,
 	if (WARN_ON(!alpha2))
 		return -EINVAL;
 
+	if (!is_world_regdom(alpha2) && !is_an_alpha2(alpha2))
+		return -EINVAL;
+
 	request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
 	if (!request)
 		return -ENOMEM;
-- 
cgit 


From 47df8e059b49a80c179fae39256bcd7096810934 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 8 Aug 2020 19:25:42 +0200
Subject: mac80211: reduce packet loss event false positives

When running a large number of packets per second with a high data rate
and long A-MPDUs, the packet loss threshold can be reached very quickly
when the link conditions change. This frequently shows up as spurious
disconnects.
Mitigate false positives by using a similar logic for regular stations
as the one being used for TDLS, though with a more aggressive timeout.
Packet loss events are only reported if no ACK was received for a second.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://lore.kernel.org/r/20200808172542.41628-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/sta_info.h |  4 ++--
 net/mac80211/status.c   | 31 +++++++++++++++----------------
 2 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 8060d142e4f1..d5010116cf4d 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -524,7 +524,7 @@ struct ieee80211_sta_rx_stats {
  * @status_stats.retry_failed: # of frames that failed after retry
  * @status_stats.retry_count: # of retries attempted
  * @status_stats.lost_packets: # of lost packets
- * @status_stats.last_tdls_pkt_time: timestamp of last TDLS packet
+ * @status_stats.last_pkt_time: timestamp of last ACKed packet
  * @status_stats.msdu_retries: # of MSDU retries
  * @status_stats.msdu_failed: # of failed MSDUs
  * @status_stats.last_ack: last ack timestamp (jiffies)
@@ -597,7 +597,7 @@ struct sta_info {
 		unsigned long filtered;
 		unsigned long retry_failed, retry_count;
 		unsigned int lost_packets;
-		unsigned long last_tdls_pkt_time;
+		unsigned long last_pkt_time;
 		u64 msdu_retries[IEEE80211_NUM_TIDS + 1];
 		u64 msdu_failed[IEEE80211_NUM_TIDS + 1];
 		unsigned long last_ack;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 6de63f1d8c7b..0794396a7988 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -755,12 +755,16 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
  *  - current throughput (higher value for higher tpt)?
  */
 #define STA_LOST_PKT_THRESHOLD	50
+#define STA_LOST_PKT_TIME	HZ		/* 1 sec since last ACK */
 #define STA_LOST_TDLS_PKT_THRESHOLD	10
 #define STA_LOST_TDLS_PKT_TIME		(10*HZ) /* 10secs since last ACK */
 
 static void ieee80211_lost_packet(struct sta_info *sta,
 				  struct ieee80211_tx_info *info)
 {
+	unsigned long pkt_time = STA_LOST_PKT_TIME;
+	unsigned int pkt_thr = STA_LOST_PKT_THRESHOLD;
+
 	/* If driver relies on its own algorithm for station kickout, skip
 	 * mac80211 packet loss mechanism.
 	 */
@@ -773,21 +777,20 @@ static void ieee80211_lost_packet(struct sta_info *sta,
 		return;
 
 	sta->status_stats.lost_packets++;
-	if (!sta->sta.tdls &&
-	    sta->status_stats.lost_packets < STA_LOST_PKT_THRESHOLD)
-		return;
+	if (sta->sta.tdls) {
+		pkt_time = STA_LOST_TDLS_PKT_TIME;
+		pkt_thr = STA_LOST_PKT_THRESHOLD;
+	}
 
 	/*
 	 * If we're in TDLS mode, make sure that all STA_LOST_TDLS_PKT_THRESHOLD
 	 * of the last packets were lost, and that no ACK was received in the
 	 * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss
 	 * mechanism.
+	 * For non-TDLS, use STA_LOST_PKT_THRESHOLD and STA_LOST_PKT_TIME
 	 */
-	if (sta->sta.tdls &&
-	    (sta->status_stats.lost_packets < STA_LOST_TDLS_PKT_THRESHOLD ||
-	     time_before(jiffies,
-			 sta->status_stats.last_tdls_pkt_time +
-			 STA_LOST_TDLS_PKT_TIME)))
+	if (sta->status_stats.lost_packets < pkt_thr ||
+	    !time_after(jiffies, sta->status_stats.last_pkt_time + pkt_time))
 		return;
 
 	cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr,
@@ -1033,9 +1036,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
 					sta->status_stats.lost_packets = 0;
 
 				/* Track when last TDLS packet was ACKed */
-				if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
-					sta->status_stats.last_tdls_pkt_time =
-						jiffies;
+				sta->status_stats.last_pkt_time = jiffies;
 			} else if (noack_success) {
 				/* nothing to do here, do not account as lost */
 			} else {
@@ -1172,9 +1173,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 			if (sta->status_stats.lost_packets)
 				sta->status_stats.lost_packets = 0;
 
-			/* Track when last TDLS packet was ACKed */
-			if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
-				sta->status_stats.last_tdls_pkt_time = jiffies;
+			/* Track when last packet was ACKed */
+			sta->status_stats.last_pkt_time = jiffies;
 		} else if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
 			return;
 		} else if (noack_success) {
@@ -1263,8 +1263,7 @@ void ieee80211_tx_status_8023(struct ieee80211_hw *hw,
 			if (sta->status_stats.lost_packets)
 				sta->status_stats.lost_packets = 0;
 
-			if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
-				sta->status_stats.last_tdls_pkt_time = jiffies;
+			sta->status_stats.last_pkt_time = jiffies;
 		} else {
 			ieee80211_lost_packet(sta, info);
 		}
-- 
cgit 


From 2d9b55508556ccee6410310fb9ea2482fd3328eb Mon Sep 17 00:00:00 2001
From: Amar Singhal <asinghal@codeaurora.org>
Date: Fri, 19 Jun 2020 13:52:01 -0700
Subject: cfg80211: Adjust 6 GHz frequency to channel conversion

Adjust the 6 GHz frequency to channel conversion function,
the other way around was previously handled.

Signed-off-by: Amar Singhal <asinghal@codeaurora.org>
Link: https://lore.kernel.org/r/1592599921-10607-1-git-send-email-asinghal@codeaurora.org
[rewrite commit message, hard-code channel 2]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/util.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/wireless/util.c b/net/wireless/util.c
index dfad1c0f57ad..c62eb3d9ab3d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -123,11 +123,13 @@ int ieee80211_freq_khz_to_channel(u32 freq)
 		return (freq - 2407) / 5;
 	else if (freq >= 4910 && freq <= 4980)
 		return (freq - 4000) / 5;
-	else if (freq < 5945)
+	else if (freq < 5925)
 		return (freq - 5000) / 5;
+	else if (freq == 5935)
+		return 2;
 	else if (freq <= 45000) /* DMG band lower limit */
-		/* see 802.11ax D4.1 27.3.22.2 */
-		return (freq - 5940) / 5;
+		/* see 802.11ax D6.1 27.3.22.2 */
+		return (freq - 5950) / 5;
 	else if (freq >= 58320 && freq <= 70200)
 		return (freq - 56160) / 2160;
 	else
-- 
cgit 


From 645f08975f49441b3e753d8dc5b740cbcb226594 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Thu, 27 Aug 2020 07:27:49 -0400
Subject: net: Fix some comments

Fix some comments, including wrong function name, duplicated word and so
on.

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c     | 2 +-
 net/ipv4/raw.c      | 2 +-
 net/l3mdev/l3mdev.c | 2 +-
 net/socket.c        | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index e4f40b175acb..8eb2c924805a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3254,7 +3254,7 @@ void sk_common_release(struct sock *sk)
 		sk->sk_prot->destroy(sk);
 
 	/*
-	 * Observation: when sock_common_release is called, processes have
+	 * Observation: when sk_common_release is called, processes have
 	 * no access to socket. But net still has.
 	 * Step one, detach it from networking:
 	 *
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 6fd4330287c2..407956be7deb 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -610,7 +610,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	} else if (!ipc.oif) {
 		ipc.oif = inet->uc_index;
 	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
-		/* oif is set, packet is to local broadcast and
+		/* oif is set, packet is to local broadcast
 		 * and uc_index is set. oif is most likely set
 		 * by sk_bound_dev_if. If uc_index != oif check if the
 		 * oif is an L3 master and uc_index is an L3 slave.
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index e71ca5aec684..864326f150e2 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -154,7 +154,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
 EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu);
 
 /**
- *	l3mdev_fib_table - get FIB table id associated with an L3
+ *	l3mdev_fib_table_rcu - get FIB table id associated with an L3
  *                             master interface
  *	@dev: targeted interface
  */
diff --git a/net/socket.c b/net/socket.c
index dbbe8ea7d395..0c0144604f81 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3610,7 +3610,7 @@ int kernel_getsockname(struct socket *sock, struct sockaddr *addr)
 EXPORT_SYMBOL(kernel_getsockname);
 
 /**
- *	kernel_peername - get the address which the socket is connected (kernel space)
+ *	kernel_getpeername - get the address which the socket is connected (kernel space)
  *	@sock: socket
  *	@addr: address holder
  *
@@ -3671,7 +3671,7 @@ int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset,
 EXPORT_SYMBOL(kernel_sendpage_locked);
 
 /**
- *	kernel_shutdown - shut down part of a full-duplex connection (kernel space)
+ *	kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space)
  *	@sock: socket
  *	@how: connection part
  *
-- 
cgit 


From b43c75abfd084b9e961a47c6331e01f2f91176b6 Mon Sep 17 00:00:00 2001
From: Dinghao Liu <dinghao.liu@zju.edu.cn>
Date: Thu, 27 Aug 2020 16:55:46 +0100
Subject: rxrpc: Fix memory leak in rxkad_verify_response()

Fix a memory leak in rxkad_verify_response() whereby the response buffer
doesn't get freed if we fail to allocate a ticket buffer.

Fixes: ef68622da9cc ("rxrpc: Handle temporary errors better in rxkad security")
Signed-off-by: Dinghao Liu <dinghao.liu@zju.edu.cn>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/rxkad.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 52a24d4ef5d8..e08130e5746b 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -1137,7 +1137,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 	ret = -ENOMEM;
 	ticket = kmalloc(ticket_len, GFP_NOFS);
 	if (!ticket)
-		goto temporary_error;
+		goto temporary_error_free_resp;
 
 	eproto = tracepoint_string("rxkad_tkt_short");
 	abort_code = RXKADPACKETSHORT;
@@ -1230,6 +1230,7 @@ protocol_error:
 
 temporary_error_free_ticket:
 	kfree(ticket);
+temporary_error_free_resp:
 	kfree(response);
 temporary_error:
 	/* Ignore the response packet if we got a temporary error such as
-- 
cgit 


From 174bce38caf18b47816f352156fab926f4fe072a Mon Sep 17 00:00:00 2001
From: zhudi <zhudi21@huawei.com>
Date: Wed, 26 Aug 2020 20:01:13 +0800
Subject: netlink: fix a data race in netlink_rcv_wake()

The data races were reported by KCSAN:
BUG: KCSAN: data-race in netlink_recvmsg / skb_queue_tail

write (marked) to 0xffff8c0986e5a8c8 of 8 bytes by interrupt on cpu 3:
 skb_queue_tail+0xcc/0x120
 __netlink_sendskb+0x55/0x80
 netlink_broadcast_filtered+0x465/0x7e0
 nlmsg_notify+0x8f/0x120
 rtnl_notify+0x8e/0xb0
 __neigh_notify+0xf2/0x120
 neigh_update+0x927/0xde0
 arp_process+0x8a3/0xf50
 arp_rcv+0x27c/0x3b0
 __netif_receive_skb_core+0x181c/0x1840
 __netif_receive_skb+0x38/0xf0
 netif_receive_skb_internal+0x77/0x1c0
 napi_gro_receive+0x1bd/0x1f0
 e1000_clean_rx_irq+0x538/0xb20 [e1000]
 e1000_clean+0x5e4/0x1340 [e1000]
 net_rx_action+0x310/0x9d0
 __do_softirq+0xe8/0x308
 irq_exit+0x109/0x110
 do_IRQ+0x7f/0xe0
 ret_from_intr+0x0/0x1d
 0xffffffffffffffff

read to 0xffff8c0986e5a8c8 of 8 bytes by task 1463 on cpu 0:
 netlink_recvmsg+0x40b/0x820
 sock_recvmsg+0xc9/0xd0
 ___sys_recvmsg+0x1a4/0x3b0
 __sys_recvmsg+0x86/0x120
 __x64_sys_recvmsg+0x52/0x70
 do_syscall_64+0xb5/0x360
 entry_SYSCALL_64_after_hwframe+0x65/0xca
 0xffffffffffffffff

Since the write is under sk_receive_queue->lock but the read
is done as lockless. so fix it by using skb_queue_empty_lockless()
instead of skb_queue_empty() for the read in netlink_rcv_wake()

Signed-off-by: zhudi <zhudi21@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b5f30d7d30d0..d2d1448274f5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -353,7 +353,7 @@ static void netlink_rcv_wake(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 
-	if (skb_queue_empty(&sk->sk_receive_queue))
+	if (skb_queue_empty_lockless(&sk->sk_receive_queue))
 		clear_bit(NETLINK_S_CONGESTED, &nlk->state);
 	if (!test_bit(NETLINK_S_CONGESTED, &nlk->state))
 		wake_up_interruptible(&nlk->wait);
-- 
cgit 


From 5438dd45831ee33869779bd1919b05816ae4dbc9 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 27 Aug 2020 10:40:41 -0700
Subject: net_sched: fix error path in red_init()

When ->init() fails, ->destroy() is called to clean up.
So it is unnecessary to clean up in red_init(), and it
would cause some refcount underflow.

Fixes: aee9caa03fc3 ("net: sched: sch_red: Add qevents "early_drop" and "mark"")
Reported-and-tested-by: syzbot+b33c1cb0a30ebdc8a5f9@syzkaller.appspotmail.com
Reported-and-tested-by: syzbot+e5ea5f8a3ecfd4427a1c@syzkaller.appspotmail.com
Cc: Petr Machata <petrm@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index deac82f3ad7b..e89fab6ccb34 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -353,23 +353,11 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt,
 			      FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP,
 			      tb[TCA_RED_EARLY_DROP_BLOCK], extack);
 	if (err)
-		goto err_early_drop_init;
-
-	err = tcf_qevent_init(&q->qe_mark, sch,
-			      FLOW_BLOCK_BINDER_TYPE_RED_MARK,
-			      tb[TCA_RED_MARK_BLOCK], extack);
-	if (err)
-		goto err_mark_init;
-
-	return 0;
+		return err;
 
-err_mark_init:
-	tcf_qevent_destroy(&q->qe_early_drop, sch);
-err_early_drop_init:
-	del_timer_sync(&q->adapt_timer);
-	red_offload(sch, false);
-	qdisc_put(q->qdisc);
-	return err;
+	return tcf_qevent_init(&q->qe_mark, sch,
+			       FLOW_BLOCK_BINDER_TYPE_RED_MARK,
+			       tb[TCA_RED_MARK_BLOCK], extack);
 }
 
 static int red_change(struct Qdisc *sch, struct nlattr *opt,
-- 
cgit 


From 4b7ddc58e61ada82be1f47f82d5139f0ab2ba478 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 22 Aug 2020 18:07:27 -0700
Subject: netfilter: delete repeated words

Drop duplicated words in net/netfilter/ and net/ipv4/netfilter/.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_pptp.c       | 2 +-
 net/netfilter/nf_conntrack_pptp.c      | 2 +-
 net/netfilter/nf_conntrack_proto_tcp.c | 2 +-
 net/netfilter/xt_recent.c              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 7afde8828b4c..3f248a19faa3 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -3,7 +3,7 @@
  * nf_nat_pptp.c
  *
  * NAT support for PPTP (Point to Point Tunneling Protocol).
- * PPTP is a a protocol for creating virtual private networks.
+ * PPTP is a protocol for creating virtual private networks.
  * It is a specification defined by Microsoft and some vendors
  * working with Microsoft.  PPTP is built on top of a modified
  * version of the Internet Generic Routing Encapsulation Protocol.
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 1f44d523b512..5105d4250012 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
- * PPTP is a a protocol for creating virtual private networks.
+ * PPTP is a protocol for creating virtual private networks.
  * It is a specification defined by Microsoft and some vendors
  * working with Microsoft.  PPTP is built on top of a modified
  * version of the Internet Generic Routing Encapsulation Protocol.
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 6892e497781c..e8c86ee4c1c4 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1152,7 +1152,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
 		   && (old_state == TCP_CONNTRACK_SYN_RECV
 		       || old_state == TCP_CONNTRACK_ESTABLISHED)
 		   && new_state == TCP_CONNTRACK_ESTABLISHED) {
-		/* Set ASSURED if we see see valid ack in ESTABLISHED
+		/* Set ASSURED if we see valid ack in ESTABLISHED
 		   after SYN_RECV or a valid answer for a picked up
 		   connection. */
 		set_bit(IPS_ASSURED_BIT, &ct->status);
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 19bef176145e..606411869698 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -640,7 +640,7 @@ static void __net_exit recent_proc_net_exit(struct net *net)
 	struct recent_table *t;
 
 	/* recent_net_exit() is called before recent_mt_destroy(). Make sure
-	 * that the parent xt_recent proc entry is is empty before trying to
+	 * that the parent xt_recent proc entry is empty before trying to
 	 * remove it.
 	 */
 	spin_lock_bh(&recent_lock);
-- 
cgit 


From ee921183557af39c1a0475f982d43b0fcac25e2e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 23 Aug 2020 13:55:36 +0200
Subject: netfilter: nfnetlink: nfnetlink_unicast() reports EAGAIN instead of
 ENOBUFS

Frontend callback reports EAGAIN to nfnetlink to retry a command, this
is used to signal that module autoloading is required. Unfortunately,
nlmsg_unicast() reports EAGAIN in case the receiver socket buffer gets
full, so it enters a busy-loop.

This patch updates nfnetlink_unicast() to turn EAGAIN into ENOBUFS and
to use nlmsg_unicast(). Remove the flags field in nfnetlink_unicast()
since this is always MSG_DONTWAIT in the existing code which is exactly
what nlmsg_unicast() passes to netlink_unicast() as parameter.

Fixes: 96518518cc41 ("netfilter: add nftables")
Reported-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c   | 61 ++++++++++++++++++++---------------------
 net/netfilter/nfnetlink.c       | 11 ++++++--
 net/netfilter/nfnetlink_log.c   |  3 +-
 net/netfilter/nfnetlink_queue.c |  2 +-
 4 files changed, 39 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 71e501c5ad21..b7dc1cbf40ea 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -815,11 +815,11 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 					nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
 					family, table);
 	if (err < 0)
-		goto err;
+		goto err_fill_table_info;
 
-	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
 
-err:
+err_fill_table_info:
 	kfree_skb(skb2);
 	return err;
 }
@@ -1563,11 +1563,11 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 					nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
 					family, table, chain);
 	if (err < 0)
-		goto err;
+		goto err_fill_chain_info;
 
-	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
 
-err:
+err_fill_chain_info:
 	kfree_skb(skb2);
 	return err;
 }
@@ -3008,11 +3008,11 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 				       nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
 				       family, table, chain, rule, NULL);
 	if (err < 0)
-		goto err;
+		goto err_fill_rule_info;
 
-	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
 
-err:
+err_fill_rule_info:
 	kfree_skb(skb2);
 	return err;
 }
@@ -3968,11 +3968,11 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 
 	err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
 	if (err < 0)
-		goto err;
+		goto err_fill_set_info;
 
-	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
 
-err:
+err_fill_set_info:
 	kfree_skb(skb2);
 	return err;
 }
@@ -4860,24 +4860,18 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	err = -ENOMEM;
 	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (skb == NULL)
-		goto err1;
+		return err;
 
 	err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid,
 					  NFT_MSG_NEWSETELEM, 0, set, &elem);
 	if (err < 0)
-		goto err2;
+		goto err_fill_setelem;
 
-	err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT);
-	/* This avoids a loop in nfnetlink. */
-	if (err < 0)
-		goto err1;
+	return nfnetlink_unicast(skb, ctx->net, ctx->portid);
 
-	return 0;
-err2:
+err_fill_setelem:
 	kfree_skb(skb);
-err1:
-	/* this avoids a loop in nfnetlink. */
-	return err == -EAGAIN ? -ENOBUFS : err;
+	return err;
 }
 
 /* called with rcu_read_lock held */
@@ -6182,10 +6176,11 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 				      nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
 				      family, table, obj, reset);
 	if (err < 0)
-		goto err;
+		goto err_fill_obj_info;
 
-	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+
+err_fill_obj_info:
 	kfree_skb(skb2);
 	return err;
 }
@@ -7045,10 +7040,11 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 					    NFT_MSG_NEWFLOWTABLE, 0, family,
 					    flowtable, &flowtable->hook_list);
 	if (err < 0)
-		goto err;
+		goto err_fill_flowtable_info;
 
-	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+
+err_fill_flowtable_info:
 	kfree_skb(skb2);
 	return err;
 }
@@ -7234,10 +7230,11 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
 	err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid,
 				      nlh->nlmsg_seq);
 	if (err < 0)
-		goto err;
+		goto err_fill_gen_info;
 
-	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+
+err_fill_gen_info:
 	kfree_skb(skb2);
 	return err;
 }
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 5f24edf95830..3a2e64e13b22 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -149,10 +149,15 @@ int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
 }
 EXPORT_SYMBOL_GPL(nfnetlink_set_err);
 
-int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
-		      int flags)
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid)
 {
-	return netlink_unicast(net->nfnl, skb, portid, flags);
+	int err;
+
+	err = nlmsg_unicast(net->nfnl, skb, portid);
+	if (err == -EAGAIN)
+		err = -ENOBUFS;
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(nfnetlink_unicast);
 
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index f02992419850..b35e8d9a5b37 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -356,8 +356,7 @@ __nfulnl_send(struct nfulnl_instance *inst)
 			goto out;
 		}
 	}
-	nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
-			  MSG_DONTWAIT);
+	nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid);
 out:
 	inst->qlen = 0;
 	inst->skb = NULL;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index dadfc06245a3..d1d8bca03b4f 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -681,7 +681,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
 	*packet_id_ptr = htonl(entry->id);
 
 	/* nfnetlink_unicast will either free the nskb or add it to a socket */
-	err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
+	err = nfnetlink_unicast(nskb, net, queue->peer_portid);
 	if (err < 0) {
 		if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
 			failopen = 1;
-- 
cgit 


From c46172147ebbeb70094db48d76ab7945d96c638b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 26 Aug 2020 00:07:18 +0200
Subject: netfilter: conntrack: do not auto-delete clash entries on reply

Its possible that we have more than one packet with the same ct tuple
simultaneously, e.g. when an application emits n packets on same UDP
socket from multiple threads.

NAT rules might be applied to those packets. With the right set of rules,
n packets will be mapped to m destinations, where at least two packets end
up with the same destination.

When this happens, the existing clash resolution may merge the skb that
is processed after the first has been received with the identical tuple
already in hash table.

However, its possible that this identical tuple is a NAT_CLASH tuple.
In that case the second skb will be sent, but no reply can be received
since the reply that is processed first removes the NAT_CLASH tuple.

Do not auto-delete, this gives a 1 second window for replies to be passed
back to originator.

Packets that are coming later (udp stream case) will not be affected:
they match the original ct entry, not a NAT_CLASH one.

Also prevent NAT_CLASH entries from getting offloaded.

Fixes: 6a757c07e51f ("netfilter: conntrack: allow insertion of clashing entries")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_udp.c | 26 ++++++++++----------------
 net/netfilter/nft_flow_offload.c       |  2 +-
 2 files changed, 11 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 760ca2422816..af402f458ee0 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -81,18 +81,6 @@ static bool udp_error(struct sk_buff *skb,
 	return false;
 }
 
-static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct,
-					       struct sk_buff *skb,
-					       enum ip_conntrack_info ctinfo,
-					       u32 extra_jiffies)
-{
-	if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY &&
-		     ct->status & IPS_NAT_CLASH))
-		nf_ct_kill(ct);
-	else
-		nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies);
-}
-
 /* Returns verdict for packet, and may modify conntracktype */
 int nf_conntrack_udp_packet(struct nf_conn *ct,
 			    struct sk_buff *skb,
@@ -124,12 +112,15 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
 
 		nf_ct_refresh_acct(ct, ctinfo, skb, extra);
 
+		/* never set ASSURED for IPS_NAT_CLASH, they time out soon */
+		if (unlikely((ct->status & IPS_NAT_CLASH)))
+			return NF_ACCEPT;
+
 		/* Also, more likely to be important, and not a probe */
 		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
 			nf_conntrack_event_cache(IPCT_ASSURED, ct);
 	} else {
-		nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
-						   timeouts[UDP_CT_UNREPLIED]);
+		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
 	}
 	return NF_ACCEPT;
 }
@@ -206,12 +197,15 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct,
 	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
 		nf_ct_refresh_acct(ct, ctinfo, skb,
 				   timeouts[UDP_CT_REPLIED]);
+
+		if (unlikely((ct->status & IPS_NAT_CLASH)))
+			return NF_ACCEPT;
+
 		/* Also, more likely to be important, and not a probe */
 		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
 			nf_conntrack_event_cache(IPCT_ASSURED, ct);
 	} else {
-		nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
-						   timeouts[UDP_CT_UNREPLIED]);
+		nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
 	}
 	return NF_ACCEPT;
 }
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 3b9b97aa4b32..3a6c84fb2c90 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -102,7 +102,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
 	}
 
 	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
-	    ct->status & IPS_SEQ_ADJUST)
+	    ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
 		goto out;
 
 	if (!nf_ct_is_confirmed(ct))
-- 
cgit 


From bb8872a1e6bc911869a729240781076ed950764b Mon Sep 17 00:00:00 2001
From: Tuong Lien <tuong.t.lien@dektech.com.au>
Date: Sun, 30 Aug 2020 02:37:55 +0700
Subject: tipc: fix using smp_processor_id() in preemptible

The 'this_cpu_ptr()' is used to obtain the AEAD key' TFM on the current
CPU for encryption, however the execution can be preemptible since it's
actually user-space context, so the 'using smp_processor_id() in
preemptible' has been observed.

We fix the issue by using the 'get/put_cpu_ptr()' API which consists of
a 'preempt_disable()' instead.

Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication")
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/crypto.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index c38babaa4e57..7c523dc81575 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -326,7 +326,8 @@ static void tipc_aead_free(struct rcu_head *rp)
 	if (aead->cloned) {
 		tipc_aead_put(aead->cloned);
 	} else {
-		head = *this_cpu_ptr(aead->tfm_entry);
+		head = *get_cpu_ptr(aead->tfm_entry);
+		put_cpu_ptr(aead->tfm_entry);
 		list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) {
 			crypto_free_aead(tfm_entry->tfm);
 			list_del(&tfm_entry->list);
@@ -399,10 +400,15 @@ static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val)
  */
 static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead)
 {
-	struct tipc_tfm **tfm_entry = this_cpu_ptr(aead->tfm_entry);
+	struct tipc_tfm **tfm_entry;
+	struct crypto_aead *tfm;
 
+	tfm_entry = get_cpu_ptr(aead->tfm_entry);
 	*tfm_entry = list_next_entry(*tfm_entry, list);
-	return (*tfm_entry)->tfm;
+	tfm = (*tfm_entry)->tfm;
+	put_cpu_ptr(tfm_entry);
+
+	return tfm;
 }
 
 /**
-- 
cgit 


From 355db39110100b968c8291a1a7d897f92e17a8df Mon Sep 17 00:00:00 2001
From: Leesoo Ahn <dev@ooseel.net>
Date: Tue, 1 Sep 2020 22:04:47 +0900
Subject: pktgen: fix error message with wrong function name

Error on calling kthread_create_on_node prints wrong function name,
kernel_thread.

Fixes: 94dcf29a11b3 ("kthread: use kthread_create_on_node()")
Signed-off-by: Leesoo Ahn <dev@ooseel.net>
Acked-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index b53b6d38c4df..fce968b673b3 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3699,7 +3699,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
 				   cpu_to_node(cpu),
 				   "kpktgend_%d", cpu);
 	if (IS_ERR(p)) {
-		pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
+		pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu);
 		list_del(&t->th_list);
 		kfree(t);
 		return PTR_ERR(p);
-- 
cgit 


From 05d4487197b2b71d5363623c28924fd58c71c0b6 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 2 Sep 2020 16:16:59 +0300
Subject: ipv6: Fix sysctl max for fib_multipath_hash_policy

Cited commit added the possible value of '2', but it cannot be set. Fix
it by adjusting the maximum value to '2'. This is consistent with the
corresponding IPv4 sysctl.

Before:

# sysctl -w net.ipv6.fib_multipath_hash_policy=2
sysctl: setting key "net.ipv6.fib_multipath_hash_policy": Invalid argument
net.ipv6.fib_multipath_hash_policy = 2
# sysctl net.ipv6.fib_multipath_hash_policy
net.ipv6.fib_multipath_hash_policy = 0

After:

# sysctl -w net.ipv6.fib_multipath_hash_policy=2
net.ipv6.fib_multipath_hash_policy = 2
# sysctl net.ipv6.fib_multipath_hash_policy
net.ipv6.fib_multipath_hash_policy = 2

Fixes: d8f74f0975d8 ("ipv6: Support multipath hashing on inner IP pkts")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sysctl_net_ipv6.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index fac2135aa47b..5b60a4bdd36a 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -21,6 +21,7 @@
 #include <net/calipso.h>
 #endif
 
+static int two = 2;
 static int flowlabel_reflect_max = 0x7;
 static int auto_flowlabels_min;
 static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
@@ -150,7 +151,7 @@ static struct ctl_table ipv6_table_template[] = {
 		.mode		= 0644,
 		.proc_handler   = proc_rt6_multipath_hash_policy,
 		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
+		.extra2		= &two,
 	},
 	{
 		.procname	= "seg6_flowlabel",
-- 
cgit 


From 2a63866c8b51a3f72cea388dfac259d0e14c4ba6 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 2 Sep 2020 22:44:16 +0900
Subject: tipc: fix shutdown() of connectionless socket

syzbot is reporting hung task at nbd_ioctl() [1], for there are two
problems regarding TIPC's connectionless socket's shutdown() operation.

----------
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/nbd.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
        const int fd = open("/dev/nbd0", 3);
        alarm(5);
        ioctl(fd, NBD_SET_SOCK, socket(PF_TIPC, SOCK_DGRAM, 0));
        ioctl(fd, NBD_DO_IT, 0); /* To be interrupted by SIGALRM. */
        return 0;
}
----------

One problem is that wait_for_completion() from flush_workqueue() from
nbd_start_device_ioctl() from nbd_ioctl() cannot be completed when
nbd_start_device_ioctl() received a signal at wait_event_interruptible(),
for tipc_shutdown() from kernel_sock_shutdown(SHUT_RDWR) from
nbd_mark_nsock_dead() from sock_shutdown() from nbd_start_device_ioctl()
is failing to wake up a WQ thread sleeping at wait_woken() from
tipc_wait_for_rcvmsg() from sock_recvmsg() from sock_xmit() from
nbd_read_stat() from recv_work() scheduled by nbd_start_device() from
nbd_start_device_ioctl(). Fix this problem by always invoking
sk->sk_state_change() (like inet_shutdown() does) when tipc_shutdown() is
called.

The other problem is that tipc_wait_for_rcvmsg() cannot return when
tipc_shutdown() is called, for tipc_shutdown() sets sk->sk_shutdown to
SEND_SHUTDOWN (despite "how" is SHUT_RDWR) while tipc_wait_for_rcvmsg()
needs sk->sk_shutdown set to RCV_SHUTDOWN or SHUTDOWN_MASK. Fix this
problem by setting sk->sk_shutdown to SHUTDOWN_MASK (like inet_shutdown()
does) when the socket is connectionless.

[1] https://syzkaller.appspot.com/bug?id=3fe51d307c1f0a845485cf1798aa059d12bf18b2

Reported-by: syzbot <syzbot+e36f41d207137b5d12f7@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 07419f36116a..d49c6173954a 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2771,18 +2771,21 @@ static int tipc_shutdown(struct socket *sock, int how)
 
 	trace_tipc_sk_shutdown(sk, NULL, TIPC_DUMP_ALL, " ");
 	__tipc_shutdown(sock, TIPC_CONN_SHUTDOWN);
-	sk->sk_shutdown = SEND_SHUTDOWN;
+	if (tipc_sk_type_connectionless(sk))
+		sk->sk_shutdown = SHUTDOWN_MASK;
+	else
+		sk->sk_shutdown = SEND_SHUTDOWN;
 
 	if (sk->sk_state == TIPC_DISCONNECTING) {
 		/* Discard any unreceived messages */
 		__skb_queue_purge(&sk->sk_receive_queue);
 
-		/* Wake up anyone sleeping in poll */
-		sk->sk_state_change(sk);
 		res = 0;
 	} else {
 		res = -ENOTCONN;
 	}
+	/* Wake up anyone sleeping in poll. */
+	sk->sk_state_change(sk);
 
 	release_sock(sk);
 	return res;
-- 
cgit 


From fffe83c8c40a61978f8998a85cc94bb2ca19b6dd Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Thu, 3 Sep 2020 21:53:15 +0200
Subject: net/smc: fix toleration of fake add_link messages

Older SMCR implementations had no link failover support and used one
link only. Because the handshake protocol requires to try the
establishment of a second link the old code sent a fake add_link message
and declined any server response afterwards.
The current code supports multiple links and inspects the received fake
add_link message more closely. To tolerate the fake add_link messages
smc_llc_is_local_add_link() needs an improved check of the message to
be able to separate between locally enqueued and fake add_link messages.
And smc_llc_cli_add_link() needs to check if the provided qp_mtu size is
invalid and reject the add_link request in that case.

Fixes: c48254fa48e5 ("net/smc: move add link processing for new device into llc layer")
Reviewed-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_llc.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index df5b0a6ea848..3ea33466ebe9 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -841,6 +841,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
 	struct smc_init_info ini;
 	int lnk_idx, rc = 0;
 
+	if (!llc->qp_mtu)
+		goto out_reject;
+
 	ini.vlan_id = lgr->vlan_id;
 	smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
 	if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
@@ -917,10 +920,20 @@ out:
 	kfree(qentry);
 }
 
+static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(llc->raw.data); i++)
+		if (llc->raw.data[i])
+			return false;
+	return true;
+}
+
 static bool smc_llc_is_local_add_link(union smc_llc_msg *llc)
 {
 	if (llc->raw.hdr.common.type == SMC_LLC_ADD_LINK &&
-	    !llc->add_link.qp_mtu && !llc->add_link.link_num)
+	    smc_llc_is_empty_llc_message(llc))
 		return true;
 	return false;
 }
-- 
cgit 


From 2d2bfeb8c5c8c95a93c473ca2c7d07cad6943a34 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 3 Sep 2020 21:53:16 +0200
Subject: net/smc: set rx_off for SMCR explicitly

SMC tries to make use of SMCD first. If a problem shows up,
it tries to switch to SMCR. If the SMCD initializing problem shows
up after the SMCD connection has already been initialized, field
rx_off keeps the wrong SMCD value for SMCR, which results in corrupted
data at the receiver.
This patch adds an explicit (re-)setting of field rx_off to zero if the
connection uses SMCR.

Fixes: be244f28d22f ("net/smc: add SMC-D support in data transfer")
Reviewed-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index b42fa3b00d00..a6ac0eb20c99 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1356,6 +1356,8 @@ create:
 	if (ini->is_smcd) {
 		conn->rx_off = sizeof(struct smcd_cdc_msg);
 		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
+	} else {
+		conn->rx_off = 0;
 	}
 #ifndef KERNEL_HAS_ATOMIC64
 	spin_lock_init(&conn->acurs_lock);
-- 
cgit 


From 1d8df41d896d75c6eababaa912e3a814585d9f61 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 3 Sep 2020 21:53:17 +0200
Subject: net/smc: reset sndbuf_desc if freed

When an SMC connection is created, and there is a problem to
create an RMB or DMB, the previously created send buffer is
thrown away as well including buffer descriptor freeing.
Make sure the connection no longer references the freed
buffer descriptor, otherwise bugs like this are possible:

[71556.835148] =============================================================================
[71556.835168] BUG kmalloc-128 (Tainted: G    B      OE    ): Poison overwritten
[71556.835172] -----------------------------------------------------------------------------

[71556.835179] INFO: 0x00000000d20894be-0x00000000aaef63e9 @offset=2724. First byte 0x0 instead of 0x6b
[71556.835215] INFO: Allocated in __smc_buf_create+0x184/0x578 [smc] age=0 cpu=5 pid=46726
[71556.835234]     ___slab_alloc+0x5a4/0x690
[71556.835239]     __slab_alloc.constprop.0+0x70/0xb0
[71556.835243]     kmem_cache_alloc_trace+0x38e/0x3f8
[71556.835250]     __smc_buf_create+0x184/0x578 [smc]
[71556.835257]     smc_buf_create+0x2e/0xe8 [smc]
[71556.835264]     smc_listen_work+0x516/0x6a0 [smc]
[71556.835275]     process_one_work+0x280/0x478
[71556.835280]     worker_thread+0x66/0x368
[71556.835287]     kthread+0x17a/0x1a0
[71556.835294]     ret_from_fork+0x28/0x2c
[71556.835301] INFO: Freed in smc_buf_create+0xd8/0xe8 [smc] age=0 cpu=5 pid=46726
[71556.835307]     __slab_free+0x246/0x560
[71556.835311]     kfree+0x398/0x3f8
[71556.835318]     smc_buf_create+0xd8/0xe8 [smc]
[71556.835324]     smc_listen_work+0x516/0x6a0 [smc]
[71556.835328]     process_one_work+0x280/0x478
[71556.835332]     worker_thread+0x66/0x368
[71556.835337]     kthread+0x17a/0x1a0
[71556.835344]     ret_from_fork+0x28/0x2c
[71556.835348] INFO: Slab 0x00000000a0744551 objects=51 used=51 fp=0x0000000000000000 flags=0x1ffff00000010200
[71556.835352] INFO: Object 0x00000000563480a1 @offset=2688 fp=0x00000000289567b2

[71556.835359] Redzone 000000006783cde2: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb  ................
[71556.835363] Redzone 00000000e35b876e: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb  ................
[71556.835367] Redzone 0000000023074562: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb  ................
[71556.835372] Redzone 00000000b9564b8c: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb  ................
[71556.835376] Redzone 00000000810c6362: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb  ................
[71556.835380] Redzone 0000000065ef52c3: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb  ................
[71556.835384] Redzone 00000000c5dd6984: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb  ................
[71556.835388] Redzone 000000004c480f8f: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb  ................
[71556.835392] Object 00000000563480a1: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
[71556.835397] Object 000000009c479d06: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
[71556.835401] Object 000000006e1dce92: 6b 6b 6b 6b 00 00 00 00 6b 6b 6b 6b 6b 6b 6b 6b  kkkk....kkkkkkkk
[71556.835405] Object 00000000227f7cf8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
[71556.835410] Object 000000009a701215: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
[71556.835414] Object 000000003731ce76: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
[71556.835418] Object 00000000f7085967: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
[71556.835422] Object 0000000007f99927: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5  kkkkkkkkkkkkkkk.
[71556.835427] Redzone 00000000579c4913: bb bb bb bb bb bb bb bb                          ........
[71556.835431] Padding 00000000305aef82: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  ZZZZZZZZZZZZZZZZ
[71556.835435] Padding 00000000b1cdd722: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  ZZZZZZZZZZZZZZZZ
[71556.835438] Padding 00000000c7568199: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  ZZZZZZZZZZZZZZZZ
[71556.835442] Padding 00000000fad4c4d4: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a  ZZZZZZZZZZZZZZZZ
[71556.835451] CPU: 0 PID: 47939 Comm: kworker/0:15 Tainted: G    B      OE     5.9.0-rc1uschi+ #54
[71556.835456] Hardware name: IBM 3906 M03 703 (LPAR)
[71556.835464] Workqueue: events smc_listen_work [smc]
[71556.835470] Call Trace:
[71556.835478]  [<00000000d5eaeb10>] show_stack+0x90/0xf8
[71556.835493]  [<00000000d66fc0f8>] dump_stack+0xa8/0xe8
[71556.835499]  [<00000000d61a511c>] check_bytes_and_report+0x104/0x130
[71556.835504]  [<00000000d61a57b2>] check_object+0x26a/0x2e0
[71556.835509]  [<00000000d61a59bc>] alloc_debug_processing+0x194/0x238
[71556.835514]  [<00000000d61a8c14>] ___slab_alloc+0x5a4/0x690
[71556.835519]  [<00000000d61a9170>] __slab_alloc.constprop.0+0x70/0xb0
[71556.835524]  [<00000000d61aaf66>] kmem_cache_alloc_trace+0x38e/0x3f8
[71556.835530]  [<000003ff80549bbc>] __smc_buf_create+0x184/0x578 [smc]
[71556.835538]  [<000003ff8054a396>] smc_buf_create+0x2e/0xe8 [smc]
[71556.835545]  [<000003ff80540c16>] smc_listen_work+0x516/0x6a0 [smc]
[71556.835549]  [<00000000d5f0f448>] process_one_work+0x280/0x478
[71556.835554]  [<00000000d5f0f6a6>] worker_thread+0x66/0x368
[71556.835559]  [<00000000d5f18692>] kthread+0x17a/0x1a0
[71556.835563]  [<00000000d6abf3b8>] ret_from_fork+0x28/0x2c
[71556.835569] INFO: lockdep is turned off.
[71556.835573] FIX kmalloc-128: Restoring 0x00000000d20894be-0x00000000aaef63e9=0x6b

[71556.835577] FIX kmalloc-128: Marking all objects used

Fixes: fd7f3a746582 ("net/smc: remove freed buffer from list")
Reviewed-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index a6ac0eb20c99..a406627b1d55 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1779,6 +1779,7 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
 		list_del(&smc->conn.sndbuf_desc->list);
 		mutex_unlock(&smc->conn.lgr->sndbufs_lock);
 		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
+		smc->conn.sndbuf_desc = NULL;
 	}
 	return rc;
 }
-- 
cgit 


From 5fb8642a17aa2ab9077372977d67a72d3899a98d Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 3 Sep 2020 21:53:18 +0200
Subject: net/smc: fix sock refcounting in case of termination

When an ISM device is removed, all its linkgroups are terminated,
i.e. all the corresponding connections are killed.
Connection killing invokes smc_close_active_abort(), which decreases
the sock refcount for certain states to simulate passive closing.
And it cancels the close worker and has to give up the sock lock for
this timeframe. This opens the door for a passive close worker or a
socket close to run in between. In this case smc_close_active_abort() and
passive close worker resp. smc_release() might do a sock_put for passive
closing. This causes:

[ 1323.315943] refcount_t: underflow; use-after-free.
[ 1323.316055] WARNING: CPU: 3 PID: 54469 at lib/refcount.c:28 refcount_warn_saturate+0xe8/0x130
[ 1323.316069] Kernel panic - not syncing: panic_on_warn set ...
[ 1323.316084] CPU: 3 PID: 54469 Comm: uperf Not tainted 5.9.0-20200826.rc2.git0.46328853ed20.300.fc32.s390x+debug #1
[ 1323.316096] Hardware name: IBM 2964 NC9 702 (z/VM 6.4.0)
[ 1323.316108] Call Trace:
[ 1323.316125]  [<00000000c0d4aae8>] show_stack+0x90/0xf8
[ 1323.316143]  [<00000000c15989b0>] dump_stack+0xa8/0xe8
[ 1323.316158]  [<00000000c0d8344e>] panic+0x11e/0x288
[ 1323.316173]  [<00000000c0d83144>] __warn+0xac/0x158
[ 1323.316187]  [<00000000c1597a7a>] report_bug+0xb2/0x130
[ 1323.316201]  [<00000000c0d36424>] monitor_event_exception+0x44/0xc0
[ 1323.316219]  [<00000000c195c716>] pgm_check_handler+0x1da/0x238
[ 1323.316234]  [<00000000c151844c>] refcount_warn_saturate+0xec/0x130
[ 1323.316280] ([<00000000c1518448>] refcount_warn_saturate+0xe8/0x130)
[ 1323.316310]  [<000003ff801f2e2a>] smc_release+0x192/0x1c8 [smc]
[ 1323.316323]  [<00000000c169f1fa>] __sock_release+0x5a/0xe0
[ 1323.316334]  [<00000000c169f2ac>] sock_close+0x2c/0x40
[ 1323.316350]  [<00000000c1086de0>] __fput+0xb8/0x278
[ 1323.316362]  [<00000000c0db1e0e>] task_work_run+0x76/0xb8
[ 1323.316393]  [<00000000c0d8ab84>] do_exit+0x26c/0x520
[ 1323.316408]  [<00000000c0d8af08>] do_group_exit+0x48/0xc0
[ 1323.316421]  [<00000000c0d8afa8>] __s390x_sys_exit_group+0x28/0x38
[ 1323.316433]  [<00000000c195c32c>] system_call+0xe0/0x2b4
[ 1323.316446] 1 lock held by uperf/54469:
[ 1323.316456]  #0: 0000000044125e60 (&sb->s_type->i_mutex_key#9){+.+.}-{3:3}, at: __sock_release+0x44/0xe0

The patch rechecks sock state in smc_close_active_abort() after
smc_close_cancel_work() to avoid duplicate decrease of sock
refcount for the same purpose.

Fixes: 611b63a12732 ("net/smc: cancel tx worker in case of socket aborts")
Reviewed-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_close.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 290270c821ca..35e30d39510d 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -116,7 +116,6 @@ static void smc_close_cancel_work(struct smc_sock *smc)
 	cancel_work_sync(&smc->conn.close_work);
 	cancel_delayed_work_sync(&smc->conn.tx_work);
 	lock_sock(sk);
-	sk->sk_state = SMC_CLOSED;
 }
 
 /* terminate smc socket abnormally - active abort
@@ -134,22 +133,22 @@ void smc_close_active_abort(struct smc_sock *smc)
 	}
 	switch (sk->sk_state) {
 	case SMC_ACTIVE:
-		sk->sk_state = SMC_PEERABORTWAIT;
-		smc_close_cancel_work(smc);
-		sk->sk_state = SMC_CLOSED;
-		sock_put(sk); /* passive closing */
-		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
+		sk->sk_state = SMC_PEERABORTWAIT;
 		smc_close_cancel_work(smc);
+		if (sk->sk_state != SMC_PEERABORTWAIT)
+			break;
 		sk->sk_state = SMC_CLOSED;
-		sock_put(sk); /* postponed passive closing */
+		sock_put(sk); /* (postponed) passive closing */
 		break;
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
 	case SMC_PEERFINCLOSEWAIT:
 		sk->sk_state = SMC_PEERABORTWAIT;
 		smc_close_cancel_work(smc);
+		if (sk->sk_state != SMC_PEERABORTWAIT)
+			break;
 		sk->sk_state = SMC_CLOSED;
 		smc_conn_free(&smc->conn);
 		release_clcsock = true;
@@ -159,6 +158,8 @@ void smc_close_active_abort(struct smc_sock *smc)
 	case SMC_APPFINCLOSEWAIT:
 		sk->sk_state = SMC_PEERABORTWAIT;
 		smc_close_cancel_work(smc);
+		if (sk->sk_state != SMC_PEERABORTWAIT)
+			break;
 		sk->sk_state = SMC_CLOSED;
 		smc_conn_free(&smc->conn);
 		release_clcsock = true;
-- 
cgit