From 7a7d3e4c031b969c3b570787fc9fdb605abda03e Mon Sep 17 00:00:00 2001 From: Simon Dinkin Date: Thu, 31 Aug 2017 13:09:36 +0300 Subject: mac80211: fix incorrect assignment of reassoc value This fixes a minor issue in the log message. In the ieee80211_rx_mgmt_assoc_resp function, when assigning the reassoc value from the mgmt frame control, the ieee80211_is_reassoc_resp function needs to be used instead of the ieee80211_is_reassoc_req function. Signed-off-by: Simon Dinkin Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b588e593b0ec..3b8e2709d8de 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3155,7 +3155,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (len < 24 + 6) return; - reassoc = ieee80211_is_reassoc_req(mgmt->frame_control); + reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); aid = le16_to_cpu(mgmt->u.assoc_resp.aid); -- cgit From 53168215909281a09d3afc6fb51a9d4f81f74d39 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Jun 2017 12:20:30 +0200 Subject: mac80211: fix VLAN handling with TXQs With TXQs, the AP_VLAN interfaces are resolved to their owner AP interface when enqueuing the frame, which makes sense since the frame really goes out on that as far as the driver is concerned. However, this introduces a problem: frames to be encrypted with a VLAN-specific GTK will now be encrypted with the AP GTK, since the information about which virtual interface to use to select the key is taken from the TXQ. Fix this by preserving info->control.vif and using that in the dequeue function. This now requires doing the driver-mapping in the dequeue as well. Since there's no way to filter the frames that are sitting on a TXQ, drop all frames, which may affect other interfaces, when an AP_VLAN is removed.
Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 17 +++++++++++++++-- net/mac80211/tx.c | 36 +++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9228ac73c429..44399322f356 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -792,6 +792,7 @@ static int ieee80211_open(struct net_device *dev) static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { + struct ieee80211_sub_if_data *txq_sdata = sdata; struct ieee80211_local *local = sdata->local; struct fq *fq = &local->fq; unsigned long flags; @@ -937,6 +938,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: + txq_sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); @@ -1007,8 +1011,17 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - if (sdata->vif.txq) { - struct txq_info *txqi = to_txq_info(sdata->vif.txq); + if (txq_sdata->vif.txq) { + struct txq_info *txqi = to_txq_info(txq_sdata->vif.txq); + + /* + * FIXME FIXME + * + * We really shouldn't purge the *entire* txqi since that + * contains frames for the other AP_VLANs (and possibly + * the AP itself) as well, but there's no API in FQ now + * to be able to filter. + */ spin_lock_bh(&fq->lock); ieee80211_txq_purge(local, txqi); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8858f4f185e9..94826680cf2b 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1276,11 +1276,6 @@ static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); } -static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi) -{ - IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif; -} - static u32 codel_skb_len_func(const struct sk_buff *skb) { return skb->len; @@ -3414,6 +3409,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; + struct ieee80211_vif *vif; spin_lock_bh(&fq->lock); @@ -3430,8 +3426,6 @@ begin: if (!skb) goto out; - ieee80211_set_skb_vif(skb, txqi); - hdr = (struct ieee80211_hdr *)skb->data; info = IEEE80211_SKB_CB(skb); @@ -3488,6 +3482,34 @@ begin: } } + switch (tx.sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + if (tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + vif = &tx.sdata->vif; + break; + } + tx.sdata = rcu_dereference(local->monitor_sdata); + if (tx.sdata) { + vif = &tx.sdata->vif; + info->hw_queue = + vif->hw_queue[skb_get_queue_mapping(skb)]; + } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } else { + vif = NULL; + } + break; + case NL80211_IFTYPE_AP_VLAN: + tx.sdata = container_of(tx.sdata->bss, + struct ieee80211_sub_if_data, u.ap); + /* fall through */ + default: + vif = &tx.sdata->vif; + break; + } + + IEEE80211_SKB_CB(skb)->control.vif = vif; out: spin_unlock_bh(&fq->lock); -- cgit From d81b0fd0e7b76b05873299ae2dd310127b13613b Mon Sep 17 00:00:00 2001 From: Sharon Dvir Date: Sat, 5 Aug 2017 11:44:30 +0300 Subject: mac80211: shorten debug prints using ht_dbg() to avoid warning Invoking ht_dbg() with too long of a string will print a warning. 
Shorten the messages while retaining the printed parameters. Signed-off-by: Sharon Dvir Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index cbd48762256c..420486b5a1d9 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -436,7 +436,7 @@ static void sta_addba_resp_timer_expired(unsigned long data) test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { rcu_read_unlock(); ht_dbg(sta->sdata, - "timer expired on %pM tid %d but we are not (or no longer) expecting addBA response there\n", + "timer expired on %pM tid %d not expecting addBA response\n", sta->sta.addr, tid); return; } @@ -639,7 +639,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, time_before(jiffies, sta->ampdu_mlme.last_addba_req_time[tid] + HT_AGG_RETRIES_PERIOD)) { ht_dbg(sdata, - "BA request denied - waiting a grace period after %d failed requests on %pM tid %u\n", + "BA request denied - %d failed requests on %pM tid %u\n", sta->ampdu_mlme.addba_req_num[tid], sta->sta.addr, tid); ret = -EBUSY; goto err_unlock_sta; -- cgit From b44eebea181a36378bea8f27e7f9b4175bfad683 Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Sat, 5 Aug 2017 11:44:29 +0300 Subject: mac80211: add MESH IE in the correct order VHT MESH support was added, but the order of the IEs wasn't enforced. Fix that. Signed-off-by: Liad Kaufman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 259698de569f..6aef6793d052 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1436,7 +1436,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, - /* mesh ID can't happen here */ + WLAN_EID_MESH_ID, /* 60 GHz can't happen here right now */ }; noffset = ieee80211_ie_split(ie, ie_len, -- cgit From 89e9bfc4ee859ad2a477f10aa2d5c37377242296 Mon Sep 17 00:00:00 2001 From: Chunho Lee Date: Fri, 7 Jul 2017 17:03:14 -0700 Subject: mac80211: Fix null pointer dereference with iTXQ support This change adds a null pointer check before dereferencing the pointer dev for netif_tx_start_all_queues() when an interface is added. With iTXQ support, netif_tx_start_all_queues() is always called while an interface is added. However, the netdev queues are not associated and dev is NULL when the interface is either NL80211_IFTYPE_P2P_DEVICE or NL80211_IFTYPE_NAN.
Signed-off-by: Chunho Lee Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 44399322f356..f75029abf728 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -731,7 +731,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.type == NL80211_IFTYPE_AP_VLAN || local->ops->wake_tx_queue) { /* XXX: for AP_VLAN, actually track AP queues */ - netif_tx_start_all_queues(dev); + if (dev) + netif_tx_start_all_queues(dev); } else if (dev) { unsigned long flags; int n_acs = IEEE80211_NUM_ACS; -- cgit From 979e1f08042b83152dfe3d76df10db31eb7edf98 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Jun 2017 12:20:28 +0200 Subject: mac80211: agg-tx: call drv_wake_tx_queue in proper context Since drv_wake_tx_queue() is normally called in the TX path, which is already in an RCU critical section, we should call it the same way in the aggregation code path, so if the driver expects to be able to use RCU, it'll already be protected without having to enter a nested critical section. Additionally, disable soft-IRQs, since not doing so could cause issues in a driver that relies on them already being disabled like in the other path. Fixes: ba8c3d6f16a1 ("mac80211: add an intermediate software queue implementation") Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 420486b5a1d9..bef516ec47f9 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -226,7 +226,11 @@ ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable) clear_bit(IEEE80211_TXQ_AMPDU, &txqi->flags); clear_bit(IEEE80211_TXQ_STOP, &txqi->flags); + local_bh_disable(); + rcu_read_lock(); drv_wake_tx_queue(sta->sdata->local, txqi); + rcu_read_unlock(); + local_bh_enable(); } /* -- cgit From 6e46d8ce894374fc135c96a8d1057c6af1fef237 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 18 Aug 2017 15:33:57 +0300 Subject: mac80211: flush hw_roc_start work before cancelling the ROC When HW ROC is supported it is possible that after the HW notified that the ROC has started, the ROC was cancelled and another ROC was added while the hw_roc_start worker is waiting on the mutex (since cancelling the ROC and adding another one also holds the same mutex). As a result, the hw_roc_start worker will continue to run after the new ROC is added but before it is actually started by the HW. This may result in notifying userspace that the ROC has started before it actually does, or, in the case of a management tx ROC, in an attempt to tx while not on the right channel. In addition, when the driver notifies mac80211 that the second ROC has started, mac80211 will warn that this ROC has already been notified. Fix this by flushing the hw_roc_start work before cancelling an ROC.
Cc: stable@vger.kernel.org Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/offchannel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index f8e7a8bbc618..faf4f6055000 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -707,6 +707,8 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, if (!cookie) return -ENOENT; + flush_work(&local->hw_roc_start); + mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!mgmt_tx && roc->cookie != cookie) -- cgit From ba83bfb1e86501d21077b570e10e443f1605be64 Mon Sep 17 00:00:00 2001 From: Igor Mitsyanko Date: Wed, 30 Aug 2017 13:52:25 -0700 Subject: nl80211: look for HT/VHT capabilities in beacon's tail There are no HT/VHT capabilities in cfg80211_ap_settings::beacon_ies, these should be looked for in beacon's tail instead. Fixes: 66cd794e3c30 ("nl80211: add HT/VHT capabilities to AP parameters") Signed-off-by: Igor Mitsyanko Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8ce85420ecb0..0df8023f480b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3791,8 +3791,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) { const struct cfg80211_beacon_data *bcn = &params->beacon; - size_t ies_len = bcn->beacon_ies_len; - const u8 *ies = bcn->beacon_ies; + size_t ies_len = bcn->tail_len; + const u8 *ies = bcn->tail; const u8 *rates; const u8 *cap; -- cgit From 4e0854a74f08e6a9d847f2c2cfae7b07c931d125 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 6 Sep 2017 13:45:40 +0300 Subject: cfg80211: honor NL80211_RRF_NO_HT40{MINUS,PLUS} Honor the NL80211_RRF_NO_HT40{MINUS,PLUS} flags in reg_process_ht_flags_channel. Not doing so can lead to a firmware assert in iwlwifi, for example. Fixes: b0d7aa59592b ("cfg80211: allow wiphy specific regdomain management") Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/wireless/reg.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 5fae296a6a58..6e94f6934a0e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -4,6 +4,7 @@ * Copyright 2007 Johannes Berg * Copyright 2008-2011 Luis R.
Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2017 Intel Deutschland GmbH * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -1483,7 +1484,9 @@ static void reg_process_ht_flags_channel(struct wiphy *wiphy, { struct ieee80211_supported_band *sband = wiphy->bands[channel->band]; struct ieee80211_channel *channel_before = NULL, *channel_after = NULL; + const struct ieee80211_regdomain *regd; unsigned int i; + u32 flags; if (!is_ht40_allowed(channel)) { channel->flags |= IEEE80211_CHAN_NO_HT40; @@ -1503,17 +1506,30 @@ static void reg_process_ht_flags_channel(struct wiphy *wiphy, channel_after = c; } + flags = 0; + regd = get_wiphy_regdom(wiphy); + if (regd) { + const struct ieee80211_reg_rule *reg_rule = + freq_reg_info_regd(MHZ_TO_KHZ(channel->center_freq), + regd, MHZ_TO_KHZ(20)); + + if (!IS_ERR(reg_rule)) + flags = reg_rule->flags; + } + /* * Please note that this assumes target bandwidth is 20 MHz, * if that ever changes we also need to change the below logic * to include that as well. */ - if (!is_ht40_allowed(channel_before)) + if (!is_ht40_allowed(channel_before) || + flags & NL80211_RRF_NO_HT40MINUS) channel->flags |= IEEE80211_CHAN_NO_HT40MINUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS; - if (!is_ht40_allowed(channel_after)) + if (!is_ht40_allowed(channel_after) || + flags & NL80211_RRF_NO_HT40PLUS) channel->flags |= IEEE80211_CHAN_NO_HT40PLUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS; -- cgit From 98e93e968e4947cd71c2eb69e323682daa453ee7 Mon Sep 17 00:00:00 2001 From: Ilan peer Date: Wed, 6 Sep 2017 17:32:40 +0300 Subject: mac80211: Complete ampdu work schedule during session tear down Commit 7a7c0a6438b8 ("mac80211: fix TX aggregation start/stop callback race") added a cancellation of the ampdu work after the loop that stopped the Tx and Rx BA sessions. However, in some cases, e.g., during HW reconfig, the low level driver might call mac80211 APIs to complete the stopping of the BA sessions, which would queue the ampdu work to handle the actual completion. This work needs to be performed as otherwise mac80211 data structures would not be properly synced. Fix this by checking if BA session STOP_CB bit is set after the BA session cancellation and properly clean the session. Signed-off-by: Ilan Peer [Johannes: the work isn't flushed because that could do other things we don't want, and the locking situation isn't clear] Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index c92df492e898..4cba7fca10d4 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -300,6 +300,24 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); + + /* + * In case the tear down is part of a reconfigure due to HW restart + * request, it is possible that the low level driver requested to stop + * the BA session, so handle it to properly clean tid_tx data. 
+ */ + mutex_lock(&sta->ampdu_mlme.mtx); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + struct tid_ampdu_tx *tid_tx = + rcu_dereference_protected_tid_tx(sta, i); + + if (!tid_tx) + continue; + + if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) + ieee80211_stop_tx_ba_cb(sta, i, tid_tx); + } + mutex_unlock(&sta->ampdu_mlme.mtx); } void ieee80211_ba_session_work(struct work_struct *work) -- cgit From bde59c475e0883e4c4294bcd9b9c7e08ae18c828 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Sep 2017 15:01:42 +0200 Subject: mac80211: fix deadlock in driver-managed RX BA session start When an RX BA session is started by the driver, and it has to tell mac80211 about it, the corresponding bit in tid_rx_manage_offl gets set and the BA session work is scheduled. Upon testing this bit, it will call __ieee80211_start_rx_ba_session(), thus deadlocking as it already holds the ampdu_mlme.mtx, which that acquires again. Fix this by adding ___ieee80211_start_rx_ba_session(), a version of the function that requires the mutex already held. Cc: stable@vger.kernel.org Fixes: 699cb58c8a52 ("mac80211: manage RX BA session offload without SKB queue") Reported-by: Matteo Croce Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 32 +++++++++++++++++++++----------- net/mac80211/ht.c | 6 +++--- net/mac80211/ieee80211_i.h | 4 ++++ 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 2b36eff5d97e..2849a1fc41c5 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -245,10 +245,10 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d ieee80211_tx_skb(sdata, skb); } -void __ieee80211_start_rx_ba_session(struct sta_info *sta, - u8 dialog_token, u16 timeout, - u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq) +void ___ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq) { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; @@ -267,7 +267,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, ht_dbg(sta->sdata, "STA %pM requests BA session on unsupported tid %d\n", sta->sta.addr, tid); - goto end_no_lock; + goto end; } if (!sta->sta.ht_cap.ht_supported) { @@ -275,14 +275,14 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", sta->sta.addr, tid); /* send a response anyway, it's an error case if we get here */ - goto end_no_lock; + goto end; } if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) { ht_dbg(sta->sdata, "Suspend in progress - Denying ADDBA request (%pM tid %d)\n", sta->sta.addr, tid); - goto end_no_lock; + goto end; } /* sanity check for incoming parameters: @@ -296,7 +296,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. 
policy %d, buffer size %d\n", sta->sta.addr, tid, ba_policy, buf_size); - goto end_no_lock; + goto end; } /* determine default buffer size */ if (buf_size == 0) @@ -311,7 +311,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, buf_size, sta->sta.addr); /* examine state machine */ - mutex_lock(&sta->ampdu_mlme.mtx); + lockdep_assert_held(&sta->ampdu_mlme.mtx); if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) { @@ -415,15 +415,25 @@ end: __clear_bit(tid, sta->ampdu_mlme.unexpected_agg); sta->ampdu_mlme.tid_rx_token[tid] = dialog_token; } - mutex_unlock(&sta->ampdu_mlme.mtx); -end_no_lock: if (tx) ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, dialog_token, status, 1, buf_size, timeout); } +void __ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq) +{ + mutex_lock(&sta->ampdu_mlme.mtx); + ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout, + start_seq_num, ba_policy, tid, + buf_size, tx, auto_seq); + mutex_unlock(&sta->ampdu_mlme.mtx); +} + void ieee80211_process_addba_request(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 4cba7fca10d4..d6d0b4201e40 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -351,9 +351,9 @@ void ieee80211_ba_session_work(struct work_struct *work) if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) - __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, - IEEE80211_MAX_AMPDU_BUF, - false, true); + ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, + IEEE80211_MAX_AMPDU_BUF, + false, true); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2197c62a0a6e..9675814f64db 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1760,6 +1760,10 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq); +void ___ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, -- cgit From 39ad1297a2084e0724da73d9eda2ceb9573a5d6c Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 4 Sep 2017 14:21:12 +0800 Subject: sched: Use __qdisc_drop instead of kfree_skb in sch_prio and sch_qfq The commit 520ac30f4551 ("net_sched: drop packets after root qdisc lock is released") made a big change to tc for performance. There are two points left in sch_prio and sch_qfq which were not changed by that commit. Now enhance them with __qdisc_drop. Signed-off-by: Gao Feng Signed-off-by: David S.
Miller --- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index f31b28f788c0..2dd6c68ae91e 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -80,7 +80,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } #endif diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index cd661a7f81e6..6ddfd4991108 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1215,7 +1215,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); -- cgit From be82485fbcbb1cf11b13e7356231c72fdf21241c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Sep 2017 11:47:12 +0800 Subject: netlink: fix a use-after-free issue for nlk groups ChunYu found a netlink use-after-free issue by syzkaller: [28448.842981] BUG: KASAN: use-after-free in __nla_put+0x37/0x40 at addr ffff8807185e2378 [28448.969918] Call Trace: [...] [28449.117207] __nla_put+0x37/0x40 [28449.132027] nla_put+0xf5/0x130 [28449.146261] sk_diag_fill.isra.4.constprop.5+0x5a0/0x750 [netlink_diag] [28449.176608] __netlink_diag_dump+0x25a/0x700 [netlink_diag] [28449.202215] netlink_diag_dump+0x176/0x240 [netlink_diag] [28449.226834] netlink_dump+0x488/0xbb0 [28449.298014] __netlink_dump_start+0x4e8/0x760 [28449.317924] netlink_diag_handler_dump+0x261/0x340 [netlink_diag] [28449.413414] sock_diag_rcv_msg+0x207/0x390 [28449.432409] netlink_rcv_skb+0x149/0x380 [28449.467647] sock_diag_rcv+0x2d/0x40 [28449.484362] netlink_unicast+0x562/0x7b0 [28449.564790] netlink_sendmsg+0xaa8/0xe60 [28449.661510] sock_sendmsg+0xcf/0x110 [28449.865631] __sys_sendmsg+0xf3/0x240 [28450.000964] SyS_sendmsg+0x32/0x50 [28450.016969] do_syscall_64+0x25c/0x6c0 [28450.154439] entry_SYSCALL64_slow_path+0x25/0x25 It was caused by the lack of protection between the freeing of nlk groups in netlink_release and the accessing of nlk groups in sk_diag_dump_groups. A similar issue also exists in netlink_seq_show(). This patch defers the freeing of nlk groups to deferred_put_nlk_sk. Reported-by: ChunYu Wang Acked-by: Florian Westphal Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- net/netlink/af_netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5acee49db90b..94a61e602ee7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -691,6 +691,9 @@ static void deferred_put_nlk_sk(struct rcu_head *head) struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; + kfree(nlk->groups); + nlk->groups = NULL; + if (!refcount_dec_and_test(&sk->sk_refcnt)) return; @@ -769,9 +772,6 @@ static int netlink_release(struct socket *sock) netlink_table_ungrab(); } - kfree(nlk->groups); - nlk->groups = NULL; - local_bh_disable(); sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); local_bh_enable(); -- cgit From f773608026ee1e75e2256e3ad625c222ff6f9305 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Sep 2017 11:53:29 +0800 Subject: netlink: access nlk groups safely in netlink bind and getname Now there is no lock protecting nlk ngroups/groups' accessing in netlink bind and getname. It's safe from nlk groups' setting in netlink_release, but not from netlink_realloc_groups called by netlink_setsockopt. netlink_lock_table is needed in both netlink bind and getname when accessing nlk groups. Acked-by: Florian Westphal Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 94a61e602ee7..327807731b44 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -955,7 +955,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; - int err; + int err = 0; long unsigned int groups = nladdr->nl_groups; bool bound; @@ -983,6 +983,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; } + netlink_lock_table(); if (nlk->netlink_bind && groups) { int group; @@ -993,7 +994,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (!err) continue; netlink_undo_bind(group, groups, sk); - return err; + goto unlock; } } @@ -1006,12 +1007,13 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_autobind(sock); if (err) { netlink_undo_bind(nlk->ngroups, groups, sk); - return err; + goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) - return 0; + goto unlock; + netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + @@ -1022,6 +1024,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_table_ungrab(); return 0; + +unlock: + netlink_unlock_table(); + return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, @@ -1079,7 +1085,9 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else { nladdr->nl_pid = nlk->portid; + netlink_lock_table(); nladdr->nl_groups = nlk->groups ? 
nlk->groups[0] : 0; + netlink_unlock_table(); } return 0; } -- cgit From 8e0deed92406d93ae0365cb8a6134db5721e7aca Mon Sep 17 00:00:00 2001 From: Kleber Sacilotto de Souza Date: Wed, 6 Sep 2017 11:08:06 +0200 Subject: tipc: remove unnecessary call to dev_net() The net device is already stored in the 'net' variable, so no need to call dev_net() again. Signed-off-by: Kleber Sacilotto de Souza Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index ac1d66d7e1fd..47ec121574ce 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -637,7 +637,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - bearer_disable(dev_net(dev), b); + bearer_disable(net, b); break; } return NOTIFY_OK; } -- cgit From 80532384af4ccb6d6cc965fa9232aaa2c456362c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 6 Sep 2017 13:14:19 +0200 Subject: net: sched: fix memleak for chain zero There's a memleak happening for chain 0. The thing is, chain 0 needs to be always present, not created on demand. Therefore tcf_block_get upon creation of block calls the tcf_chain_create function directly. The chain is created with refcnt == 1, which is not correct in this case and causes the memleak. So move the refcnt increment into tcf_chain_get function even for the case when chain needs to be created. Reported-by: Jakub Kicinski Fixes: 5bc1701881e3 ("net: sched: introduce multichain support for filters") Signed-off-by: Jiri Pirko Tested-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/cls_api.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ea6c65fd5fc5..c743f03cfebd 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -182,7 +182,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; - chain->refcnt = 1; + chain->refcnt = 0; return chain; } @@ -217,15 +217,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) { - chain->refcnt++; - return chain; - } + if (chain->index == chain_index) + goto incref; } - if (create) - return tcf_chain_create(block, chain_index); - else - return NULL; + chain = create ? tcf_chain_create(block, chain_index) : NULL; + +incref: + if (chain) + chain->refcnt++; + return chain; } EXPORT_SYMBOL(tcf_chain_get); -- cgit From 5c25f30c93fdc5bf25e62101aeaae7a4f9b421b3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 5 Sep 2017 17:26:33 +0800 Subject: ip6_gre: update mtu properly in ip6gre_err Now when processing ICMPV6_PKT_TOOBIG, ip6gre_err only subtracts the offset of the gre header from the mtu info. The expected mtu of the gre device should also subtract the gre header. Otherwise, the next packets still can't be sent out. Jianlin found this issue when using the topo: client(ip6gre)<---->(nic1)route(nic2)<----->(ip6gre)server and reducing nic2's mtu, then both tcp's and sctp's performance with big-size data became 0. This patch is to fix it by also subtracting the grehdr (tun->tun_hlen) from the mtu info when updating the gre device's mtu in ip6gre_err(). It also needs to subtract ETH_HLEN if the gre dev's type is ARPHRD_ETHER.
Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 67ff2aaf5dcb..b7a72d409334 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset; + mtu = be32_to_cpu(info) - offset - t->tun_hlen; + if (t->dev->type == ARPHRD_ETHER) + mtu -= ETH_HLEN; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; -- cgit From ca2c1418efe9f7fe37aa1f355efdf4eb293673ce Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 6 Sep 2017 14:44:36 +0200 Subject: udp: drop head states only when all skb references are gone After commit 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") we clear the skb head state as soon as the skb carrying them is first processed. Since the same skb can be processed several times when MSG_PEEK is used, we can end up lacking the required head states, and eventually oopsing. Fix this by clearing the skb head state only when processing the last skb reference. Reported-by: Eric Dumazet Fixes: 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/skbuff.c | 9 +++------ net/ipv4/udp.c | 5 ++++- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 68065d7d383f..16982de649b9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -710,14 +710,11 @@ EXPORT_SYMBOL(consume_skb); * consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * - * Works like consume_skb(), but this variant assumes that all the head - * states have been already dropped. + * Alike consume_skb(), but this variant assumes that this is the last + * skb reference and all the head states have been already dropped */ -void consume_stateless_skb(struct sk_buff *skb) +void __consume_stateless_skb(struct sk_buff *skb) { - if (!skb_unref(skb)) - return; - trace_consume_skb(skb); skb_release_data(skb); kfree_skbmem(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db1c9e78c83c..ef29df8648e4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1397,12 +1397,15 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } + if (!skb_unref(skb)) + return; + /* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb(). */ if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); - consume_stateless_skb(skb); + __consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); -- cgit From 126f760ca94dae77425695f9f9238b731de86e32 Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Wed, 6 Sep 2017 18:35:51 +0200 Subject: rds: Fix incorrect statistics counting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In rds_send_xmit() there is logic to batch the sends. However, if another thread has acquired the lock and has incremented the send_gen, it is considered a race and we yield. The code incrementing the s_send_lock_queue_raced statistics counter did not count this event correctly. This commit counts the race condition correctly.
Changes from v1: - Removed check for *someone_on_xmit()* - Fixed incorrect indentation Signed-off-by: Håkon Bugge Reviewed-by: Knut Omang Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 058a40743041..b52cdc8ae428 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -428,14 +428,18 @@ over_batch: * some work and we will skip our goto */ if (ret == 0) { + bool raced; + smp_mb(); + raced = send_gen != READ_ONCE(cp->cp_send_gen); + if ((test_bit(0, &conn->c_map_queued) || - !list_empty(&cp->cp_send_queue)) && - send_gen == READ_ONCE(cp->cp_send_gen)) { - rds_stats_inc(s_send_lock_queue_raced); + !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + } else if (raced) { + rds_stats_inc(s_send_lock_queue_raced); } } out: -- cgit From 1cc4a018669f2fb18c10010f1a7ab3f6fb688cef Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 20 Aug 2017 13:38:07 +0800 Subject: netfilter: ipvs: fix the issue that sctp_conn_schedule drops non-INIT packet Commit 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP packets") changed the code to check the packet type early. It introduced a side effect: if it's not an INIT packet, ports will be set to NULL, and the packet will be dropped later. It caused sctp to be unable to create connections when the ipvs module was loaded and any scheduler was registered on the server. Li Shuang reproduced it by running the cmds on the sctp server: # ipvsadm -A -t 1.1.1.1:80 -s rr # ipvsadm -D -t 1.1.1.1:80 then the server couldn't work any more. This patch is to return 1 when it's not an INIT packet. It means ipvs will accept it without creating a conn for it, just like what it does for tcp. Fixes: 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP packets") Reported-by: Li Shuang Signed-off-by: Xin Long Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index e1efa446b305..81f08198b125 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -24,9 +24,12 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, if (sh) { sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); - if (sch && (sch->type == SCTP_CID_INIT || - sysctl_sloppy_sctp(ipvs))) + if (sch) { + if (!(sysctl_sloppy_sctp(ipvs) || + sch->type == SCTP_CID_INIT)) + return 1; ports = &sh->source; + } } } else { ports = skb_header_pointer( -- cgit From 68913a018f6082f8f90abb8ff9114435ef45dff7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 20 Aug 2017 13:38:08 +0800 Subject: netfilter: ipvs: do not create conn for ABORT packet in sctp_conn_schedule There's no reason for ipvs to create a conn for an ABORT packet even if sysctl_sloppy_sctp is set. This patch is to accept it without creating a conn, just as ipvs does for tcp's RST packet.
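As the messages of these two patches note, returning 1 from sctp_conn_schedule() makes IPVS accept the packet without creating a conn for it. A condensed view of the chunk-type check after both patches, with explanatory comments added (an illustration only, not part of the patch itself):

	if (sch->type == SCTP_CID_ABORT ||	/* never create a conn for ABORT */
	    !(sysctl_sloppy_sctp(ipvs) ||
	      sch->type == SCTP_CID_INIT))	/* otherwise only INIT, or sloppy mode */
		return 1;			/* accept the packet without a conn */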
Signed-off-by: Xin Long Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 81f08198b125..57c8ee66491e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -25,7 +25,8 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); if (sch) { - if (!(sysctl_sloppy_sctp(ipvs) || + if (sch->type == SCTP_CID_ABORT || + !(sysctl_sloppy_sctp(ipvs) || sch->type == SCTP_CID_INIT)) return 1; ports = &sh->source; -- cgit From ba1cc08d9488c94cb8d94f545305688b72a2a300 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 8 Sep 2017 10:26:19 +0200 Subject: ipv6: fix memory leak with multiple tables during netns destruction fib6_net_exit only frees the main and local tables. If another table was created with fib6_alloc_table, we leak it when the netns is destroyed. Fix this in the same way ip_fib_net_exit cleans up tables, by walking through the whole hashtable of fib6_table's. We can get rid of the special cases for local and main, since they're also part of the hashtable. Reproducer: ip netns add x ip -net x -6 rule add from 6003:1::/64 table 100 ip netns del x Reported-by: Jianlin Shi Fixes: 58f09b78b730 ("[NETNS][IPV6] ip6_fib - make it per network namespace") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a3b5c163325f..8280172c806c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -191,6 +191,12 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) } EXPORT_SYMBOL_GPL(rt6_free_pcpu); +static void fib6_free_table(struct fib6_table *table) +{ + inetpeer_invalidate_tree(&table->tb6_peers); + kfree(table); +} + static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; @@ -2022,15 +2028,22 @@ out_timer: static void fib6_net_exit(struct net *net) { + unsigned int i; + rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); - kfree(net->ipv6.fib6_local_tbl); -#endif - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); - kfree(net->ipv6.fib6_main_tbl); + for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; + struct hlist_node *tmp; + struct fib6_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { + hlist_del(&tb->tb6_hlist); + fib6_free_table(tb); + } + } + kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); -- cgit From 75c2631468e8af554057246b2413e738dd96af3d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 31 Aug 2017 13:45:24 +0200 Subject: netfilter: nf_nat: don't bug when mapping already exists It seems preferable to limp along if we have a conflicting mapping; it's certainly better than a BUG().
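A detail worth noting about this replacement (general kernel API behavior, not something introduced by the patch): unlike BUG_ON(), WARN_ON(cond) evaluates to cond, which is what lets the warning gate a recovery path instead of halting the kernel:

	/* BUG_ON(cond) would stop the machine; WARN_ON(cond) logs a backtrace
	 * and returns the condition, so we can limp along by dropping the packet:
	 */
	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;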
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 40573aa6c133..dc3519cc7209 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -416,7 +416,9 @@ nf_nat_setup_info(struct nf_conn *ct, WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); - BUG_ON(nf_nat_initialized(ct, maniptype)); + + if (WARN_ON(nf_nat_initialized(ct, maniptype))) + return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior -- cgit From a5d7a714569199f909cd60ff7074107bf15c7db4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 1 Sep 2017 22:41:03 +0200 Subject: netfilter: xtables: add scheduling opportunity in get_counters There are reports about spurious softlockups during iptables-restore; a backtrace I saw points at get_counters -- it uses a sequence lock and also has an unbounded restart loop. Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 1 + net/ipv4/netfilter/ip_tables.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e04457198f93..9e2770fd00be 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -629,6 +629,7 @@ static void get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; + cond_resched(); } } } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 576cba2b57e9..39286e543ee6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -776,6 +776,7 @@ get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; /* macro does multi eval of i */ + cond_resched(); } } } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 54b1e75eded1..01bd3ee5ebc6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -795,6 +795,7 @@ get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; + cond_resched(); } } } -- cgit From e1bf1687740ce1a3598a1c5e452b852ff2190682 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:39:51 +0200 Subject: netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1. It was not a good idea. The custom hash table was a much better fit for this purpose. A fast lookup is not essential, in fact for most cases there is no lookup at all because the original tuple is not taken and can be used as-is. What needs to be fast is insertion and deletion. rhlist removal however requires a rhlist walk. We can have thousands of entries in such a list if source port/addresses are reused for multiple flows; if this happens, removal requests are so expensive that deletions of a few thousand flows can take several seconds(!). The advantages that we got from rhashtable are: 1) table auto-sizing 2) multiple locks 1) would be nice to have, but it is not essential as we have at most one lookup per new flow, so even a million flows in the bysource table are not a problem compared to current deletion cost. 2) is easy to add to custom hash table.
I tried to add hlist_node to rhlist to speed up rhltable_remove but this isn't doable without changing semantics. rhltable_remove_fast will check that the to-be-deleted object is part of the table and that requires a list walk that we want to avoid. Furthermore, using hlist_node increases size of struct rhlist_head, which in turn increases nf_conn size. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821 Reported-by: Ivan Babrou Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 130 ++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index dc3519cc7209..f090419f5f97 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,19 +30,17 @@ #include #include +static DEFINE_SPINLOCK(nf_nat_lock); + static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; -struct nf_nat_conn_key { - const struct net *net; - const struct nf_conntrack_tuple *tuple; - const struct nf_conntrack_zone *zone; -}; - -static struct rhltable nf_nat_bysource_table; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ -static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) +/* We keep an extra hash for each conntrack, for fast searching. */ +static unsigned int +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_tuple *t; - const struct nf_conn *ct = data; + unsigned int hash; + + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); - t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */ + hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - seed ^= net_hash_mix(nf_ct_net(ct)); - return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), - t->dst.protonum ^ seed); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? 
(not by us) */ @@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); } -static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct nf_nat_conn_key *key = arg->key; - const struct nf_conn *ct = obj; - - if (!same_src(ct, key->tuple) || - !net_eq(nf_ct_net(ct), key->net) || - !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL)) - return 1; - - return 0; -} - -static struct rhashtable_params nf_nat_bysource_params = { - .head_offset = offsetof(struct nf_conn, nat_bysource), - .obj_hashfn = nf_nat_bysource_hash, - .obj_cmpfn = nf_nat_bysource_cmp, - .nelem_hint = 256, - .min_size = 1024, -}; - /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, @@ -216,26 +194,22 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { + unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; - struct nf_nat_conn_key key = { - .net = net, - .tuple = tuple, - .zone = zone - }; - struct rhlist_head *hl, *h; - - hl = rhltable_lookup(&nf_nat_bysource_table, &key, - nf_nat_bysource_params); - rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(l3proto, l4proto, result, range)) - return 1; + hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { + if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { + /* Copy source part from reply tuple. */ + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(l3proto, l4proto, result, range)) + return 1; + } } - return 0; } @@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ @@ -449,19 +424,14 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) { - struct nf_nat_conn_key key = { - .net = nf_ct_net(ct), - .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - .zone = nf_ct_zone(ct), - }; - int err; - - err = rhltable_insert_key(&nf_nat_bysource_table, - &key, - &ct->nat_bysource, - nf_nat_bysource_params); - if (err) - return NF_DROP; + unsigned int srchash; + + srchash = hash_by_src(net, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + hlist_add_head_rcu(&ct->nat_bysource, + &nf_nat_bysource[srchash]); + spin_unlock_bh(&nf_nat_lock); } /* It's done. */ @@ -570,8 +540,9 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -699,9 +670,11 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); /* No one using conntrack by the time this called. 
*/ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - if (ct->status & IPS_SRC_NAT_DONE) - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + if (ct->status & IPS_SRC_NAT_DONE) { + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); + } } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -825,13 +798,16 @@ static int __init nf_nat_init(void) { int ret; - ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); - if (ret) - return ret; + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - rhltable_destroy(&nf_nat_bysource_table); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -865,8 +841,8 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - - rhltable_destroy(&nf_nat_bysource_table); + synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); -- cgit From 8073e960a03bf7b5d5ebfc5ff18ac475e1688f46 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:39:52 +0200 Subject: netfilter: nat: use keyed locks no need to serialize on a single lock, we can partition the table and add/delete in parallel to different slots. This restores one of the advantages that got lost with the rhlist revert. Cc: Ivan Babrou Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f090419f5f97..f393a7086025 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,7 +30,7 @@ #include #include -static DEFINE_SPINLOCK(nf_nat_lock); +static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] @@ -425,13 +425,15 @@ nf_nat_setup_info(struct nf_conn *ct, if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; + spinlock_t *lock; srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_lock); + lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); - spin_unlock_bh(&nf_nat_lock); + spin_unlock_bh(lock); } /* It's done. */ @@ -525,6 +527,16 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 1 : 0; } +static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) +{ + unsigned int h; + + h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); +} + static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) @@ -540,9 +552,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. 
*/ clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); + __nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -670,11 +680,8 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); /* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - if (ct->status & IPS_SRC_NAT_DONE) { - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); - } + if (ct->status & IPS_SRC_NAT_DONE) + __nf_nat_cleanup_conntrack(ct); } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -796,10 +803,12 @@ static struct nf_ct_helper_expectfn follow_master_nat = { static int __init nf_nat_init(void) { - int ret; + int ret, i; /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; + if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) + nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) @@ -812,6 +821,9 @@ static int __init nf_nat_init(void) return ret; } + for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + spin_lock_init(&nf_nat_locks[i]); + nf_ct_helper_expectfn_register(&follow_master_nat); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); -- cgit From 74585d4f84379528347630253c42518c5002d2f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:47:57 +0200 Subject: netfilter: core: remove erroneous warn_on kernel test robot reported: WARNING: CPU: 0 PID: 1244 at net/netfilter/core.c:218 __nf_hook_entries_try_shrink+0x49/0xcd [..] After allowing batching in nf_unregister_net_hooks it's possible that an earlier call to __nf_hook_entries_try_shrink already compacted the list. If this happens we don't need to do anything. Fixes: d3ad2c17b4047 ("netfilter: core: batch nf_unregister_net_hooks synchronize_net calls") Reported-by: kernel test robot Signed-off-by: Florian Westphal Acked-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 04fe25abc5f6..52cd2901a097 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -215,7 +215,7 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp) if (skip == hook_entries) goto out_assign; - if (WARN_ON(skip == 0)) + if (skip == 0) return NULL; hook_entries -= skip; -- cgit From 05d0eae7c1cf6bf7b19c02b3a97ff457b3317323 Mon Sep 17 00:00:00 2001 From: Zhizhou Tian Date: Fri, 8 Sep 2017 11:00:16 +0800 Subject: netfilter: xt_hashlimit: alloc hashtable with right size struct xt_hashlimit_htable used hlist_head, but memory allocation is done through sizeof(struct list_head).
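The practical effect of the old sizeof() here is over-allocation rather than corruption: struct list_head holds two pointers (next, prev) while struct hlist_head holds only one (first), so the bucket array reserved roughly twice the memory it actually used. A standalone sketch of the size difference, using mock layouts that mirror the kernel structs (for illustration only):

	#include <stdio.h>

	struct list_head  { void *next, *prev; };	/* doubly-linked list head: 2 pointers */
	struct hlist_head { void *first; };		/* hash bucket head: 1 pointer */

	int main(void)
	{
		/* On a 64-bit build this prints "16 8": allocating the bucket
		 * array with sizeof(struct list_head) made it twice as large as
		 * the array of hlist_head buckets the code actually indexes.
		 */
		printf("%zu %zu\n", sizeof(struct list_head),
		       sizeof(struct hlist_head));
		return 0;
	}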
Signed-off-by: Zhizhou Tian Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 10d48234f5f4..962ea4a63d9f 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -279,7 +279,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, size = cfg->size; } else { size = (totalram_pages << PAGE_SHIFT) / 16384 / - sizeof(struct list_head); + sizeof(struct hlist_head); if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) @@ -287,7 +287,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, } /* FIXME: don't use vmalloc() here or anywhere else -HW */ hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) + - sizeof(struct list_head) * size); + sizeof(struct hlist_head) * size); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; -- cgit From 90c4ae4e2c1da9f1eaf846136861af43d4c1ff34 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Fri, 8 Sep 2017 01:38:58 -0400 Subject: netfilter: xt_hashlimit: fix build error caused by 64bit division 64bit division causes build/link errors on 32bit architectures. It prints out error messages like: ERROR: "__aeabi_uldivmod" [net/netfilter/xt_hashlimit.ko] undefined! The value of avg passed through by userspace in BYTE mode cannot exceed U32_MAX, which means the 64bit division in user2rate_bytes() is unnecessary. To fix this I have changed the type of param 'user' to u32. Since anything greater than U32_MAX is an invalid input, we error out in hashlimit_mt_check_common() when this is the case. Changes in v2: making the return type u32 would cause an overflow for small values of 'user' (for example 2 or 3). To avoid this I bumped 'r' back up to u64, as well as the return type. This is OK since the variable that stores the result is u64. We still avoid 64bit division here since 'user' is u32. Fixes: bea74641e378 ("netfilter: xt_hashlimit: add rate match mode") Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 962ea4a63d9f..5da8746f7b88 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -35,6 +35,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); @@ -527,12 +528,12 @@ static u64 user2rate(u64 user) } } -static u64 user2rate_bytes(u64 user) +static u64 user2rate_bytes(u32 user) { u64 r; - r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL; - r = (r - 1) << 4; + r = user ? U32_MAX / user : U32_MAX; + r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT; return r; } @@ -588,7 +589,8 @@ static void rateinfo_init(struct dsthash_ent *dh, dh->rateinfo.prev_window = 0; dh->rateinfo.current_rate = 0; if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { - dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg); + dh->rateinfo.rate = + user2rate_bytes((u32)hinfo->cfg.avg); if (hinfo->cfg.burst) dh->rateinfo.burst = hinfo->cfg.burst * dh->rateinfo.rate; @@ -870,7 +872,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, /* Check for overflow. */
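/* A stand-alone sketch of the divide trick above (illustrative, not the
 * kernel source): keeping the divisor a u32 and dividing U32_MAX by it
 * lets the compiler emit a native 32-bit divide instead of calling
 * libgcc's u64/u64 helper (__aeabi_uldivmod on 32-bit ARM), while the
 * result can still be widened and shifted as a u64.
 */
#include <stdint.h>
#include <stdio.h>

#define BYTE_SHIFT 4    /* stand-in for XT_HASHLIMIT_BYTE_SHIFT */

static uint64_t user2rate_bytes(uint32_t user)  /* u32 param: no u64 divide */
{
        uint64_t r;

        r = user ? UINT32_MAX / user : UINT32_MAX;  /* 32-bit division */
        return (r - 1) << BYTE_SHIFT;               /* 64-bit shift is fine */
}

int main(void)
{
        uint64_t avg = 1000;  /* checkentry rejects values above U32_MAX, */
                              /* so truncating to u32 below is safe       */
        printf("rate = %llu\n",
               (unsigned long long)user2rate_bytes((uint32_t)avg));
        return 0;
}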
if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) { - if (cfg->avg == 0) { + if (cfg->avg == 0 || cfg->avg > U32_MAX) { pr_info("hashlimit invalid rate\n"); return -ERANGE; } -- cgit From 7906b00f5cd1cd484fced7fcda892176e3202c8a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 8 Sep 2017 11:35:21 -0300 Subject: sctp: fix missing wake ups in some situations Commit fb586f25300f ("sctp: delay calls to sk_data_ready() as much as possible") minimized the number of wake ups that are triggered in case the association receives a packet with multiple data chunks on it and/or when io_events are enabled. Then commit 0970f5b36659 ("sctp: signal sk_data_ready earlier on data chunks reception") moved the wake up to happen as soon as possible. It thus relies on the state machine running later to clear the flag recording that the event was already generated. The issue is that there are 2 call paths that call sctp_ulpq_tail_event() outside of the state machine, causing the flag to linger and possibly omitting a needed wake up in the sequence. One of the call paths is when enabling SCTP_SENDER_DRY_EVENTS via setsockopt(SCTP_EVENTS), as noticed by Harald Welte. The other is when partial reliability triggers removal of chunks from the send queue when the application calls sendmsg(). This commit fixes it by not setting the flag in case the socket is not owned by the user, as it won't be cleared later. This works for user-initiated calls and also for rx path processing. Fixes: fb586f25300f ("sctp: delay calls to sk_data_ready() as much as possible") Reported-by: Harald Welte Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ulpqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 0225d62a869f..a71be33f3afe 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sctp_ulpq_clear_pd(ulpq); if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { - sp->data_ready_signalled = 1; + if (!sock_owned_by_user(sk)) + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; -- cgit From 1f3b359f1004bd34b7b0bad70b93e3c7af92a37b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2017 12:44:47 -0700 Subject: tcp: fix a request socket leak While the cited commit fixed a possible deadlock, it added a leak of the request socket, since reqsk_put() must be called if the BPF filter decided the ACK packet must be dropped. Fixes: d624d276d1dd ("tcp: fix possible deadlock in TCP stack vs BPF filter") Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller
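The pattern behind this fix, reduced to a user-space sketch (the types and helpers below are illustrative stand-ins, not the kernel API): once a reference has been taken, every exit path, including the filter-drop path, must pair it with a put; folding the filter check into the lookup leaves a single failure branch that always releases the reference.

#include <stdio.h>

struct req { int refcnt; };

static void req_get(struct req *r) { r->refcnt++; }
static void req_put(struct req *r)
{
        if (--r->refcnt == 0)
                printf("request freed\n");
}

/* stand-ins for tcp_filter() and tcp_check_req() */
static int filter_drops(int pkt) { return pkt < 0; }
static struct req *check_req(struct req *r, int pkt) { (void)pkt; return r; }

static void process(struct req *r, int pkt)
{
        struct req *nsk = NULL;

        req_get(r);
        if (!filter_drops(pkt))          /* the buggy version jumped straight */
                nsk = check_req(r, pkt); /* to discard, skipping the put      */
        if (!nsk) {
                req_put(r);              /* single failure path: always put */
                printf("packet %d dropped, reference released\n", pkt);
                return;
        }
        printf("packet %d accepted\n", pkt);
        req_put(r);
}

int main(void)
{
        struct req r = { .refcnt = 1 };

        process(&r, -1);   /* filtered: must not leak */
        process(&r, 7);    /* accepted */
        req_put(&r);       /* drop the initial reference */
        return 0;
}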
--- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv6/tcp_ipv6.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a63486afa7a7..d9416b5162bc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1669,9 +1669,9 @@ process: */ sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 38f76d8b231e..64d94afa427f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1460,9 +1460,9 @@ process: } sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; -- cgit From 32a805baf0fb70b6dbedefcd7249ac7f580f9e3b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2017 15:48:47 -0700 Subject: ipv6: fix typo in fib6_net_exit() IPv6 FIB should use FIB6_TABLE_HASHSZ, not FIB_TABLE_HASHSZ. Fixes: ba1cc08d9488 ("ipv6: fix memory leak with multiple tables during netns destruction") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8280172c806c..e5308d7cbd75 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2033,7 +2033,7 @@ static void fib6_net_exit(struct net *net) rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); - for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; -- cgit From 0f693f1995cf002432b70f43ce73f79bf8d0b6c9 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Thu, 7 Sep 2017 14:08:34 +0800 Subject: ip_tunnel: fix setting ttl and tos value in collect_md mode The ttl and tos variables are declared and assigned, but are not used in the iptunnel_xmit() call. Fixes: cfc7381b3002 ("ip_tunnel: add collect_md mode to IPIP tunnel") Cc: Alexei Starovoitov Signed-off-by: Haishuang Yan Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 129d1a3616f8..e1856bfa753d 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) ip_rt_put(rt); goto tx_dropped; } - iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, - key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, + df, !net_eq(tunnel->net, dev_net(dev))); return; tx_error: dev->stats.tx_errors++; -- cgit From 18e1173d5f3615c8c2680853ba87830ad8a0febc Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Thu, 7 Sep 2017 14:08:35 +0800 Subject: ip6_tunnel: fix setting hop_limit value for ipv6 tunnel Similar to the vxlan/geneve tunnels, if hop_limit is zero, it should fall back to ip6_dst_hoplimit(). Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller
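Both tunnel fixes follow the same rule, shown here as a user-space sketch (the struct and function names are illustrative, not the kernel's): derive the final tos/ttl values once, applying a "zero means unset, use the fallback" rule, and make sure the transmit call uses the derived values rather than the raw key fields.

#include <stdint.h>
#include <stdio.h>

struct md_key { uint8_t tos, ttl; };   /* collect_md-style metadata */

static void xmit(uint8_t tos, uint8_t ttl)
{
        printf("xmit: tos=%u ttl=%u\n", tos, ttl);
}

static void md_tunnel_xmit(const struct md_key *key, uint8_t dflt_hoplimit)
{
        uint8_t tos = key->tos;
        uint8_t ttl = key->ttl;

        if (!ttl)                    /* zero means unset: fall back, as */
                ttl = dflt_hoplimit; /* in the hop_limit fix below      */

        /* the collect_md bug passed key->tos/key->ttl here instead of
         * the derived locals, so any fixup never took effect */
        xmit(tos, ttl);
}

int main(void)
{
        struct md_key key = { .tos = 0, .ttl = 0 };

        md_tunnel_xmit(&key, 64);    /* expect ttl 64, not 0 */
        return 0;
}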
--- net/ipv6/ip6_tunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3a0ba2ae4b0f..10a693a19323 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1184,6 +1184,7 @@ route_lookup: init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } + hop_limit = hop_limit ? : ip6_dst_hoplimit(dst); /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. -- cgit From 109980b894e9dae66c37c3d804a415aa68b19c7e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Sep 2017 00:14:51 +0200 Subject: bpf: don't select potentially stale ri->map from buggy xdp progs We can potentially run into a couple of issues with the XDP bpf_redirect_map() helper. The ri->map in the per CPU storage can become stale in several ways, mostly due to misuse, where we can then trigger a use after free on the map: i) prog A is calling bpf_redirect_map(), returning XDP_REDIRECT and running on a driver not supporting XDP_REDIRECT yet. The ri->map on that CPU becomes stale when the XDP program is unloaded on the driver, and a prog B is loaded on a different driver which supports the XDP_REDIRECT return code. prog B would have to omit calling bpf_redirect_map() and just return XDP_REDIRECT, which would then access the freed map in xdp_do_redirect() since it was not cleared for that CPU. ii) prog A is calling bpf_redirect_map(), returning a code other than XDP_REDIRECT. prog A is then detached, which triggers release of the map. prog B is attached which, similarly as in i), would just return XDP_REDIRECT without having called bpf_redirect_map() and thus be accessing the freed map in xdp_do_redirect() since it was not cleared for that CPU. iii) prog A is attached to generic XDP, calling the bpf_redirect_map() helper and returning XDP_REDIRECT. xdp_do_generic_redirect() is currently not handling ri->map (will be fixed by Jesper), so it's not being reset. Later, loading e.g. a native prog B which would, say, call bpf_xdp_redirect() and then return XDP_REDIRECT would find in xdp_do_redirect() that a map was set and use it, causing a use after free on map access. The fix thus needs to avoid accessing stale ri->map pointers; a naive way would be to call a BPF function from drivers that just resets it to NULL for all XDP return codes but XDP_REDIRECT, including XDP_REDIRECT for drivers not supporting it yet (and let ri->map be handled in xdp_do_generic_redirect()). There is a less intrusive way w/o letting drivers call a reset for each BPF run. The verifier knows we're calling into the bpf_xdp_redirect_map() helper, so it can do a small insn rewrite transparent to the prog itself in the sense that it fills R4 with a pointer to the prog's own bpf_prog. We have that pointer at verification time anyway, and R4 is allowed to be used since, as per calling convention, we scratch R0 to R5 anyway, so they become inaccessible and the program cannot read them prior to a write. Then, the helper would store the prog pointer in the current CPU's struct redirect_info. Later in xdp_do_*_redirect() we check whether the redirect_info's prog pointer is the same as the passed xdp_prog pointer, and if that's the case then all good, since the prog holds a ref on the map anyway, so it is always valid at that point in time and must have a reference count of at least 1. In the unlikely case that they are not equal, it means we got a stale pointer, so we clear and bail out right there.
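The core of that check, stashing the producer's identity next to the cached pointer and validating it on consume, can be sketched in plain user-space C (all names are illustrative; the kernel's versions are per-CPU and RCU-protected):

#include <stdio.h>

struct redirect_info {
        void *map;              /* cached by the redirect helper */
        const void *map_owner;  /* which program cached it */
};

static struct redirect_info ri; /* per-CPU in the kernel; one global here */

static void redirect_map(void *map, const void *prog)
{
        ri.map = map;
        ri.map_owner = prog;    /* record the owner alongside the pointer */
}

static int do_redirect(const void *prog)
{
        void *map = ri.map;
        const void *owner = ri.map_owner;

        ri.map = NULL;          /* consume-once semantics */
        ri.map_owner = NULL;

        if (owner != prog) {    /* stale entry left by another program */
                fprintf(stderr, "stale map, bailing out\n");
                return -1;
        }
        printf("redirect via map %p\n", map);
        return 0;
}

int main(void)
{
        int prog_a, prog_b, some_map;

        redirect_map(&some_map, &prog_a);
        do_redirect(&prog_a);   /* same owner: proceeds */
        do_redirect(&prog_b);   /* nothing cached for this prog: bails */
        return 0;
}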
Also reset the map and the owning prog in bpf_xdp_redirect(), so that bpf_xdp_redirect_map() and bpf_xdp_redirect() won't get mixed up; only the last call should take precedence. A tc bpf_redirect() doesn't use the map anywhere yet, so there is no need to clear it there since it is never accessed in that layer. Note that in case the prog is released, and thus the map as well, we're still under the RCU read critical section at that time and have preemption disabled as well. Once we commit with the __dev_map_insert_ctx() from xdp_do_redirect_map() and set the map to ri->map_to_flush, we still wait for an xdp_do_flush_map() to finish at devmap dismantle time once the flush_needed bit is set, so that is fine. Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Reported-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 5912c738a7b2..0848df2cd9bf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1794,6 +1794,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; + const struct bpf_prog *map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -1807,7 +1808,6 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; - ri->map = NULL; return TC_ACT_REDIRECT; } @@ -2504,6 +2504,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + const struct bpf_prog *map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; struct net_device *fwd; @@ -2511,6 +2512,15 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; ri->map = NULL; + ri->map_owner = NULL; + + /* This is really only caused by a deliberately crappy + * BPF program, normally we would never hit that case, + * so no need to inform someone via tracepoints either, + * just bail out. + */ + if (unlikely(map_owner != xdp_prog)) + return -EINVAL; fwd = __dev_map_lookup_elem(map, index); if (!fwd) { @@ -2607,6 +2617,8 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; + ri->map = NULL; + ri->map_owner = NULL; return XDP_REDIRECT; } @@ -2619,7 +2631,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, + const struct bpf_prog *, map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -2629,10 +2642,14 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags ri->ifindex = ifindex; ri->flags = flags; ri->map = map; + ri->map_owner = map_owner; return XDP_REDIRECT; } +/* Note, arg4 is hidden from users and populated by the verifier + * with the right pointer.
+ */ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, -- cgit From bbbe211c295ffb309247adb7b871dda60d92d2d5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 8 Sep 2017 14:00:30 -0700 Subject: net: rcu lock and preempt disable missing around generic xdp do_xdp_generic must be called inside rcu critical section with preempt disabled to ensure BPF programs are valid and per-cpu variables used for redirect operations are consistent. This patch ensures this is true and fixes the splat below. The netif_receive_skb_internal() code path is now broken into two rcu critical sections. I decided it was better to limit the preempt_enable/disable block to just the xdp static key portion and the fallout is more rcu_read_lock/unlock calls. Seems like the best option to me. [ 607.596901] ============================= [ 607.596906] WARNING: suspicious RCU usage [ 607.596912] 4.13.0-rc4+ #570 Not tainted [ 607.596917] ----------------------------- [ 607.596923] net/core/dev.c:3948 suspicious rcu_dereference_check() usage! [ 607.596927] [ 607.596927] other info that might help us debug this: [ 607.596927] [ 607.596933] [ 607.596933] rcu_scheduler_active = 2, debug_locks = 1 [ 607.596938] 2 locks held by pool/14624: [ 607.596943] #0: (rcu_read_lock_bh){......}, at: [] ip_finish_output2+0x14d/0x890 [ 607.596973] #1: (rcu_read_lock_bh){......}, at: [] __dev_queue_xmit+0x14a/0xfd0 [ 607.597000] [ 607.597000] stack backtrace: [ 607.597006] CPU: 5 PID: 14624 Comm: pool Not tainted 4.13.0-rc4+ #570 [ 607.597011] Hardware name: Dell Inc. Precision Tower 5810/0HHV7N, BIOS A17 03/01/2017 [ 607.597016] Call Trace: [ 607.597027] dump_stack+0x67/0x92 [ 607.597040] lockdep_rcu_suspicious+0xdd/0x110 [ 607.597054] do_xdp_generic+0x313/0xa50 [ 607.597068] ? time_hardirqs_on+0x5b/0x150 [ 607.597076] ? mark_held_locks+0x6b/0xc0 [ 607.597088] ? netdev_pick_tx+0x150/0x150 [ 607.597117] netif_rx_internal+0x205/0x3f0 [ 607.597127] ? do_xdp_generic+0xa50/0xa50 [ 607.597144] ? lock_downgrade+0x2b0/0x2b0 [ 607.597158] ? __lock_is_held+0x93/0x100 [ 607.597187] netif_rx+0x119/0x190 [ 607.597202] loopback_xmit+0xfd/0x1b0 [ 607.597214] dev_hard_start_xmit+0x127/0x4e0 Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Fixes: b5cdae3291f7 ("net: Generic XDP") Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/core/dev.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 6f845e4fec17..fb766d906148 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3981,8 +3981,13 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; + + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4500,18 +4505,20 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - rcu_read_lock(); - if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; - if (ret != XDP_PASS) { - rcu_read_unlock(); + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); + + if (ret != XDP_PASS) return NET_RX_DROP; - } } + rcu_read_lock(); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; -- cgit From 9beb8bedb05c5f3a353dba62b8fa7cbbb9ec685e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 Sep 2017 01:40:35 +0200 Subject: bpf: make error reporting in bpf_warn_invalid_xdp_action more clear Differentiate between an illegal XDP action code and one that is merely unsupported by the driver, to provide better feedback when we throw a one-time warning here. The reason is that with 814abfabef3c ("xdp: add bpf_redirect helper function"), not all drivers support the new XDP return code yet, and thus they will fall into their 'default' case when checking for return codes after program return; this then triggers a bpf_warn_invalid_xdp_action() stating that the return code is illegal, but from the XDP perspective it's not. I decided not to place something like an XDP_ACT_MAX define into the uapi: i) we don't have this for the other program types either, ii) future action codes could have further encoding there, which would render such a define unsuitable and we wouldn't be able to rip it out again, and iii) we rarely add new action codes. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 0848df2cd9bf..3a50a9b021e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3609,7 +3609,11 @@ static bool xdp_is_valid_access(int off, int size, void bpf_warn_invalid_xdp_action(u32 act) { - WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); + const u32 act_max = XDP_REDIRECT; + + WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n", + act > act_max ? "Illegal" : "Driver unsupported", + act); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -- cgit
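The distinction the warning draws can be sketched in stand-alone C (the enum mirrors the XDP action codes of this era; the names are copied for illustration rather than taken from a uapi header):

#include <stdio.h>

enum xdp_action {
        XDP_ABORTED = 0,
        XDP_DROP,
        XDP_PASS,
        XDP_TX,
        XDP_REDIRECT,   /* highest action code known at this point */
};

static void warn_invalid_xdp_action(unsigned int act)
{
        const unsigned int act_max = XDP_REDIRECT;

        /* values beyond act_max are outright illegal; known values that
         * still land here were simply not handled by the driver */
        fprintf(stderr, "%s XDP return value %u, expect packet loss!\n",
                act > act_max ? "Illegal" : "Driver unsupported", act);
}

int main(void)
{
        warn_invalid_xdp_action(XDP_REDIRECT); /* driver lacks support */
        warn_invalid_xdp_action(42);           /* outright illegal */
        return 0;
}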