From 7a7d3e4c031b969c3b570787fc9fdb605abda03e Mon Sep 17 00:00:00 2001 From: Simon Dinkin Date: Thu, 31 Aug 2017 13:09:36 +0300 Subject: mac80211: fix incorrect assignment of reassoc value this fix minor issue in the log message. in ieee80211_rx_mgmt_assoc_resp function, when assigning the reassoc value from the mgmt frame control: ieee80211_is_reassoc_resp function need to be used, instead of ieee80211_is_reassoc_req function. Signed-off-by: Simon Dinkin Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b588e593b0ec..3b8e2709d8de 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3155,7 +3155,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (len < 24 + 6) return; - reassoc = ieee80211_is_reassoc_req(mgmt->frame_control); + reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); aid = le16_to_cpu(mgmt->u.assoc_resp.aid); -- cgit From 53168215909281a09d3afc6fb51a9d4f81f74d39 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Jun 2017 12:20:30 +0200 Subject: mac80211: fix VLAN handling with TXQs With TXQs, the AP_VLAN interfaces are resolved to their owner AP interface when enqueuing the frame, which makes sense since the frame really goes out on that as far as the driver is concerned. However, this introduces a problem: frames to be encrypted with a VLAN-specific GTK will now be encrypted with the AP GTK, since the information about which virtual interface to use to select the key is taken from the TXQ. Fix this by preserving info->control.vif and using that in the dequeue function. This now requires doing the driver-mapping in the dequeue as well. Since there's no way to filter the frames that are sitting on a TXQ, drop all frames, which may affect other interfaces, when an AP_VLAN is removed. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 ++------------- net/mac80211/iface.c | 17 +++++++++++++++-- net/mac80211/tx.c | 36 +++++++++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f8149ca192b4..885690fa39c8 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -919,21 +919,10 @@ struct ieee80211_tx_info { unsigned long jiffies; }; /* NB: vif can be NULL for injected frames */ - union { - /* NB: vif can be NULL for injected frames */ - struct ieee80211_vif *vif; - - /* When packets are enqueued on txq it's easy - * to re-construct the vif pointer. There's no - * more space in tx_info so it can be used to - * store the necessary enqueue time for packet - * sojourn time computation. - */ - codel_time_t enqueue_time; - }; + struct ieee80211_vif *vif; struct ieee80211_key_conf *hw_key; u32 flags; - /* 4 bytes free */ + codel_time_t enqueue_time; } control; struct { u64 cookie; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9228ac73c429..44399322f356 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -792,6 +792,7 @@ static int ieee80211_open(struct net_device *dev) static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { + struct ieee80211_sub_if_data *txq_sdata = sdata; struct ieee80211_local *local = sdata->local; struct fq *fq = &local->fq; unsigned long flags; @@ -937,6 +938,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: + txq_sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); @@ -1007,8 +1011,17 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - if (sdata->vif.txq) { - struct txq_info *txqi = to_txq_info(sdata->vif.txq); + if (txq_sdata->vif.txq) { + struct txq_info *txqi = to_txq_info(txq_sdata->vif.txq); + + /* + * FIXME FIXME + * + * We really shouldn't purge the *entire* txqi since that + * contains frames for the other AP_VLANs (and possibly + * the AP itself) as well, but there's no API in FQ now + * to be able to filter. + */ spin_lock_bh(&fq->lock); ieee80211_txq_purge(local, txqi); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8858f4f185e9..94826680cf2b 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1276,11 +1276,6 @@ static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); } -static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi) -{ - IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif; -} - static u32 codel_skb_len_func(const struct sk_buff *skb) { return skb->len; @@ -3414,6 +3409,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; + struct ieee80211_vif *vif; spin_lock_bh(&fq->lock); @@ -3430,8 +3426,6 @@ begin: if (!skb) goto out; - ieee80211_set_skb_vif(skb, txqi); - hdr = (struct ieee80211_hdr *)skb->data; info = IEEE80211_SKB_CB(skb); @@ -3488,6 +3482,34 @@ begin: } } + switch (tx.sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + if (tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + vif = &tx.sdata->vif; + break; + } + tx.sdata = rcu_dereference(local->monitor_sdata); + if (tx.sdata) { + vif = &tx.sdata->vif; + info->hw_queue = + vif->hw_queue[skb_get_queue_mapping(skb)]; + } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } else { + vif = NULL; + } + break; + case NL80211_IFTYPE_AP_VLAN: + tx.sdata = container_of(tx.sdata->bss, + struct ieee80211_sub_if_data, u.ap); + /* fall through */ + default: + vif = &tx.sdata->vif; + break; + } + + IEEE80211_SKB_CB(skb)->control.vif = vif; out: spin_unlock_bh(&fq->lock); -- cgit From d81b0fd0e7b76b05873299ae2dd310127b13613b Mon Sep 17 00:00:00 2001 From: Sharon Dvir Date: Sat, 5 Aug 2017 11:44:30 +0300 Subject: mac80211: shorten debug prints using ht_dbg() to avoid warning Invoking ht_dbg() with too long of a string will print a warning. Shorten the messages while retaining the printed patameters. Signed-off-by: Sharon Dvir Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index cbd48762256c..420486b5a1d9 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -436,7 +436,7 @@ static void sta_addba_resp_timer_expired(unsigned long data) test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { rcu_read_unlock(); ht_dbg(sta->sdata, - "timer expired on %pM tid %d but we are not (or no longer) expecting addBA response there\n", + "timer expired on %pM tid %d not expecting addBA response\n", sta->sta.addr, tid); return; } @@ -639,7 +639,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, time_before(jiffies, sta->ampdu_mlme.last_addba_req_time[tid] + HT_AGG_RETRIES_PERIOD)) { ht_dbg(sdata, - "BA request denied - waiting a grace period after %d failed requests on %pM tid %u\n", + "BA request denied - %d failed requests on %pM tid %u\n", sta->ampdu_mlme.addba_req_num[tid], sta->sta.addr, tid); ret = -EBUSY; goto err_unlock_sta; -- cgit From b44eebea181a36378bea8f27e7f9b4175bfad683 Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Sat, 5 Aug 2017 11:44:29 +0300 Subject: mac80211: add MESH IE in the correct order VHT MESH support was added, but the order of the IEs wasn't enforced. Fix that. Signed-off-by: Liad Kaufman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 259698de569f..6aef6793d052 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1436,7 +1436,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, - /* mesh ID can't happen here */ + WLAN_EID_MESH_ID, /* 60 GHz can't happen here right now */ }; noffset = ieee80211_ie_split(ie, ie_len, -- cgit From 89e9bfc4ee859ad2a477f10aa2d5c37377242296 Mon Sep 17 00:00:00 2001 From: Chunho Lee Date: Fri, 7 Jul 2017 17:03:14 -0700 Subject: mac80211: Fix null pointer dereference with iTXQ support This change adds null pointer check before dereferencing pointer dev on netif_tx_start_all_queues() when an interface is added. With iTXQ support, netif_tx_start_all_queues() is always called while an interface is added. however, the netdev queues are not associated and dev is null when the interface is either NL80211_IFTYPE_P2P_DEVICE or NL80211_IFTYPE_NAN. Signed-off-by: Chunho Lee Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 44399322f356..f75029abf728 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -731,7 +731,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.type == NL80211_IFTYPE_AP_VLAN || local->ops->wake_tx_queue) { /* XXX: for AP_VLAN, actually track AP queues */ - netif_tx_start_all_queues(dev); + if (dev) + netif_tx_start_all_queues(dev); } else if (dev) { unsigned long flags; int n_acs = IEEE80211_NUM_ACS; -- cgit From 9de981f507474f326e42117858dc9a9321331ae5 Mon Sep 17 00:00:00 2001 From: Beni Lev Date: Tue, 25 Jul 2017 11:25:25 +0300 Subject: mac80211_hwsim: Use proper TX power In struct ieee80211_tx_info, control.vif pointer and rate_driver_data[0] falls on the same place, depending on the union usage. During the whole TX process, the union is referred to as a control struct, which holds the vif that is later used in the tx flow, especially in order to derive the used tx power. Referring direcly to rate_driver_data[0] and assigning a value to it, overwrites the vif pointer, hence making all later references irrelevant. Moreover, rate_driver_data[0] isn't used later in the flow in order to retrieve the channel that it is pointing to. Cc: stable@vger.kernel.org Signed-off-by: Beni Lev Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index c8852acc1462..6467ffac9811 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1362,8 +1362,6 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, txi->control.rates, ARRAY_SIZE(txi->control.rates)); - txi->rate_driver_data[0] = channel; - if (skb->len >= 24 + 8 && ieee80211_is_probe_resp(hdr->frame_control)) { /* fake header transmission time */ -- cgit From 979e1f08042b83152dfe3d76df10db31eb7edf98 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Jun 2017 12:20:28 +0200 Subject: mac80211: agg-tx: call drv_wake_tx_queue in proper context Since drv_wake_tx_queue() is normally called in the TX path, which is already in an RCU critical section, we should call it the same way in the aggregation code path, so if the driver expects to be able to use RCU, it'll already be protected without having to enter a nested critical section. Additionally, disable soft-IRQs, since not doing so could cause issues in a driver that relies on them already being disabled like in the other path. Fixes: ba8c3d6f16a1 ("mac80211: add an intermediate software queue implementation") Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 420486b5a1d9..bef516ec47f9 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -226,7 +226,11 @@ ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable) clear_bit(IEEE80211_TXQ_AMPDU, &txqi->flags); clear_bit(IEEE80211_TXQ_STOP, &txqi->flags); + local_bh_disable(); + rcu_read_lock(); drv_wake_tx_queue(sta->sdata->local, txqi); + rcu_read_unlock(); + local_bh_enable(); } /* -- cgit From 6e46d8ce894374fc135c96a8d1057c6af1fef237 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 18 Aug 2017 15:33:57 +0300 Subject: mac80211: flush hw_roc_start work before cancelling the ROC When HW ROC is supported it is possible that after the HW notified that the ROC has started, the ROC was cancelled and another ROC was added while the hw_roc_start worker is waiting on the mutex (since cancelling the ROC and adding another one also holds the same mutex). As a result, the hw_roc_start worker will continue to run after the new ROC is added but before it is actually started by the HW. This may result in notifying userspace that the ROC has started before it actually does, or in case of management tx ROC, in an attempt to tx while not on the right channel. In addition, when the driver will notify mac80211 that the second ROC has started, mac80211 will warn that this ROC has already been notified. Fix this by flushing the hw_roc_start work before cancelling an ROC. Cc: stable@vger.kernel.org Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/offchannel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index f8e7a8bbc618..faf4f6055000 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -707,6 +707,8 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, if (!cookie) return -ENOENT; + flush_work(&local->hw_roc_start); + mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!mgmt_tx && roc->cookie != cookie) -- cgit From ba83bfb1e86501d21077b570e10e443f1605be64 Mon Sep 17 00:00:00 2001 From: Igor Mitsyanko Date: Wed, 30 Aug 2017 13:52:25 -0700 Subject: nl80211: look for HT/VHT capabilities in beacon's tail There are no HT/VHT capabilities in cfg80211_ap_settings::beacon_ies, these should be looked for in beacon's tail instead. Fixes: 66cd794e3c30 ("nl80211: add HT/VHT capabilities to AP parameters") Signed-off-by: Igor Mitsyanko Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8ce85420ecb0..0df8023f480b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3791,8 +3791,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) { const struct cfg80211_beacon_data *bcn = ¶ms->beacon; - size_t ies_len = bcn->beacon_ies_len; - const u8 *ies = bcn->beacon_ies; + size_t ies_len = bcn->tail_len; + const u8 *ies = bcn->tail; const u8 *rates; const u8 *cap; -- cgit From 4e0854a74f08e6a9d847f2c2cfae7b07c931d125 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 6 Sep 2017 13:45:40 +0300 Subject: cfg80211: honor NL80211_RRF_NO_HT40{MINUS,PLUS} Honor the NL80211_RRF_NO_HT40{MINUS,PLUS} flags in reg_process_ht_flags_channel. Not doing so leads can lead to a firmware assert in iwlwifi for example. Fixes: b0d7aa59592b ("cfg80211: allow wiphy specific regdomain management") Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/wireless/reg.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 5fae296a6a58..6e94f6934a0e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -4,6 +4,7 @@ * Copyright 2007 Johannes Berg * Copyright 2008-2011 Luis R. Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2017 Intel Deutschland GmbH * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -1483,7 +1484,9 @@ static void reg_process_ht_flags_channel(struct wiphy *wiphy, { struct ieee80211_supported_band *sband = wiphy->bands[channel->band]; struct ieee80211_channel *channel_before = NULL, *channel_after = NULL; + const struct ieee80211_regdomain *regd; unsigned int i; + u32 flags; if (!is_ht40_allowed(channel)) { channel->flags |= IEEE80211_CHAN_NO_HT40; @@ -1503,17 +1506,30 @@ static void reg_process_ht_flags_channel(struct wiphy *wiphy, channel_after = c; } + flags = 0; + regd = get_wiphy_regdom(wiphy); + if (regd) { + const struct ieee80211_reg_rule *reg_rule = + freq_reg_info_regd(MHZ_TO_KHZ(channel->center_freq), + regd, MHZ_TO_KHZ(20)); + + if (!IS_ERR(reg_rule)) + flags = reg_rule->flags; + } + /* * Please note that this assumes target bandwidth is 20 MHz, * if that ever changes we also need to change the below logic * to include that as well. */ - if (!is_ht40_allowed(channel_before)) + if (!is_ht40_allowed(channel_before) || + flags & NL80211_RRF_NO_HT40MINUS) channel->flags |= IEEE80211_CHAN_NO_HT40MINUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS; - if (!is_ht40_allowed(channel_after)) + if (!is_ht40_allowed(channel_after) || + flags & NL80211_RRF_NO_HT40PLUS) channel->flags |= IEEE80211_CHAN_NO_HT40PLUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS; -- cgit From 98e93e968e4947cd71c2eb69e323682daa453ee7 Mon Sep 17 00:00:00 2001 From: Ilan peer Date: Wed, 6 Sep 2017 17:32:40 +0300 Subject: mac80211: Complete ampdu work schedule during session tear down Commit 7a7c0a6438b8 ("mac80211: fix TX aggregation start/stop callback race") added a cancellation of the ampdu work after the loop that stopped the Tx and Rx BA sessions. However, in some cases, e.g., during HW reconfig, the low level driver might call mac80211 APIs to complete the stopping of the BA sessions, which would queue the ampdu work to handle the actual completion. This work needs to be performed as otherwise mac80211 data structures would not be properly synced. Fix this by checking if BA session STOP_CB bit is set after the BA session cancellation and properly clean the session. Signed-off-by: Ilan Peer [Johannes: the work isn't flushed because that could do other things we don't want, and the locking situation isn't clear] Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index c92df492e898..4cba7fca10d4 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -300,6 +300,24 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); + + /* + * In case the tear down is part of a reconfigure due to HW restart + * request, it is possible that the low level driver requested to stop + * the BA session, so handle it to properly clean tid_tx data. + */ + mutex_lock(&sta->ampdu_mlme.mtx); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + struct tid_ampdu_tx *tid_tx = + rcu_dereference_protected_tid_tx(sta, i); + + if (!tid_tx) + continue; + + if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) + ieee80211_stop_tx_ba_cb(sta, i, tid_tx); + } + mutex_unlock(&sta->ampdu_mlme.mtx); } void ieee80211_ba_session_work(struct work_struct *work) -- cgit From bde59c475e0883e4c4294bcd9b9c7e08ae18c828 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Sep 2017 15:01:42 +0200 Subject: mac80211: fix deadlock in driver-managed RX BA session start When an RX BA session is started by the driver, and it has to tell mac80211 about it, the corresponding bit in tid_rx_manage_offl gets set and the BA session work is scheduled. Upon testing this bit, it will call __ieee80211_start_rx_ba_session(), thus deadlocking as it already holds the ampdu_mlme.mtx, which that acquires again. Fix this by adding ___ieee80211_start_rx_ba_session(), a version of the function that requires the mutex already held. Cc: stable@vger.kernel.org Fixes: 699cb58c8a52 ("mac80211: manage RX BA session offload without SKB queue") Reported-by: Matteo Croce Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 32 +++++++++++++++++++++----------- net/mac80211/ht.c | 6 +++--- net/mac80211/ieee80211_i.h | 4 ++++ 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 2b36eff5d97e..2849a1fc41c5 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -245,10 +245,10 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d ieee80211_tx_skb(sdata, skb); } -void __ieee80211_start_rx_ba_session(struct sta_info *sta, - u8 dialog_token, u16 timeout, - u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq) +void ___ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq) { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; @@ -267,7 +267,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, ht_dbg(sta->sdata, "STA %pM requests BA session on unsupported tid %d\n", sta->sta.addr, tid); - goto end_no_lock; + goto end; } if (!sta->sta.ht_cap.ht_supported) { @@ -275,14 +275,14 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", sta->sta.addr, tid); /* send a response anyway, it's an error case if we get here */ - goto end_no_lock; + goto end; } if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) { ht_dbg(sta->sdata, "Suspend in progress - Denying ADDBA request (%pM tid %d)\n", sta->sta.addr, tid); - goto end_no_lock; + goto end; } /* sanity check for incoming parameters: @@ -296,7 +296,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", sta->sta.addr, tid, ba_policy, buf_size); - goto end_no_lock; + goto end; } /* determine default buffer size */ if (buf_size == 0) @@ -311,7 +311,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, buf_size, sta->sta.addr); /* examine state machine */ - mutex_lock(&sta->ampdu_mlme.mtx); + lockdep_assert_held(&sta->ampdu_mlme.mtx); if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) { @@ -415,15 +415,25 @@ end: __clear_bit(tid, sta->ampdu_mlme.unexpected_agg); sta->ampdu_mlme.tid_rx_token[tid] = dialog_token; } - mutex_unlock(&sta->ampdu_mlme.mtx); -end_no_lock: if (tx) ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, dialog_token, status, 1, buf_size, timeout); } +void __ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq) +{ + mutex_lock(&sta->ampdu_mlme.mtx); + ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout, + start_seq_num, ba_policy, tid, + buf_size, tx, auto_seq); + mutex_unlock(&sta->ampdu_mlme.mtx); +} + void ieee80211_process_addba_request(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 4cba7fca10d4..d6d0b4201e40 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -351,9 +351,9 @@ void ieee80211_ba_session_work(struct work_struct *work) if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) - __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, - IEEE80211_MAX_AMPDU_BUF, - false, true); + ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, + IEEE80211_MAX_AMPDU_BUF, + false, true); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2197c62a0a6e..9675814f64db 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1760,6 +1760,10 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq); +void ___ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, -- cgit From 9a94b3a4bdfdb404f1846099e880c52e144f06c0 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sun, 3 Sep 2017 17:32:16 +0300 Subject: dt-binding: phy: don't confuse with Ethernet phy properties The generic PHY 'phys' property sometime appears in the same node with the Ethernet PHY 'phy' or 'phy-handle' properties. Add a warning in phy-bindings.txt to reduce confusion. Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/phy/phy-bindings.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/phy/phy-bindings.txt b/Documentation/devicetree/bindings/phy/phy-bindings.txt index 1293c321754c..a403b81d0679 100644 --- a/Documentation/devicetree/bindings/phy/phy-bindings.txt +++ b/Documentation/devicetree/bindings/phy/phy-bindings.txt @@ -34,7 +34,9 @@ PHY user node ============= Required Properties: -phys : the phandle for the PHY device (used by the PHY subsystem) +phys : the phandle for the PHY device (used by the PHY subsystem; not to be + confused with the Ethernet specific 'phy' and 'phy-handle' properties, + see Documentation/devicetree/bindings/net/ethernet.txt for these) phy-names : the names of the PHY corresponding to the PHYs present in the *phys* phandle -- cgit From 39ad1297a2084e0724da73d9eda2ceb9573a5d6c Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 4 Sep 2017 14:21:12 +0800 Subject: sched: Use __qdisc_drop instead of kfree_skb in sch_prio and sch_qfq The commit 520ac30f4551 ("net_sched: drop packets after root qdisc lock is released) made a big change of tc for performance. There are two points left in sch_prio and sch_qfq which are not changed with that commit. Now enhance them now with __qdisc_drop. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index f31b28f788c0..2dd6c68ae91e 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -80,7 +80,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } #endif diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index cd661a7f81e6..6ddfd4991108 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1215,7 +1215,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); -- cgit From be82485fbcbb1cf11b13e7356231c72fdf21241c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Sep 2017 11:47:12 +0800 Subject: netlink: fix an use-after-free issue for nlk groups ChunYu found a netlink use-after-free issue by syzkaller: [28448.842981] BUG: KASAN: use-after-free in __nla_put+0x37/0x40 at addr ffff8807185e2378 [28448.969918] Call Trace: [...] [28449.117207] __nla_put+0x37/0x40 [28449.132027] nla_put+0xf5/0x130 [28449.146261] sk_diag_fill.isra.4.constprop.5+0x5a0/0x750 [netlink_diag] [28449.176608] __netlink_diag_dump+0x25a/0x700 [netlink_diag] [28449.202215] netlink_diag_dump+0x176/0x240 [netlink_diag] [28449.226834] netlink_dump+0x488/0xbb0 [28449.298014] __netlink_dump_start+0x4e8/0x760 [28449.317924] netlink_diag_handler_dump+0x261/0x340 [netlink_diag] [28449.413414] sock_diag_rcv_msg+0x207/0x390 [28449.432409] netlink_rcv_skb+0x149/0x380 [28449.467647] sock_diag_rcv+0x2d/0x40 [28449.484362] netlink_unicast+0x562/0x7b0 [28449.564790] netlink_sendmsg+0xaa8/0xe60 [28449.661510] sock_sendmsg+0xcf/0x110 [28449.865631] __sys_sendmsg+0xf3/0x240 [28450.000964] SyS_sendmsg+0x32/0x50 [28450.016969] do_syscall_64+0x25c/0x6c0 [28450.154439] entry_SYSCALL64_slow_path+0x25/0x25 It was caused by no protection between nlk groups' free in netlink_release and nlk groups' accessing in sk_diag_dump_groups. The similar issue also exists in netlink_seq_show(). This patch is to defer nlk groups' free in deferred_put_nlk_sk. Reported-by: ChunYu Wang Acked-by: Florian Westphal Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5acee49db90b..94a61e602ee7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -691,6 +691,9 @@ static void deferred_put_nlk_sk(struct rcu_head *head) struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; + kfree(nlk->groups); + nlk->groups = NULL; + if (!refcount_dec_and_test(&sk->sk_refcnt)) return; @@ -769,9 +772,6 @@ static int netlink_release(struct socket *sock) netlink_table_ungrab(); } - kfree(nlk->groups); - nlk->groups = NULL; - local_bh_disable(); sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); local_bh_enable(); -- cgit From f773608026ee1e75e2256e3ad625c222ff6f9305 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Sep 2017 11:53:29 +0800 Subject: netlink: access nlk groups safely in netlink bind and getname Now there is no lock protecting nlk ngroups/groups' accessing in netlink bind and getname. It's safe from nlk groups' setting in netlink_release, but not from netlink_realloc_groups called by netlink_setsockopt. netlink_lock_table is needed in both netlink bind and getname when accessing nlk groups. Acked-by: Florian Westphal Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 94a61e602ee7..327807731b44 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -955,7 +955,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; - int err; + int err = 0; long unsigned int groups = nladdr->nl_groups; bool bound; @@ -983,6 +983,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; } + netlink_lock_table(); if (nlk->netlink_bind && groups) { int group; @@ -993,7 +994,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (!err) continue; netlink_undo_bind(group, groups, sk); - return err; + goto unlock; } } @@ -1006,12 +1007,13 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_autobind(sock); if (err) { netlink_undo_bind(nlk->ngroups, groups, sk); - return err; + goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) - return 0; + goto unlock; + netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + @@ -1022,6 +1024,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_table_ungrab(); return 0; + +unlock: + netlink_unlock_table(); + return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, @@ -1079,7 +1085,9 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else { nladdr->nl_pid = nlk->portid; + netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; + netlink_unlock_table(); } return 0; } -- cgit From 8e0deed92406d93ae0365cb8a6134db5721e7aca Mon Sep 17 00:00:00 2001 From: Kleber Sacilotto de Souza Date: Wed, 6 Sep 2017 11:08:06 +0200 Subject: tipc: remove unnecessary call to dev_net() The net device is already stored in the 'net' variable, so no need to call dev_net() again. Signed-off-by: Kleber Sacilotto de Souza Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index ac1d66d7e1fd..47ec121574ce 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -637,7 +637,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - bearer_disable(dev_net(dev), b); + bearer_disable(net, b); break; } return NOTIFY_OK; -- cgit From a33fcba6ec01efcca33b1afad91057020f247f15 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 4 Sep 2017 12:51:33 -0500 Subject: rtlwifi: btcoexist: Fix breakage of ant_sel for rtl8723be In commit bcd37f4a0831 ("rtlwifi: btcoex: 23b 2ant: let bt transmit when hw initialisation done"), there is an additional error when the module parameter ant_sel is used to select the auxilary antenna. The error is that the antenna selection is not checked when writing the antenna selection register. Fixes: bcd37f4a0831 ("rtlwifi: btcoex: 23b 2ant: let bt transmit when hw initialisation done") Signed-off-by: Larry Finger Cc: Ping-Ke Shih Cc: Yan-Hsuan Chuang Cc: Birming Chiu Cc: Shaofu Cc: Steven Ting Cc: Stable # 4.12+ Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c index 31965f0ef69d..e8f07573aed9 100644 --- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c +++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c @@ -1183,7 +1183,10 @@ static void btc8723b2ant_set_ant_path(struct btc_coexist *btcoexist, } /* fixed internal switch S1->WiFi, S0->BT */ - btcoexist->btc_write_4byte(btcoexist, 0x948, 0x0); + if (board_info->btdm_ant_pos == BTC_ANTENNA_AT_MAIN_PORT) + btcoexist->btc_write_2byte(btcoexist, 0x948, 0x0); + else + btcoexist->btc_write_2byte(btcoexist, 0x948, 0x280); switch (antpos_type) { case BTC_ANT_WIFI_AT_MAIN: -- cgit From 6d622692836950b3c943776f84c4557ff6c02f3b Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 4 Sep 2017 12:51:34 -0500 Subject: rtlwifi: btcoexist: Fix antenna selection code In commit 87d8a9f35202 ("rtlwifi: btcoex: call bind to setup btcoex"), the code turns on a call to exhalbtc_bind_bt_coex_withadapter(). This routine contains a bug that causes incorrect antenna selection for those HP laptops with only one antenna and an incorrectly programmed EFUSE. These boxes are the ones that need the ant_sel module parameter. Fixes: 87d8a9f35202 ("rtlwifi: btcoex: call bind to setup btcoex") Signed-off-by: Larry Finger Cc: Ping-Ke Shih Cc: Yan-Hsuan Chuang Cc: Birming Chiu Cc: Shaofu Cc: Steven Ting Cc: Stable # 4.13+ Signed-off-by: Kalle Valo --- .../realtek/rtlwifi/btcoexist/halbtcoutsrc.c | 23 +++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c index c1eacd8352a2..b5e9877d935c 100644 --- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c +++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c @@ -173,6 +173,16 @@ static u8 halbtc_get_wifi_central_chnl(struct btc_coexist *btcoexist) u8 rtl_get_hwpg_single_ant_path(struct rtl_priv *rtlpriv) { + struct rtl_mod_params *mod_params = rtlpriv->cfg->mod_params; + + /* override ant_num / ant_path */ + if (mod_params->ant_sel) { + rtlpriv->btcoexist.btc_info.ant_num = + (mod_params->ant_sel == 1 ? ANT_X2 : ANT_X1); + + rtlpriv->btcoexist.btc_info.single_ant_path = + (mod_params->ant_sel == 1 ? 0 : 1); + } return rtlpriv->btcoexist.btc_info.single_ant_path; } @@ -183,6 +193,7 @@ u8 rtl_get_hwpg_bt_type(struct rtl_priv *rtlpriv) u8 rtl_get_hwpg_ant_num(struct rtl_priv *rtlpriv) { + struct rtl_mod_params *mod_params = rtlpriv->cfg->mod_params; u8 num; if (rtlpriv->btcoexist.btc_info.ant_num == ANT_X2) @@ -190,6 +201,10 @@ u8 rtl_get_hwpg_ant_num(struct rtl_priv *rtlpriv) else num = 1; + /* override ant_num / ant_path */ + if (mod_params->ant_sel) + num = (mod_params->ant_sel == 1 ? ANT_X2 : ANT_X1) + 1; + return num; } @@ -876,7 +891,7 @@ bool exhalbtc_bind_bt_coex_withadapter(void *adapter) { struct btc_coexist *btcoexist = &gl_bt_coexist; struct rtl_priv *rtlpriv = adapter; - u8 ant_num = 2, chip_type, single_ant_path = 0; + u8 ant_num = 2, chip_type; if (btcoexist->binded) return false; @@ -911,12 +926,6 @@ bool exhalbtc_bind_bt_coex_withadapter(void *adapter) ant_num = rtl_get_hwpg_ant_num(rtlpriv); exhalbtc_set_ant_num(rtlpriv, BT_COEX_ANT_TYPE_PG, ant_num); - /* set default antenna position to main port */ - btcoexist->board_info.btdm_ant_pos = BTC_ANTENNA_AT_MAIN_PORT; - - single_ant_path = rtl_get_hwpg_single_ant_path(rtlpriv); - exhalbtc_set_single_ant_path(single_ant_path); - if (rtl_get_hwpg_package_type(rtlpriv) == 0) btcoexist->board_info.tfbga_package = false; else if (rtl_get_hwpg_package_type(rtlpriv) == 1) -- cgit From 2eabc84d2f8e4f36d3719eeb6a330e10c46c8da7 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Thu, 7 Sep 2017 10:51:52 +0300 Subject: iwlwifi: mvm: only send LEDS_CMD when the FW supports it The LEDS_CMD command is only supported in some newer FW versions (e.g. iwlwifi-8000C-31.ucode), so we can't send it to older versions (such as iwlwifi-8000C-27.ucode). To fix this, check for a new bit in the FW capabilities TLV that tells when the command is supported. Note that the current version of -31.ucode in linux-firmware.git (31.532993.0) does not have this capability bit set, so the LED won't work, even though this version should support it. But we will update this firmware soon, so it won't be a problem anymore. Fixes: 7089ae634c50 ("iwlwifi: mvm: use firmware LED command where applicable") Reported-by: Linus Torvalds Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/fw/file.h | 1 + drivers/net/wireless/intel/iwlwifi/mvm/led.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/file.h b/drivers/net/wireless/intel/iwlwifi/fw/file.h index 887f6d8fc8a7..279248cd9cfb 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/file.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/file.h @@ -378,6 +378,7 @@ enum iwl_ucode_tlv_capa { IWL_UCODE_TLV_CAPA_EXTEND_SHARED_MEM_CFG = (__force iwl_ucode_tlv_capa_t)80, IWL_UCODE_TLV_CAPA_LQM_SUPPORT = (__force iwl_ucode_tlv_capa_t)81, IWL_UCODE_TLV_CAPA_TX_POWER_ACK = (__force iwl_ucode_tlv_capa_t)84, + IWL_UCODE_TLV_CAPA_LED_CMD_SUPPORT = (__force iwl_ucode_tlv_capa_t)86, IWL_UCODE_TLV_CAPA_MLME_OFFLOAD = (__force iwl_ucode_tlv_capa_t)96, NUM_IWL_UCODE_TLV_CAPA diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/led.c b/drivers/net/wireless/intel/iwlwifi/mvm/led.c index 005e2e7278a5..b27269504a62 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/led.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/led.c @@ -92,7 +92,8 @@ static void iwl_mvm_send_led_fw_cmd(struct iwl_mvm *mvm, bool on) static void iwl_mvm_led_set(struct iwl_mvm *mvm, bool on) { - if (mvm->cfg->device_family >= IWL_DEVICE_FAMILY_8000) { + if (fw_has_capa(&mvm->fw->ucode_capa, + IWL_UCODE_TLV_CAPA_LED_CMD_SUPPORT)) { iwl_mvm_send_led_fw_cmd(mvm, on); return; } -- cgit From 80532384af4ccb6d6cc965fa9232aaa2c456362c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 6 Sep 2017 13:14:19 +0200 Subject: net: sched: fix memleak for chain zero There's a memleak happening for chain 0. The thing is, chain 0 needs to be always present, not created on demand. Therefore tcf_block_get upon creation of block calls the tcf_chain_create function directly. The chain is created with refcnt == 1, which is not correct in this case and causes the memleak. So move the refcnt increment into tcf_chain_get function even for the case when chain needs to be created. Reported-by: Jakub Kicinski Fixes: 5bc1701881e3 ("net: sched: introduce multichain support for filters") Signed-off-by: Jiri Pirko Tested-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/cls_api.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ea6c65fd5fc5..c743f03cfebd 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -182,7 +182,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; - chain->refcnt = 1; + chain->refcnt = 0; return chain; } @@ -217,15 +217,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) { - chain->refcnt++; - return chain; - } + if (chain->index == chain_index) + goto incref; } - if (create) - return tcf_chain_create(block, chain_index); - else - return NULL; + chain = create ? tcf_chain_create(block, chain_index) : NULL; + +incref: + if (chain) + chain->refcnt++; + return chain; } EXPORT_SYMBOL(tcf_chain_get); -- cgit From 5c25f30c93fdc5bf25e62101aeaae7a4f9b421b3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 5 Sep 2017 17:26:33 +0800 Subject: ip6_gre: update mtu properly in ip6gre_err Now when probessing ICMPV6_PKT_TOOBIG, ip6gre_err only subtracts the offset of gre header from mtu info. The expected mtu of gre device should also subtract gre header. Otherwise, the next packets still can't be sent out. Jianlin found this issue when using the topo: client(ip6gre)<---->(nic1)route(nic2)<----->(ip6gre)server and reducing nic2's mtu, then both tcp and sctp's performance with big size data became 0. This patch is to fix it by also subtracting grehdr (tun->tun_hlen) from mtu info when updating gre device's mtu in ip6gre_err(). It also needs to subtract ETH_HLEN if gre dev'type is ARPHRD_ETHER. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 67ff2aaf5dcb..b7a72d409334 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset; + mtu = be32_to_cpu(info) - offset - t->tun_hlen; + if (t->dev->type == ARPHRD_ETHER) + mtu -= ETH_HLEN; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; -- cgit From ca2c1418efe9f7fe37aa1f355efdf4eb293673ce Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 6 Sep 2017 14:44:36 +0200 Subject: udp: drop head states only when all skb references are gone After commit 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") we clear the skb head state as soon as the skb carrying them is first processed. Since the same skb can be processed several times when MSG_PEEK is used, we can end up lacking the required head states, and eventually oopsing. Fix this clearing the skb head state only when processing the last skb reference. Reported-by: Eric Dumazet Fixes: 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 9 +++------ net/ipv4/udp.c | 5 ++++- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f751f3b93039..72299ef00061 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -958,7 +958,7 @@ void kfree_skb(struct sk_buff *skb); void kfree_skb_list(struct sk_buff *segs); void skb_tx_error(struct sk_buff *skb); void consume_skb(struct sk_buff *skb); -void consume_stateless_skb(struct sk_buff *skb); +void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); extern struct kmem_cache *skbuff_head_cache; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 68065d7d383f..16982de649b9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -710,14 +710,11 @@ EXPORT_SYMBOL(consume_skb); * consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * - * Works like consume_skb(), but this variant assumes that all the head - * states have been already dropped. + * Alike consume_skb(), but this variant assumes that this is the last + * skb reference and all the head states have been already dropped */ -void consume_stateless_skb(struct sk_buff *skb) +void __consume_stateless_skb(struct sk_buff *skb) { - if (!skb_unref(skb)) - return; - trace_consume_skb(skb); skb_release_data(skb); kfree_skbmem(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db1c9e78c83c..ef29df8648e4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1397,12 +1397,15 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } + if (!skb_unref(skb)) + return; + /* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb(). */ if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); - consume_stateless_skb(skb); + __consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); -- cgit From eef5a7cc2a571eb2176c8b9260d4ccfc9f6be127 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Sep 2017 15:38:58 +0200 Subject: isdn: isdnloop: fix logic error in isdnloop_sendbuf gcc-7 found an ancient bug in the loop driver, leading to a condition that is always false, meaning we ignore the contents of 'card->flags' here: drivers/isdn/isdnloop/isdnloop.c:412:37: error: ?: using integer constants in boolean context, the expression will always evaluate to 'true' [-Werror=int-in-bool-context] This changes the braces in the expression to ensure we actually compare the flag bits, rather than comparing a constant. As Joe Perches pointed out, an earlier patch of mine incorrectly assumed this was a false-positive warning. Cc: Joe Perches Link: https://patchwork.kernel.org/patch/9840289/ Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/isdn/isdnloop/isdnloop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/isdn/isdnloop/isdnloop.c b/drivers/isdn/isdnloop/isdnloop.c index 6ffd13466b8c..e97232646ba1 100644 --- a/drivers/isdn/isdnloop/isdnloop.c +++ b/drivers/isdn/isdnloop/isdnloop.c @@ -409,7 +409,7 @@ isdnloop_sendbuf(int channel, struct sk_buff *skb, isdnloop_card *card) return -EINVAL; } if (len) { - if (!(card->flags & (channel) ? ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE)) + if (!(card->flags & (channel ? ISDNLOOP_FLAGS_B2ACTIVE : ISDNLOOP_FLAGS_B1ACTIVE))) return 0; if (card->sndcount[channel] > ISDNLOOP_MAX_SQUEUE) return 0; -- cgit From 126f760ca94dae77425695f9f9238b731de86e32 Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Wed, 6 Sep 2017 18:35:51 +0200 Subject: rds: Fix incorrect statistics counting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In rds_send_xmit() there is logic to batch the sends. However, if another thread has acquired the lock and has incremented the send_gen, it is considered a race and we yield. The code incrementing the s_send_lock_queue_raced statistics counter did not count this event correctly. This commit counts the race condition correctly. Changes from v1: - Removed check for *someone_on_xmit()* - Fixed incorrect indentation Signed-off-by: Håkon Bugge Reviewed-by: Knut Omang Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/rds/send.c b/net/rds/send.c index 058a40743041..b52cdc8ae428 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -428,14 +428,18 @@ over_batch: * some work and we will skip our goto */ if (ret == 0) { + bool raced; + smp_mb(); + raced = send_gen != READ_ONCE(cp->cp_send_gen); + if ((test_bit(0, &conn->c_map_queued) || - !list_empty(&cp->cp_send_queue)) && - send_gen == READ_ONCE(cp->cp_send_gen)) { - rds_stats_inc(s_send_lock_queue_raced); + !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + } else if (raced) { + rds_stats_inc(s_send_lock_queue_raced); } } out: -- cgit From f957dd3c8db2781c8a334b166800788feb618625 Mon Sep 17 00:00:00 2001 From: Ian W MORRISON Date: Thu, 31 Aug 2017 08:51:03 +1000 Subject: brcmfmac: feature check for multi-scheduled scan fails on bcm4345 devices The firmware feature check introduced for multi-scheduled scan is also failing for bcm4345 devices resulting in a firmware crash. The reason for this crash has not yet been root cause so this patch avoids the feature check for those device as a short-term fix. Fixes: 9fe929aaace6 ("brcmfmac: add firmware feature detection for gscan feature") Cc: # v4.13 Signed-off-by: Ian W MORRISON Acked-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c index f1b60740e020..53ae30259989 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c @@ -159,7 +159,8 @@ void brcmf_feat_attach(struct brcmf_pub *drvr) brcmf_feat_firmware_capabilities(ifp); memset(&gscan_cfg, 0, sizeof(gscan_cfg)); - if (drvr->bus_if->chip != BRCM_CC_43430_CHIP_ID) + if (drvr->bus_if->chip != BRCM_CC_43430_CHIP_ID && + drvr->bus_if->chip != BRCM_CC_4345_CHIP_ID) brcmf_feat_iovar_data_set(ifp, BRCMF_FEAT_GSCAN, "pfn_gscan_cfg", &gscan_cfg, sizeof(gscan_cfg)); -- cgit From 1cc4a018669f2fb18c10010f1a7ab3f6fb688cef Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 20 Aug 2017 13:38:07 +0800 Subject: netfilter: ipvs: fix the issue that sctp_conn_schedule drops non-INIT packet Commit 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP packets") changed to check packet type early. It introduced a side effect: if it's not a INIT packet, ports will be set as NULL, and the packet will be dropped later. It caused that sctp couldn't create connection when ipvs module is loaded and any scheduler is registered on server. Li Shuang reproduced it by running the cmds on sctp server: # ipvsadm -A -t 1.1.1.1:80 -s rr # ipvsadm -D -t 1.1.1.1:80 then the server could't work any more. This patch is to return 1 when it's not an INIT packet. It means ipvs will accept it without creating a conn for it, just like what it does for tcp. Fixes: 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP packets") Reported-by: Li Shuang Signed-off-by: Xin Long Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index e1efa446b305..81f08198b125 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -24,9 +24,12 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, if (sh) { sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); - if (sch && (sch->type == SCTP_CID_INIT || - sysctl_sloppy_sctp(ipvs))) + if (sch) { + if (!(sysctl_sloppy_sctp(ipvs) || + sch->type == SCTP_CID_INIT)) + return 1; ports = &sh->source; + } } } else { ports = skb_header_pointer( -- cgit From 68913a018f6082f8f90abb8ff9114435ef45dff7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 20 Aug 2017 13:38:08 +0800 Subject: netfilter: ipvs: do not create conn for ABORT packet in sctp_conn_schedule There's no reason for ipvs to create a conn for an ABORT packet even if sysctl_sloppy_sctp is set. This patch is to accept it without creating a conn, just as ipvs does for tcp's RST packet. Signed-off-by: Xin Long Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 81f08198b125..57c8ee66491e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -25,7 +25,8 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); if (sch) { - if (!(sysctl_sloppy_sctp(ipvs) || + if (sch->type == SCTP_CID_ABORT || + !(sysctl_sloppy_sctp(ipvs) || sch->type == SCTP_CID_INIT)) return 1; ports = &sh->source; -- cgit From ba1cc08d9488c94cb8d94f545305688b72a2a300 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 8 Sep 2017 10:26:19 +0200 Subject: ipv6: fix memory leak with multiple tables during netns destruction fib6_net_exit only frees the main and local tables. If another table was created with fib6_alloc_table, we leak it when the netns is destroyed. Fix this in the same way ip_fib_net_exit cleans up tables, by walking through the whole hashtable of fib6_table's. We can get rid of the special cases for local and main, since they're also part of the hashtable. Reproducer: ip netns add x ip -net x -6 rule add from 6003:1::/64 table 100 ip netns del x Reported-by: Jianlin Shi Fixes: 58f09b78b730 ("[NETNS][IPV6] ip6_fib - make it per network namespace") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a3b5c163325f..8280172c806c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -191,6 +191,12 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) } EXPORT_SYMBOL_GPL(rt6_free_pcpu); +static void fib6_free_table(struct fib6_table *table) +{ + inetpeer_invalidate_tree(&table->tb6_peers); + kfree(table); +} + static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; @@ -2022,15 +2028,22 @@ out_timer: static void fib6_net_exit(struct net *net) { + unsigned int i; + rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); - kfree(net->ipv6.fib6_local_tbl); -#endif - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); - kfree(net->ipv6.fib6_main_tbl); + for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; + struct hlist_node *tmp; + struct fib6_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { + hlist_del(&tb->tb6_hlist); + fib6_free_table(tb); + } + } + kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); -- cgit From 75c2631468e8af554057246b2413e738dd96af3d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 31 Aug 2017 13:45:24 +0200 Subject: netfilter: nf_nat: don't bug when mapping already exists It seems preferrable to limp along if we have a conflicting mapping, its certainly better than a BUG(). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 40573aa6c133..dc3519cc7209 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -416,7 +416,9 @@ nf_nat_setup_info(struct nf_conn *ct, WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); - BUG_ON(nf_nat_initialized(ct, maniptype)); + + if (WARN_ON(nf_nat_initialized(ct, maniptype))) + return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior -- cgit From a5d7a714569199f909cd60ff7074107bf15c7db4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 1 Sep 2017 22:41:03 +0200 Subject: netfilter: xtables: add scheduling opportunity in get_counters There are reports about spurious softlockups during iptables-restore, a backtrace i saw points at get_counters -- it uses a sequence lock and also has unbounded restart loop. Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 1 + net/ipv4/netfilter/ip_tables.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + 3 files changed, 3 insertions(+) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e04457198f93..9e2770fd00be 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -629,6 +629,7 @@ static void get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; + cond_resched(); } } } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 576cba2b57e9..39286e543ee6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -776,6 +776,7 @@ get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; /* macro does multi eval of i */ + cond_resched(); } } } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 54b1e75eded1..01bd3ee5ebc6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -795,6 +795,7 @@ get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; + cond_resched(); } } } -- cgit From e1bf1687740ce1a3598a1c5e452b852ff2190682 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:39:51 +0200 Subject: netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1. It was not a good idea. The custom hash table was a much better fit for this purpose. A fast lookup is not essential, in fact for most cases there is no lookup at all because original tuple is not taken and can be used as-is. What needs to be fast is insertion and deletion. rhlist removal however requires a rhlist walk. We can have thousands of entries in such a list if source port/addresses are reused for multiple flows, if this happens removal requests are so expensive that deletions of a few thousand flows can take several seconds(!). The advantages that we got from rhashtable are: 1) table auto-sizing 2) multiple locks 1) would be nice to have, but it is not essential as we have at most one lookup per new flow, so even a million flows in the bysource table are not a problem compared to current deletion cost. 2) is easy to add to custom hash table. I tried to add hlist_node to rhlist to speed up rhltable_remove but this isn't doable without changing semantics. rhltable_remove_fast will check that the to-be-deleted object is part of the table and that requires a list walk that we want to avoid. Furthermore, using hlist_node increases size of struct rhlist_head, which in turn increases nf_conn size. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821 Reported-by: Ivan Babrou Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 +- include/net/netfilter/nf_nat.h | 1 - net/netfilter/nf_nat_core.c | 130 ++++++++++++++--------------------- 3 files changed, 54 insertions(+), 80 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index fdc9c64a1c94..8f3bd30511de 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -77,7 +76,7 @@ struct nf_conn { possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) - struct rhlist_head nat_bysource; + struct hlist_node nat_bysource; #endif /* all members below initialized via memset */ u8 __nfct_init_offset[0]; diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 05c82a1a4267..b71701302e61 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -1,6 +1,5 @@ #ifndef _NF_NAT_H #define _NF_NAT_H -#include #include #include #include diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index dc3519cc7209..f090419f5f97 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,19 +30,17 @@ #include #include +static DEFINE_SPINLOCK(nf_nat_lock); + static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; -struct nf_nat_conn_key { - const struct net *net; - const struct nf_conntrack_tuple *tuple; - const struct nf_conntrack_zone *zone; -}; - -static struct rhltable nf_nat_bysource_table; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ -static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) +/* We keep an extra hash for each conntrack, for fast searching. */ +static unsigned int +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_tuple *t; - const struct nf_conn *ct = data; + unsigned int hash; + + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); - t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */ + hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - seed ^= net_hash_mix(nf_ct_net(ct)); - return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), - t->dst.protonum ^ seed); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ @@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); } -static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct nf_nat_conn_key *key = arg->key; - const struct nf_conn *ct = obj; - - if (!same_src(ct, key->tuple) || - !net_eq(nf_ct_net(ct), key->net) || - !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL)) - return 1; - - return 0; -} - -static struct rhashtable_params nf_nat_bysource_params = { - .head_offset = offsetof(struct nf_conn, nat_bysource), - .obj_hashfn = nf_nat_bysource_hash, - .obj_cmpfn = nf_nat_bysource_cmp, - .nelem_hint = 256, - .min_size = 1024, -}; - /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, @@ -216,26 +194,22 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { + unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; - struct nf_nat_conn_key key = { - .net = net, - .tuple = tuple, - .zone = zone - }; - struct rhlist_head *hl, *h; - - hl = rhltable_lookup(&nf_nat_bysource_table, &key, - nf_nat_bysource_params); - rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(l3proto, l4proto, result, range)) - return 1; + hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { + if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { + /* Copy source part from reply tuple. */ + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(l3proto, l4proto, result, range)) + return 1; + } } - return 0; } @@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ @@ -449,19 +424,14 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) { - struct nf_nat_conn_key key = { - .net = nf_ct_net(ct), - .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - .zone = nf_ct_zone(ct), - }; - int err; - - err = rhltable_insert_key(&nf_nat_bysource_table, - &key, - &ct->nat_bysource, - nf_nat_bysource_params); - if (err) - return NF_DROP; + unsigned int srchash; + + srchash = hash_by_src(net, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + hlist_add_head_rcu(&ct->nat_bysource, + &nf_nat_bysource[srchash]); + spin_unlock_bh(&nf_nat_lock); } /* It's done. */ @@ -570,8 +540,9 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -699,9 +670,11 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); /* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - if (ct->status & IPS_SRC_NAT_DONE) - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + if (ct->status & IPS_SRC_NAT_DONE) { + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); + } } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -825,13 +798,16 @@ static int __init nf_nat_init(void) { int ret; - ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); - if (ret) - return ret; + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - rhltable_destroy(&nf_nat_bysource_table); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -865,8 +841,8 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - - rhltable_destroy(&nf_nat_bysource_table); + synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); -- cgit From 8073e960a03bf7b5d5ebfc5ff18ac475e1688f46 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:39:52 +0200 Subject: netfilter: nat: use keyed locks no need to serialize on a single lock, we can partition the table and add/delete in parallel to different slots. This restores one of the advantages that got lost with the rhlist revert. Cc: Ivan Babrou Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f090419f5f97..f393a7086025 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,7 +30,7 @@ #include #include -static DEFINE_SPINLOCK(nf_nat_lock); +static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] @@ -425,13 +425,15 @@ nf_nat_setup_info(struct nf_conn *ct, if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; + spinlock_t *lock; srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_lock); + lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); - spin_unlock_bh(&nf_nat_lock); + spin_unlock_bh(lock); } /* It's done. */ @@ -525,6 +527,16 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 1 : 0; } +static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) +{ + unsigned int h; + + h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); +} + static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) @@ -540,9 +552,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); + __nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -670,11 +680,8 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); /* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - if (ct->status & IPS_SRC_NAT_DONE) { - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); - } + if (ct->status & IPS_SRC_NAT_DONE) + __nf_nat_cleanup_conntrack(ct); } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -796,10 +803,12 @@ static struct nf_ct_helper_expectfn follow_master_nat = { static int __init nf_nat_init(void) { - int ret; + int ret, i; /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; + if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) + nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) @@ -812,6 +821,9 @@ static int __init nf_nat_init(void) return ret; } + for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + spin_lock_init(&nf_nat_locks[i]); + nf_ct_helper_expectfn_register(&follow_master_nat); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); -- cgit From 74585d4f84379528347630253c42518c5002d2f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:47:57 +0200 Subject: netfilter: core: remove erroneous warn_on kernel test robot reported: WARNING: CPU: 0 PID: 1244 at net/netfilter/core.c:218 __nf_hook_entries_try_shrink+0x49/0xcd [..] After allowing batching in nf_unregister_net_hooks its possible that an earlier call to __nf_hook_entries_try_shrink already compacted the list. If this happens we don't need to do anything. Fixes: d3ad2c17b4047 ("netfilter: core: batch nf_unregister_net_hooks synchronize_net calls") Reported-by: kernel test robot Signed-off-by: Florian Westphal Acked-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 04fe25abc5f6..52cd2901a097 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -215,7 +215,7 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp) if (skip == hook_entries) goto out_assign; - if (WARN_ON(skip == 0)) + if (skip == 0) return NULL; hook_entries -= skip; -- cgit From 05d0eae7c1cf6bf7b19c02b3a97ff457b3317323 Mon Sep 17 00:00:00 2001 From: Zhizhou Tian Date: Fri, 8 Sep 2017 11:00:16 +0800 Subject: netfilter: xt_hashlimit: alloc hashtable with right size struct xt_byteslimit_htable used hlist_head, but memory allocation is done through sizeof(struct list_head). Signed-off-by: Zhizhou Tian Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 10d48234f5f4..962ea4a63d9f 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -279,7 +279,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, size = cfg->size; } else { size = (totalram_pages << PAGE_SHIFT) / 16384 / - sizeof(struct list_head); + sizeof(struct hlist_head); if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) @@ -287,7 +287,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, } /* FIXME: don't use vmalloc() here or anywhere else -HW */ hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) + - sizeof(struct list_head) * size); + sizeof(struct hlist_head) * size); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; -- cgit From 90c4ae4e2c1da9f1eaf846136861af43d4c1ff34 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Fri, 8 Sep 2017 01:38:58 -0400 Subject: netfilter: xt_hashlimit: fix build error caused by 64bit division 64bit division causes build/link errors on 32bit architectures. It prints out error messages like: ERROR: "__aeabi_uldivmod" [net/netfilter/xt_hashlimit.ko] undefined! The value of avg passed through by userspace in BYTE mode cannot exceed U32_MAX. Which means 64bit division in user2rate_bytes is unnecessary. To fix this I have changed the type of param 'user' to u32. Since anything greater than U32_MAX is an invalid input we error out in hashlimit_mt_check_common() when this is the case. Changes in v2: Making return type as u32 would cause an overflow for small values of 'user' (for example 2, 3 etc). To avoid this I bumped up 'r' to u64 again as well as the return type. This is OK since the variable that stores the result is u64. We still avoid 64bit division here since 'user' is u32. Fixes: bea74641e378 ("netfilter: xt_hashlimit: add rate match mode") Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 962ea4a63d9f..5da8746f7b88 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -35,6 +35,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); @@ -527,12 +528,12 @@ static u64 user2rate(u64 user) } } -static u64 user2rate_bytes(u64 user) +static u64 user2rate_bytes(u32 user) { u64 r; - r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL; - r = (r - 1) << 4; + r = user ? U32_MAX / user : U32_MAX; + r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT; return r; } @@ -588,7 +589,8 @@ static void rateinfo_init(struct dsthash_ent *dh, dh->rateinfo.prev_window = 0; dh->rateinfo.current_rate = 0; if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { - dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg); + dh->rateinfo.rate = + user2rate_bytes((u32)hinfo->cfg.avg); if (hinfo->cfg.burst) dh->rateinfo.burst = hinfo->cfg.burst * dh->rateinfo.rate; @@ -870,7 +872,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, /* Check for overflow. */ if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) { - if (cfg->avg == 0) { + if (cfg->avg == 0 || cfg->avg > U32_MAX) { pr_info("hashlimit invalid rate\n"); return -ERANGE; } -- cgit From 7906b00f5cd1cd484fced7fcda892176e3202c8a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 8 Sep 2017 11:35:21 -0300 Subject: sctp: fix missing wake ups in some situations Commit fb586f25300f ("sctp: delay calls to sk_data_ready() as much as possible") minimized the number of wake ups that are triggered in case the association receives a packet with multiple data chunks on it and/or when io_events are enabled and then commit 0970f5b36659 ("sctp: signal sk_data_ready earlier on data chunks reception") moved the wake up to as soon as possible. It thus relies on the state machine running later to clean the flag that the event was already generated. The issue is that there are 2 call paths that calls sctp_ulpq_tail_event() outside of the state machine, causing the flag to linger and possibly omitting a needed wake up in the sequence. One of the call paths is when enabling SCTP_SENDER_DRY_EVENTS via setsockopt(SCTP_EVENTS), as noticed by Harald Welte. The other is when partial reliability triggers removal of chunks from the send queue when the application calls sendmsg(). This commit fixes it by not setting the flag in case the socket is not owned by the user, as it won't be cleaned later. This works for user-initiated calls and also for rx path processing. Fixes: fb586f25300f ("sctp: delay calls to sk_data_ready() as much as possible") Reported-by: Harald Welte Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ulpqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 0225d62a869f..a71be33f3afe 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sctp_ulpq_clear_pd(ulpq); if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { - sp->data_ready_signalled = 1; + if (!sock_owned_by_user(sk)) + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; -- cgit From 1f3b359f1004bd34b7b0bad70b93e3c7af92a37b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2017 12:44:47 -0700 Subject: tcp: fix a request socket leak While the cited commit fixed a possible deadlock, it added a leak of the request socket, since reqsk_put() must be called if the BPF filter decided the ACK packet must be dropped. Fixes: d624d276d1dd ("tcp: fix possible deadlock in TCP stack vs BPF filter") Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv6/tcp_ipv6.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a63486afa7a7..d9416b5162bc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1669,9 +1669,9 @@ process: */ sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 38f76d8b231e..64d94afa427f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1460,9 +1460,9 @@ process: } sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; -- cgit From 32a805baf0fb70b6dbedefcd7249ac7f580f9e3b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2017 15:48:47 -0700 Subject: ipv6: fix typo in fib6_net_exit() IPv6 FIB should use FIB6_TABLE_HASHSZ, not FIB_TABLE_HASHSZ. Fixes: ba1cc08d9488 ("ipv6: fix memory leak with multiple tables during netns destruction") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8280172c806c..e5308d7cbd75 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2033,7 +2033,7 @@ static void fib6_net_exit(struct net *net) rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); - for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; -- cgit From 0f693f1995cf002432b70f43ce73f79bf8d0b6c9 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Thu, 7 Sep 2017 14:08:34 +0800 Subject: ip_tunnel: fix setting ttl and tos value in collect_md mode ttl and tos variables are declared and assigned, but are not used in iptunnel_xmit() function. Fixes: cfc7381b3002 ("ip_tunnel: add collect_md mode to IPIP tunnel") Cc: Alexei Starovoitov Signed-off-by: Haishuang Yan Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 129d1a3616f8..e1856bfa753d 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) ip_rt_put(rt); goto tx_dropped; } - iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, - key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, + df, !net_eq(tunnel->net, dev_net(dev))); return; tx_error: dev->stats.tx_errors++; -- cgit From 18e1173d5f3615c8c2680853ba87830ad8a0febc Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Thu, 7 Sep 2017 14:08:35 +0800 Subject: ip6_tunnel: fix setting hop_limit value for ipv6 tunnel Similar to vxlan/geneve tunnel, if hop_limit is zero, it should fall back to ip6_dst_hoplimt(). Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3a0ba2ae4b0f..10a693a19323 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1184,6 +1184,7 @@ route_lookup: init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } + hop_limit = hop_limit ? : ip6_dst_hoplimit(dst); /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. -- cgit From c43593d81ab59b8ad24ee5545e098e30f742d461 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 7 Sep 2017 11:09:59 +0300 Subject: dt-bindings: net: don't confuse with generic PHY property This complements commit 9a94b3a4bd (dt-binding: phy: don't confuse with Ethernet phy properties). The generic PHY 'phys' property sometime appears in the same node with the Ethernet PHY 'phy' or 'phy-handle' properties. Add a warning in ethernet.txt to reduce confusion. Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ethernet.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt index 7da86f22a13b..2974e63ba311 100644 --- a/Documentation/devicetree/bindings/net/ethernet.txt +++ b/Documentation/devicetree/bindings/net/ethernet.txt @@ -1,5 +1,9 @@ The following properties are common to the Ethernet controllers: +NOTE: All 'phy*' properties documented below are Ethernet specific. For the +generic PHY 'phys' property, see +Documentation/devicetree/bindings/phy/phy-bindings.txt. + - local-mac-address: array of 6 bytes, specifies the MAC address that was assigned to the network device; - mac-address: array of 6 bytes, specifies the MAC address that was last used by -- cgit From 165da3585743076c2a9de6f12b0b31df846d7c01 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 7 Sep 2017 12:25:48 +0300 Subject: dt-bindings: add SFF vendor prefix Acked-by: Rob Herring Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/vendor-prefixes.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index 4e72012928b4..12b5808b82f2 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -289,6 +289,7 @@ schindler Schindler seagate Seagate Technology PLC semtech Semtech Corporation sensirion Sensirion AG +sff Small Form Factor Committee sgx SGX Sensortech sharp Sharp Corporation si-en Si-En Technology Ltd. -- cgit From 3ef3714000ad86e8b8fbdaa3abc4d468a8e98a79 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 7 Sep 2017 12:25:49 +0300 Subject: dt-binding: net: sfp binding documentation Add device-tree binding documentation SFP transceivers. Support for SFP transceivers has been recently introduced (drivers/net/phy/sfp.c). Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/sff,sfp.txt | 76 +++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/sff,sfp.txt diff --git a/Documentation/devicetree/bindings/net/sff,sfp.txt b/Documentation/devicetree/bindings/net/sff,sfp.txt new file mode 100644 index 000000000000..60e970ce10ee --- /dev/null +++ b/Documentation/devicetree/bindings/net/sff,sfp.txt @@ -0,0 +1,76 @@ +Small Form Factor (SFF) Committee Small Form-factor Pluggable (SFP) +Transceiver + +Required properties: + +- compatible : must be "sff,sfp" + +Optional Properties: + +- i2c-bus : phandle of an I2C bus controller for the SFP two wire serial + interface + +- mod-def0-gpios : GPIO phandle and a specifier of the MOD-DEF0 (AKA Mod_ABS) + module presence input gpio signal, active (module absent) high + +- los-gpios : GPIO phandle and a specifier of the Receiver Loss of Signal + Indication input gpio signal, active (signal lost) high + +- tx-fault-gpios : GPIO phandle and a specifier of the Module Transmitter + Fault input gpio signal, active (fault condition) high + +- tx-disable-gpios : GPIO phandle and a specifier of the Transmitter Disable + output gpio signal, active (Tx disable) high + +- rate-select0-gpios : GPIO phandle and a specifier of the Rx Signaling Rate + Select (AKA RS0) output gpio signal, low: low Rx rate, high: high Rx rate + +- rate-select1-gpios : GPIO phandle and a specifier of the Tx Signaling Rate + Select (AKA RS1) output gpio signal (SFP+ only), low: low Tx rate, high: + high Tx rate + +Example #1: Direct serdes to SFP connection + +sfp_eth3: sfp-eth3 { + compatible = "sff,sfp"; + i2c-bus = <&sfp_1g_i2c>; + los-gpios = <&cpm_gpio2 22 GPIO_ACTIVE_HIGH>; + mod-def0-gpios = <&cpm_gpio2 21 GPIO_ACTIVE_LOW>; + pinctrl-names = "default"; + pinctrl-0 = <&cpm_sfp_1g_pins &cps_sfp_1g_pins>; + tx-disable-gpios = <&cps_gpio1 24 GPIO_ACTIVE_HIGH>; + tx-fault-gpios = <&cpm_gpio2 19 GPIO_ACTIVE_HIGH>; +}; + +&cps_emac3 { + phy-names = "comphy"; + phys = <&cps_comphy5 0>; + sfp = <&sfp_eth3>; +}; + +Example #2: Serdes to PHY to SFP connection + +sfp_eth0: sfp-eth0 { + compatible = "sff,sfp"; + i2c-bus = <&sfpp0_i2c>; + los-gpios = <&cps_gpio1 28 GPIO_ACTIVE_HIGH>; + mod-def0-gpios = <&cps_gpio1 27 GPIO_ACTIVE_LOW>; + pinctrl-names = "default"; + pinctrl-0 = <&cps_sfpp0_pins>; + tx-disable-gpios = <&cps_gpio1 29 GPIO_ACTIVE_HIGH>; + tx-fault-gpios = <&cps_gpio1 26 GPIO_ACTIVE_HIGH>; +}; + +p0_phy: ethernet-phy@0 { + compatible = "ethernet-phy-ieee802.3-c45"; + pinctrl-names = "default"; + pinctrl-0 = <&cpm_phy0_pins &cps_phy0_pins>; + reg = <0>; + interrupt = <&cpm_gpio2 18 IRQ_TYPE_EDGE_FALLING>; + sfp = <&sfp_eth0>; +}; + +&cpm_eth0 { + phy = <&p0_phy>; + phy-mode = "10gbase-kr"; +}; -- cgit From 25ee079371640573bbb22ab04ca5fe7a6e9842cf Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 7 Sep 2017 12:25:50 +0300 Subject: net: phy: sfp: rename dt properties to match the binding Make the Rx rate select control gpio property name match the documented binding. This would make the addition of 'rate-select1-gpios' for SFP+ support more natural. Also, make the MOD-DEF0 gpio property name match the documentation. Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- drivers/net/phy/sfp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index fb2cf4342f48..baee371bf767 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -58,11 +58,11 @@ enum { }; static const char *gpio_of_names[] = { - "moddef0", + "mod-def0", "los", "tx-fault", "tx-disable", - "rate-select", + "rate-select0", }; static const enum gpiod_flags gpio_flags[] = { -- cgit From 0fdbedc7ddfc22e5a6e9596dd01b0dd922bf142c Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 7 Sep 2017 13:24:20 +0200 Subject: davicom: Display proper debug level up to 6 This will make it explicit some messages are of the form: dm9000_dbg(db, 5, ... Signed-off-by: Mathieu Malaterre Signed-off-by: David S. Miller --- drivers/net/ethernet/davicom/dm9000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index 16fe776ddbe5..50222b7b81f3 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -65,7 +65,7 @@ MODULE_PARM_DESC(watchdog, "transmit timeout in milliseconds"); */ static int debug; module_param(debug, int, 0644); -MODULE_PARM_DESC(debug, "dm9000 debug level (0-4)"); +MODULE_PARM_DESC(debug, "dm9000 debug level (0-6)"); /* DM9000 register address locking. * -- cgit From e333ac1f1db09e37425c90bb6cf3dec721a6d71d Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Thu, 7 Sep 2017 18:32:30 +0300 Subject: net: ethernet: ti: netcp_core: no need in netif_napi_del Don't remove rx_napi specifically just before free_netdev(), it's supposed to be done in it and is confusing w/o tx_napi deletion. Signed-off-by: Ivan Khoronzhuk Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index eb96a6913235..437d36289786 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -2145,7 +2145,6 @@ static void netcp_delete_interface(struct netcp_device *netcp_device, of_node_put(netcp->node_interface); unregister_netdev(ndev); - netif_napi_del(&netcp->rx_napi); free_netdev(ndev); } -- cgit From 9a486c9dc515f7daa05a344621031ee09645c522 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 Sep 2017 12:35:14 -0700 Subject: net: tulip: Constify tulip_tbl It looks like all users of tulip_tbl are reads, so mark this table as read-only. $ git grep tulip_tbl # edited to avoid line-wraps... interrupt.c: iowrite32(tulip_tbl[tp->chip_id].valid_intrs, ... interrupt.c: iowrite32(tulip_tbl[tp->chip_id].valid_intrs&~RxPollInt, ... interrupt.c: iowrite32(tulip_tbl[tp->chip_id].valid_intrs, ... interrupt.c: iowrite32(tulip_tbl[tp->chip_id].valid_intrs | TimerInt, pnic.c: iowrite32(tulip_tbl[tp->chip_id].valid_intrs, ioaddr + CSR7); tulip.h: extern struct tulip_chip_table tulip_tbl[]; tulip_core.c:struct tulip_chip_table tulip_tbl[] = { tulip_core.c:iowrite32(tulip_tbl[tp->chip_id].valid_intrs, ioaddr + CSR5); tulip_core.c:iowrite32(tulip_tbl[tp->chip_id].valid_intrs, ioaddr + CSR7); tulip_core.c:setup_timer(&tp->timer, tulip_tbl[tp->chip_id].media_timer, tulip_core.c:const char *chip_name = tulip_tbl[chip_idx].chip_name; tulip_core.c:if (pci_resource_len (pdev, 0) < tulip_tbl[chip_idx].io_size) tulip_core.c:ioaddr = pci_iomap(..., tulip_tbl[chip_idx].io_size); tulip_core.c:tp->flags = tulip_tbl[chip_idx].flags; tulip_core.c:setup_timer(&tp->timer, tulip_tbl[tp->chip_id].media_timer, tulip_core.c:INIT_WORK(&tp->media_work, tulip_tbl[tp->chip_id].media_task); Cc: "David S. Miller" Cc: Jarod Wilson Cc: "Gustavo A. R. Silva" Cc: netdev@vger.kernel.org Cc: linux-parisc@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- drivers/net/ethernet/dec/tulip/tulip.h | 2 +- drivers/net/ethernet/dec/tulip/tulip_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/dec/tulip/tulip.h b/drivers/net/ethernet/dec/tulip/tulip.h index 38431a155f09..06660dbc44b7 100644 --- a/drivers/net/ethernet/dec/tulip/tulip.h +++ b/drivers/net/ethernet/dec/tulip/tulip.h @@ -515,7 +515,7 @@ void comet_timer(unsigned long data); extern int tulip_debug; extern const char * const medianame[]; extern const char tulip_media_cap[]; -extern struct tulip_chip_table tulip_tbl[]; +extern const struct tulip_chip_table tulip_tbl[]; void oom_timer(unsigned long data); extern u8 t21040_csr13[]; diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index 84394b43c0a1..851b6d1f5a42 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -138,7 +138,7 @@ static void tulip_timer(unsigned long data) * It is indexed via the values in 'enum chips' */ -struct tulip_chip_table tulip_tbl[] = { +const struct tulip_chip_table tulip_tbl[] = { { }, /* placeholder for array, slot unused currently */ { }, /* placeholder for array, slot unused currently */ -- cgit From 109980b894e9dae66c37c3d804a415aa68b19c7e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Sep 2017 00:14:51 +0200 Subject: bpf: don't select potentially stale ri->map from buggy xdp progs We can potentially run into a couple of issues with the XDP bpf_redirect_map() helper. The ri->map in the per CPU storage can become stale in several ways, mostly due to misuse, where we can then trigger a use after free on the map: i) prog A is calling bpf_redirect_map(), returning XDP_REDIRECT and running on a driver not supporting XDP_REDIRECT yet. The ri->map on that CPU becomes stale when the XDP program is unloaded on the driver, and a prog B loaded on a different driver which supports XDP_REDIRECT return code. prog B would have to omit calling to bpf_redirect_map() and just return XDP_REDIRECT, which would then access the freed map in xdp_do_redirect() since not cleared for that CPU. ii) prog A is calling bpf_redirect_map(), returning a code other than XDP_REDIRECT. prog A is then detached, which triggers release of the map. prog B is attached which, similarly as in i), would just return XDP_REDIRECT without having called bpf_redirect_map() and thus be accessing the freed map in xdp_do_redirect() since not cleared for that CPU. iii) prog A is attached to generic XDP, calling the bpf_redirect_map() helper and returning XDP_REDIRECT. xdp_do_generic_redirect() is currently not handling ri->map (will be fixed by Jesper), so it's not being reset. Later loading a e.g. native prog B which would, say, call bpf_xdp_redirect() and then returns XDP_REDIRECT would find in xdp_do_redirect() that a map was set and uses that causing use after free on map access. Fix thus needs to avoid accessing stale ri->map pointers, naive way would be to call a BPF function from drivers that just resets it to NULL for all XDP return codes but XDP_REDIRECT and including XDP_REDIRECT for drivers not supporting it yet (and let ri->map being handled in xdp_do_generic_redirect()). There is a less intrusive way w/o letting drivers call a reset for each BPF run. The verifier knows we're calling into bpf_xdp_redirect_map() helper, so it can do a small insn rewrite transparent to the prog itself in the sense that it fills R4 with a pointer to the own bpf_prog. We have that pointer at verification time anyway and R4 is allowed to be used as per calling convention we scratch R0 to R5 anyway, so they become inaccessible and program cannot read them prior to a write. Then, the helper would store the prog pointer in the current CPUs struct redirect_info. Later in xdp_do_*_redirect() we check whether the redirect_info's prog pointer is the same as passed xdp_prog pointer, and if that's the case then all good, since the prog holds a ref on the map anyway, so it is always valid at that point in time and must have a reference count of at least 1. If in the unlikely case they are not equal, it means we got a stale pointer, so we clear and bail out right there. Also do reset map and the owning prog in bpf_xdp_redirect(), so that bpf_xdp_redirect_map() and bpf_xdp_redirect() won't get mixed up, only the last call should take precedence. A tc bpf_redirect() doesn't use map anywhere yet, so no need to clear it there since never accessed in that layer. Note that in case the prog is released, and thus the map as well we're still under RCU read critical section at that time and have preemption disabled as well. Once we commit with the __dev_map_insert_ctx() from xdp_do_redirect_map() and set the map to ri->map_to_flush, we still wait for a xdp_do_flush_map() to finish in devmap dismantle time once flush_needed bit is set, so that is fine. Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Reported-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 16 ++++++++++++++++ net/core/filter.c | 21 +++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d690c7dd1f1a..477b6932c3c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4203,6 +4203,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->imm == BPF_FUNC_redirect_map) { + u64 addr = (unsigned long)prog; + struct bpf_insn r4_ld[] = { + BPF_LD_IMM64(BPF_REG_4, addr), + *insn, + }; + cnt = ARRAY_SIZE(r4_ld); + + new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + } patch_call_imm: fn = prog->aux->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed diff --git a/net/core/filter.c b/net/core/filter.c index 5912c738a7b2..0848df2cd9bf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1794,6 +1794,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; + const struct bpf_prog *map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -1807,7 +1808,6 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; - ri->map = NULL; return TC_ACT_REDIRECT; } @@ -2504,6 +2504,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + const struct bpf_prog *map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; struct net_device *fwd; @@ -2511,6 +2512,15 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; ri->map = NULL; + ri->map_owner = NULL; + + /* This is really only caused by a deliberately crappy + * BPF program, normally we would never hit that case, + * so no need to inform someone via tracepoints either, + * just bail out. + */ + if (unlikely(map_owner != xdp_prog)) + return -EINVAL; fwd = __dev_map_lookup_elem(map, index); if (!fwd) { @@ -2607,6 +2617,8 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; + ri->map = NULL; + ri->map_owner = NULL; return XDP_REDIRECT; } @@ -2619,7 +2631,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, + const struct bpf_prog *, map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -2629,10 +2642,14 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags ri->ifindex = ifindex; ri->flags = flags; ri->map = map; + ri->map_owner = map_owner; return XDP_REDIRECT; } +/* Note, arg4 is hidden from users and populated by the verifier + * with the right pointer. + */ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, -- cgit From bbbe211c295ffb309247adb7b871dda60d92d2d5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 8 Sep 2017 14:00:30 -0700 Subject: net: rcu lock and preempt disable missing around generic xdp do_xdp_generic must be called inside rcu critical section with preempt disabled to ensure BPF programs are valid and per-cpu variables used for redirect operations are consistent. This patch ensures this is true and fixes the splat below. The netif_receive_skb_internal() code path is now broken into two rcu critical sections. I decided it was better to limit the preempt_enable/disable block to just the xdp static key portion and the fallout is more rcu_read_lock/unlock calls. Seems like the best option to me. [ 607.596901] ============================= [ 607.596906] WARNING: suspicious RCU usage [ 607.596912] 4.13.0-rc4+ #570 Not tainted [ 607.596917] ----------------------------- [ 607.596923] net/core/dev.c:3948 suspicious rcu_dereference_check() usage! [ 607.596927] [ 607.596927] other info that might help us debug this: [ 607.596927] [ 607.596933] [ 607.596933] rcu_scheduler_active = 2, debug_locks = 1 [ 607.596938] 2 locks held by pool/14624: [ 607.596943] #0: (rcu_read_lock_bh){......}, at: [] ip_finish_output2+0x14d/0x890 [ 607.596973] #1: (rcu_read_lock_bh){......}, at: [] __dev_queue_xmit+0x14a/0xfd0 [ 607.597000] [ 607.597000] stack backtrace: [ 607.597006] CPU: 5 PID: 14624 Comm: pool Not tainted 4.13.0-rc4+ #570 [ 607.597011] Hardware name: Dell Inc. Precision Tower 5810/0HHV7N, BIOS A17 03/01/2017 [ 607.597016] Call Trace: [ 607.597027] dump_stack+0x67/0x92 [ 607.597040] lockdep_rcu_suspicious+0xdd/0x110 [ 607.597054] do_xdp_generic+0x313/0xa50 [ 607.597068] ? time_hardirqs_on+0x5b/0x150 [ 607.597076] ? mark_held_locks+0x6b/0xc0 [ 607.597088] ? netdev_pick_tx+0x150/0x150 [ 607.597117] netif_rx_internal+0x205/0x3f0 [ 607.597127] ? do_xdp_generic+0xa50/0xa50 [ 607.597144] ? lock_downgrade+0x2b0/0x2b0 [ 607.597158] ? __lock_is_held+0x93/0x100 [ 607.597187] netif_rx+0x119/0x190 [ 607.597202] loopback_xmit+0xfd/0x1b0 [ 607.597214] dev_hard_start_xmit+0x127/0x4e0 Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Fixes: b5cdae3291f7 ("net: Generic XDP") Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 6f845e4fec17..fb766d906148 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3981,8 +3981,13 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; + + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4500,18 +4505,20 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - rcu_read_lock(); - if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; - if (ret != XDP_PASS) { - rcu_read_unlock(); + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); + + if (ret != XDP_PASS) return NET_RX_DROP; - } } + rcu_read_lock(); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; -- cgit From 5a67da2a71c64daeb456f6f3e87b5c7cecdc5ffa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 8 Sep 2017 14:00:49 -0700 Subject: bpf: add support for sockmap detach programs The bpf map sockmap supports adding programs via attach commands. This patch adds the detach command to keep the API symmetric and allow users to remove previously added programs. Otherwise the user would have to delete the map and re-add it to get in this state. This also adds a series of additional tests to capture detach operation and also attaching/detaching invalid prog types. API note: socks will run (or not run) programs depending on the state of the map at the time the sock is added. We do not for example walk the map and remove programs from previously attached socks. Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 8 +++--- kernel/bpf/sockmap.c | 2 +- kernel/bpf/syscall.c | 27 ++++++++++------- tools/testing/selftests/bpf/test_maps.c | 51 ++++++++++++++++++++++++++++++++- 4 files changed, 72 insertions(+), 16 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c2cb1b5c094e..8390859e79e7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -385,16 +385,16 @@ static inline void __dev_map_flush(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); -int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { return NULL; } -static inline int sock_map_attach_prog(struct bpf_map *map, - struct bpf_prog *prog, - u32 type) +static inline int sock_map_prog(struct bpf_map *map, + struct bpf_prog *prog, + u32 type) { return -EOPNOTSUPP; } diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index f6ffde9c6a68..6424ce0e4969 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -792,7 +792,7 @@ out_progs: return err; } -int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_prog *orig; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 70ad8e220343..cb17e1cd1d43 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1096,10 +1096,10 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr) +static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) { + struct bpf_prog *prog = NULL; int ufd = attr->target_fd; - struct bpf_prog *prog; struct bpf_map *map; struct fd f; int err; @@ -1109,16 +1109,20 @@ static int sockmap_get_from_fd(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); + if (attach) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_SK_SKB); + if (IS_ERR(prog)) { + fdput(f); + return PTR_ERR(prog); + } } - err = sock_map_attach_prog(map, prog, attr->attach_type); + err = sock_map_prog(map, prog, attr->attach_type); if (err) { fdput(f); - bpf_prog_put(prog); + if (prog) + bpf_prog_put(prog); return err; } @@ -1155,7 +1159,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr); + return sockmap_get_from_fd(attr, true); default: return -EINVAL; } @@ -1204,7 +1208,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); cgroup_put(cgrp); break; - + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + ret = sockmap_get_from_fd(attr, false); + break; default: return -EINVAL; } diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 4acc772a28c0..fe3a443a1102 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -558,7 +558,7 @@ static void test_sockmap(int tasks, void *data) } } - /* Test attaching bad fds */ + /* Test attaching/detaching bad fds */ err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_PARSER, 0); if (!err) { printf("Failed invalid parser prog attach\n"); @@ -571,6 +571,30 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } + err = bpf_prog_attach(-1, fd, __MAX_BPF_ATTACH_TYPE, 0); + if (!err) { + printf("Failed unknown prog attach\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(fd, BPF_SK_SKB_STREAM_PARSER); + if (err) { + printf("Failed empty parser prog detach\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(fd, BPF_SK_SKB_STREAM_VERDICT); + if (err) { + printf("Failed empty verdict prog detach\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(fd, __MAX_BPF_ATTACH_TYPE); + if (!err) { + printf("Detach invalid prog successful\n"); + goto out_sockmap; + } + /* Load SK_SKB program and Attach */ err = bpf_prog_load(SOCKMAP_PARSE_PROG, BPF_PROG_TYPE_SK_SKB, &obj, &parse_prog); @@ -643,6 +667,13 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } + err = bpf_prog_attach(verdict_prog, map_fd_rx, + __MAX_BPF_ATTACH_TYPE, 0); + if (!err) { + printf("Attached unknown bpf prog\n"); + goto out_sockmap; + } + /* Test map update elem afterwards fd lives in fd and map_fd */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(map_fd_rx, &i, &sfd[i], BPF_ANY); @@ -809,6 +840,24 @@ static void test_sockmap(int tasks, void *data) assert(status == 0); } + err = bpf_prog_detach(map_fd_rx, __MAX_BPF_ATTACH_TYPE); + if (!err) { + printf("Detached an invalid prog type.\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(map_fd_rx, BPF_SK_SKB_STREAM_PARSER); + if (err) { + printf("Failed parser prog detach\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(map_fd_rx, BPF_SK_SKB_STREAM_VERDICT); + if (err) { + printf("Failed parser prog detach\n"); + goto out_sockmap; + } + /* Test map close sockets */ for (i = 0; i < 6; i++) close(sfd[i]); -- cgit From 374fb014fc5b15e420faa00af036868a635eadd3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 8 Sep 2017 14:01:10 -0700 Subject: bpf: devmap, use cond_resched instead of cpu_relax Be a bit more friendly about waiting for flush bits to complete. Replace the cpu_relax() with a cond_resched(). Suggested-by: Daniel Borkmann Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ecf9f99ecc57..959c9a07f318 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -159,7 +159,7 @@ static void dev_map_free(struct bpf_map *map) unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); while (!bitmap_empty(bitmap, dtab->map.max_entries)) - cpu_relax(); + cond_resched(); } for (i = 0; i < dtab->map.max_entries; i++) { -- cgit From a010a2f6540ecc39f8701c6f7d35e22992c03fb6 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 8 Sep 2017 15:38:07 -0700 Subject: Revert "mdio_bus: Remove unneeded gpiod NULL check" This reverts commit 95b80bf3db03c2bf572a357cf74b9a6aefef0a4a ("mdio_bus: Remove unneeded gpiod NULL check"), this commit assumed that GPIOLIB checks for NULL descriptors, so it's safe to drop them, but it is not when CONFIG_GPIOLIB is disabled in the kernel. If we do call gpiod_set_value_cansleep() on a GPIO descriptor we will issue warnings coming from the inline stubs declared in include/linux/gpio/consumer.h. Fixes: 95b80bf3db03 ("mdio_bus: Remove unneeded gpiod NULL check") Reported-by: Woojung Huh Signed-off-by: Florian Fainelli Signed-off-by: Linus Walleij Acked-by: Linus Walleij Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index b6f9fa670168..2df7b62c1a36 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -399,7 +399,8 @@ error: } /* Put PHYs in RESET to save power */ - gpiod_set_value_cansleep(bus->reset_gpiod, 1); + if (bus->reset_gpiod) + gpiod_set_value_cansleep(bus->reset_gpiod, 1); device_del(&bus->dev); return err; @@ -424,7 +425,8 @@ void mdiobus_unregister(struct mii_bus *bus) } /* Put PHYs in RESET to save power */ - gpiod_set_value_cansleep(bus->reset_gpiod, 1); + if (bus->reset_gpiod) + gpiod_set_value_cansleep(bus->reset_gpiod, 1); device_del(&bus->dev); } -- cgit From 9beb8bedb05c5f3a353dba62b8fa7cbbb9ec685e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 Sep 2017 01:40:35 +0200 Subject: bpf: make error reporting in bpf_warn_invalid_xdp_action more clear Differ between illegal XDP action code and just driver unsupported one to provide better feedback when we throw a one-time warning here. Reason is that with 814abfabef3c ("xdp: add bpf_redirect helper function") not all drivers support the new XDP return code yet and thus they will fall into their 'default' case when checking for return codes after program return, which then triggers a bpf_warn_invalid_xdp_action() stating that the return code is illegal, but from XDP perspective it's not. I decided not to place something like a XDP_ACT_MAX define into uapi i) given we don't have this either for all other program types, ii) future action codes could have further encoding there, which would render such define unsuitable and we wouldn't be able to rip it out again, and iii) we rarely add new action codes. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ba848b761cfb..43ab5c402f98 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -766,8 +766,8 @@ struct bpf_sock { /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other - * return codes are reserved for future use. Unknown return codes will result - * in packet drop. + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). */ enum xdp_action { XDP_ABORTED = 0, diff --git a/net/core/filter.c b/net/core/filter.c index 0848df2cd9bf..3a50a9b021e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3609,7 +3609,11 @@ static bool xdp_is_valid_access(int off, int size, void bpf_warn_invalid_xdp_action(u32 act) { - WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); + const u32 act_max = XDP_REDIRECT; + + WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n", + act > act_max ? "Illegal" : "Driver unsupported", + act); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -- cgit