From 6cafaf4764a32597c2195aa5411b87728e1fde8a Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 20 Jun 2016 21:11:45 +0800 Subject: netfilter: nf_tables: fix memory leak if expr init fails If expr init fails then we need to free it. So when the user adds an nft rule as follows: # nft add rule filter input tcp dport 22 flow table ssh \ { ip saddr limit rate 0/second } a memory leak will happen. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2c881871db38..cf7c74599cbe 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1724,9 +1724,11 @@ struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, err = nf_tables_newexpr(ctx, &info, expr); if (err < 0) - goto err2; + goto err3; return expr; +err3: + kfree(expr); err2: module_put(info.ops->type->owner); err1: -- cgit From 62131e5d735226074cba53095545d76b491e5003 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 8 Jun 2016 20:20:10 +0800 Subject: netfilter: nft_meta: set skb->nf_trace appropriately When a user adds an nft rule to set nftrace to zero, for example: # nft add rule ip filter input nftrace set 0 We should set nf_trace to zero also. 
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 16c50b0dd426..f4bad9dc15c4 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -227,7 +227,7 @@ void nft_meta_set_eval(const struct nft_expr *expr, skb->pkt_type = value; break; case NFT_META_NFTRACE: - skb->nf_trace = 1; + skb->nf_trace = !!value; break; default: WARN_ON(1); -- cgit From 9cc1c73ad66610bffc80b691136ffc1e9a3b1a58 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 24 Apr 2016 01:18:21 +0200 Subject: netfilter: conntrack: avoid integer overflow when resizing Can overflow so we might allocate very small table when bucket count is high on a 32bit platform. Note: resize is only possible from init_netns. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f204274a9b6b..62c42e970c89 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1601,8 +1601,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) unsigned int nr_slots, i; size_t sz; + if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + + if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + sz = nr_slots * sizeof(struct hlist_nulls_head); hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); -- cgit From 10c78f5854d361ded4736c1831948e0a5f67b932 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 2 Jul 2016 09:52:13 +0200 Subject: batman-adv: Avoid nullptr dereference in bla after 
vlan_insert_tag vlan_insert_tag can return NULL on errors. The bridge loop avoidance code therefore has to check the return value of vlan_insert_tag for NULL before it can safely operate on this pointer. Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 748a9ead7ce5..712978024c5d 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -418,9 +418,12 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, break; } - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb) + goto out; + } skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, soft_iface); -- cgit From 60154a1e0495ffb8343a95cefe1e874634572fa8 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 2 Jul 2016 09:52:14 +0200 Subject: batman-adv: Avoid nullptr dereference in dat after vlan_insert_tag vlan_insert_tag can return NULL on errors. The distributed arp table code therefore has to check the return value of vlan_insert_tag for NULL before it can safely operate on this pointer. 
Fixes: be1db4f6615b ("batman-adv: make the Distributed ARP Table vlan aware") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/distributed-arp-table.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 278800a99c69..aee3b3991471 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1009,9 +1009,12 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, if (!skb_new) goto out; - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb_new) + goto out; + } skb_reset_mac_header(skb_new); skb_new->protocol = eth_type_trans(skb_new, @@ -1089,9 +1092,12 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, */ skb_reset_mac_header(skb_new); - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb_new) + goto out; + } /* To preserve backwards compatibility, the node has choose the outgoing * format based on the incoming request packet type. The assumption is -- cgit From 33fbb1f3db87ce53da925b3e034b4dd446d483f8 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 30 Jun 2016 20:10:46 +0200 Subject: batman-adv: Fix orig_node_vlan leak on orig_node_release batadv_orig_node_new uses batadv_orig_node_vlan_new to allocate a new batadv_orig_node_vlan and add it to batadv_orig_node::vlan_list. References to this list have also to be cleaned when the batadv_orig_node is removed. 
Fixes: 7ea7b4a14275 ("batman-adv: make the TT CRC logic VLAN specific") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/originator.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 7f51bc2c06eb..fe2fcda4a984 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -765,6 +765,7 @@ static void batadv_orig_node_release(struct kref *ref) struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; + struct batadv_orig_node_vlan *vlan; orig_node = container_of(ref, struct batadv_orig_node, refcount); @@ -784,6 +785,13 @@ static void batadv_orig_node_release(struct kref *ref) } spin_unlock_bh(&orig_node->neigh_list_lock); + spin_lock_bh(&orig_node->vlan_list_lock); + hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { + hlist_del_rcu(&vlan->list); + batadv_orig_node_vlan_put(vlan); + } + spin_unlock_bh(&orig_node->vlan_list_lock); + /* Free nc_nodes */ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); -- cgit From 3db0decf1185357d6ab2256d0dede1ca9efda03d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 1 Jul 2016 15:49:43 +0200 Subject: batman-adv: Fix non-atomic bla_claim::backbone_gw access The pointer batadv_bla_claim::backbone_gw can be changed at any time. Therefore, access to it must be protected to ensure that two functions accessing the same backbone_gw are actually accessing the same one. This is especially important when the crc_lock is used or when the backbone_gw of a claim is exchanged. Not doing so leads to invalid memory access and/or reference leaks. 
Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Fixes: 5a1dd8a4773d ("batman-adv: lock crc access in bridge loop avoidance") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 111 ++++++++++++++++++++++++++------- net/batman-adv/types.h | 2 + 2 files changed, 90 insertions(+), 23 deletions(-) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 712978024c5d..825a5cdf4382 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -177,10 +177,21 @@ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) static void batadv_claim_release(struct kref *ref) { struct batadv_bla_claim *claim; + struct batadv_bla_backbone_gw *old_backbone_gw; claim = container_of(ref, struct batadv_bla_claim, refcount); - batadv_backbone_gw_put(claim->backbone_gw); + spin_lock_bh(&claim->backbone_lock); + old_backbone_gw = claim->backbone_gw; + claim->backbone_gw = NULL; + spin_unlock_bh(&claim->backbone_lock); + + spin_lock_bh(&old_backbone_gw->crc_lock); + old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&old_backbone_gw->crc_lock); + + batadv_backbone_gw_put(old_backbone_gw); + kfree_rcu(claim, rcu); } @@ -677,8 +688,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { + struct batadv_bla_backbone_gw *old_backbone_gw; struct batadv_bla_claim *claim; struct batadv_bla_claim search_claim; + bool remove_crc = false; int hash_added; ether_addr_copy(search_claim.addr, mac); @@ -692,8 +705,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, return; ether_addr_copy(claim->addr, mac); + spin_lock_init(&claim->backbone_lock); claim->vid = vid; claim->lasttime = jiffies; + kref_get(&backbone_gw->refcount); claim->backbone_gw = 
backbone_gw; kref_init(&claim->refcount); @@ -721,15 +736,26 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, "bla_add_claim(): changing ownership for %pM, vid %d\n", mac, BATADV_PRINT_VID(vid)); - spin_lock_bh(&claim->backbone_gw->crc_lock); - claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); - spin_unlock_bh(&claim->backbone_gw->crc_lock); - batadv_backbone_gw_put(claim->backbone_gw); + remove_crc = true; } - /* set (new) backbone gw */ + + /* replace backbone_gw atomically and adjust reference counters */ + spin_lock_bh(&claim->backbone_lock); + old_backbone_gw = claim->backbone_gw; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; + spin_unlock_bh(&claim->backbone_lock); + + if (remove_crc) { + /* remove claim address from old backbone_gw */ + spin_lock_bh(&old_backbone_gw->crc_lock); + old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&old_backbone_gw->crc_lock); + } + batadv_backbone_gw_put(old_backbone_gw); + + /* add claim address to new backbone_gw */ spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&backbone_gw->crc_lock); @@ -739,6 +765,26 @@ claim_free_ref: batadv_claim_put(claim); } +/** + * batadv_bla_claim_get_backbone_gw - Get valid reference for backbone_gw of + * claim + * @claim: claim whose backbone_gw should be returned + * + * Return: valid reference to claim::backbone_gw + */ +static struct batadv_bla_backbone_gw * +batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim) +{ + struct batadv_bla_backbone_gw *backbone_gw; + + spin_lock_bh(&claim->backbone_lock); + backbone_gw = claim->backbone_gw; + kref_get(&backbone_gw->refcount); + spin_unlock_bh(&claim->backbone_lock); + + return backbone_gw; +} + /** * batadv_bla_del_claim - delete a claim from the claim hash * @bat_priv: the bat priv with all the soft interface information @@ -763,10 +809,6 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, 
batadv_choose_claim, claim); batadv_claim_put(claim); /* reference from the hash is gone */ - spin_lock_bh(&claim->backbone_gw->crc_lock); - claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); - spin_unlock_bh(&claim->backbone_gw->crc_lock); - /* don't need the reference from hash_find() anymore */ batadv_claim_put(claim); } @@ -1219,6 +1261,7 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, int now) { + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct hlist_head *head; struct batadv_hashtable *hash; @@ -1233,14 +1276,17 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; - if (!batadv_compare_eth(claim->backbone_gw->orig, + + if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) - continue; + goto skip; + if (!batadv_has_timed_out(claim->lasttime, BATADV_BLA_CLAIM_TIMEOUT)) - continue; + goto skip; batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_purge_claims(): %pM, vid %d, time out\n", @@ -1248,8 +1294,10 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, purge_now: batadv_handle_unclaim(bat_priv, primary_if, - claim->backbone_gw->orig, + backbone_gw->orig, claim->addr, claim->vid); +skip: + batadv_backbone_gw_put(backbone_gw); } rcu_read_unlock(); } @@ -1760,9 +1808,11 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, bool is_bcast) { + struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; + bool own_claim; bool ret; ethhdr = eth_hdr(skb); @@ -1797,8 +1847,12 @@ bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, } /* if it is 
our own claim ... */ - if (batadv_compare_eth(claim->backbone_gw->orig, - primary_if->net_dev->dev_addr)) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + own_claim = batadv_compare_eth(backbone_gw->orig, + primary_if->net_dev->dev_addr); + batadv_backbone_gw_put(backbone_gw); + + if (own_claim) { /* ... allow it in any case */ claim->lasttime = jiffies; goto allow; @@ -1862,7 +1916,9 @@ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hard_iface *primary_if; + bool client_roamed; bool ret = false; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -1892,8 +1948,12 @@ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, goto allow; /* check if we are responsible. */ - if (batadv_compare_eth(claim->backbone_gw->orig, - primary_if->net_dev->dev_addr)) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + client_roamed = batadv_compare_eth(backbone_gw->orig, + primary_if->net_dev->dev_addr); + batadv_backbone_gw_put(backbone_gw); + + if (client_roamed) { /* if yes, the client has roamed and we have * to unclaim it. 
*/ @@ -1941,6 +2001,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) struct net_device *net_dev = (struct net_device *)seq->private; struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hashtable *hash = bat_priv->bla.claim_hash; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct batadv_hard_iface *primary_if; struct hlist_head *head; @@ -1965,17 +2026,21 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { - is_own = batadv_compare_eth(claim->backbone_gw->orig, + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); - spin_lock_bh(&claim->backbone_gw->crc_lock); - backbone_crc = claim->backbone_gw->crc; - spin_unlock_bh(&claim->backbone_gw->crc_lock); + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n", claim->addr, BATADV_PRINT_VID(claim->vid), - claim->backbone_gw->orig, + backbone_gw->orig, (is_own ? 
'x' : ' '), backbone_crc); + + batadv_backbone_gw_put(backbone_gw); } rcu_read_unlock(); } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ba846b078af8..005122234b90 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1042,6 +1042,7 @@ struct batadv_bla_backbone_gw { * @addr: mac address of claimed non-mesh client * @vid: vlan id this client was detected on * @backbone_gw: pointer to backbone gw claiming this client + * @backbone_lock: lock protecting backbone_gw pointer * @lasttime: last time we heard of claim (locals only) * @hash_entry: hlist node for batadv_priv_bla::claim_hash * @refcount: number of contexts the object is used @@ -1051,6 +1052,7 @@ struct batadv_bla_claim { u8 addr[ETH_ALEN]; unsigned short vid; struct batadv_bla_backbone_gw *backbone_gw; + spinlock_t backbone_lock; /* protects backbone_gw */ unsigned long lasttime; struct hlist_node hash_entry; struct rcu_head rcu; -- cgit From 15c2ed753cd9e3e746472deab8151337a5b6da56 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 30 Jun 2016 20:11:34 +0200 Subject: batman-adv: Fix reference leak in batadv_find_router The replacement of last_bonding_candidate in batadv_orig_node has to be an atomic operation. Otherwise it is possible that the reference counter of a batadv_orig_ifinfo is reduced which was no longer the last_bonding_candidate when the new candidate is added. This can either lead to an invalid memory access or to reference leaks which make it impossible to remove an interface which was added to batman-adv. 
Fixes: f3b3d9018975 ("batman-adv: add bonding again") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/routing.c | 52 ++++++++++++++++++++++++++++++++++++------------ net/batman-adv/types.h | 4 +++- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 6c2901a86230..bfac086b4d01 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -455,6 +455,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, return 0; } +/** + * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node + * @orig_node: originator node whose bonding candidates should be replaced + * @new_candidate: new bonding candidate or NULL + */ +static void +batadv_last_bonding_replace(struct batadv_orig_node *orig_node, + struct batadv_orig_ifinfo *new_candidate) +{ + struct batadv_orig_ifinfo *old_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + old_candidate = orig_node->last_bonding_candidate; + + if (new_candidate) + kref_get(&new_candidate->refcount); + orig_node->last_bonding_candidate = new_candidate; + spin_unlock_bh(&orig_node->neigh_list_lock); + + if (old_candidate) + batadv_orig_ifinfo_put(old_candidate); +} + /** * batadv_find_router - find a suitable router for this originator * @bat_priv: the bat priv with all the soft interface information @@ -562,10 +585,6 @@ next: } rcu_read_unlock(); - /* last_bonding_candidate is reset below, remove the old reference. */ - if (orig_node->last_bonding_candidate) - batadv_orig_ifinfo_put(orig_node->last_bonding_candidate); - /* After finding candidates, handle the three cases: * 1) there is a next candidate, use that * 2) there is no next candidate, use the first of the list @@ -574,21 +593,28 @@ next: if (next_candidate) { batadv_neigh_node_put(router); - /* remove references to first candidate, we don't need it. 
*/ - if (first_candidate) { - batadv_neigh_node_put(first_candidate_router); - batadv_orig_ifinfo_put(first_candidate); - } + kref_get(&next_candidate_router->refcount); router = next_candidate_router; - orig_node->last_bonding_candidate = next_candidate; + batadv_last_bonding_replace(orig_node, next_candidate); } else if (first_candidate) { batadv_neigh_node_put(router); - /* refcounting has already been done in the loop above. */ + kref_get(&first_candidate_router->refcount); router = first_candidate_router; - orig_node->last_bonding_candidate = first_candidate; + batadv_last_bonding_replace(orig_node, first_candidate); } else { - orig_node->last_bonding_candidate = NULL; + batadv_last_bonding_replace(orig_node, NULL); + } + + /* cleanup of candidates */ + if (first_candidate) { + batadv_neigh_node_put(first_candidate_router); + batadv_orig_ifinfo_put(first_candidate); + } + + if (next_candidate) { + batadv_neigh_node_put(next_candidate_router); + batadv_orig_ifinfo_put(next_candidate); } return router; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 005122234b90..74d865a4df46 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -330,7 +330,9 @@ struct batadv_orig_node { DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); u32 last_bcast_seqno; struct hlist_head neigh_list; - /* neigh_list_lock protects: neigh_list and router */ + /* neigh_list_lock protects: neigh_list, ifinfo_list, + * last_bonding_candidate and router + */ spinlock_t neigh_list_lock; struct hlist_node hash_entry; struct batadv_priv *bat_priv; -- cgit From cbef1e102003edb236c6b2319ab269ccef963731 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 30 Jun 2016 21:41:13 +0200 Subject: batman-adv: Free last_bonding_candidate on release of orig_node The orig_ifinfo reference counter for last_bonding_candidate in batadv_orig_node has to be reduced when an originator node is released. 
Otherwise the orig_ifinfo is leaked and the reference counter of the netdevice is not reduced correctly. Fixes: f3b3d9018975 ("batman-adv: add bonding again") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/originator.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index fe2fcda4a984..ab8c4f9738fe 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -766,6 +766,7 @@ static void batadv_orig_node_release(struct kref *ref) struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_orig_node_vlan *vlan; + struct batadv_orig_ifinfo *last_candidate; orig_node = container_of(ref, struct batadv_orig_node, refcount); @@ -783,8 +784,14 @@ static void batadv_orig_node_release(struct kref *ref) hlist_del_rcu(&orig_ifinfo->list); batadv_orig_ifinfo_put(orig_ifinfo); } + + last_candidate = orig_node->last_bonding_candidate; + orig_node->last_bonding_candidate = NULL; spin_unlock_bh(&orig_node->neigh_list_lock); + if (last_candidate) + batadv_orig_ifinfo_put(last_candidate); + spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { hlist_del_rcu(&vlan->list); -- cgit From 6e8ef842223b90a33efd570128bb566a9ae6f5ad Mon Sep 17 00:00:00 2001 From: Purushottam Kushwaha Date: Tue, 5 Jul 2016 13:44:51 +0530 Subject: nl80211: Move ACL parsing later to avoid a possible memory leak No support for pbss results in a memory leak for the acl_data (if parse_acl_data succeeds). Fix this by moving the ACL parsing later. 
Cc: stable@vger.kernel.org Fixes: 34d505193bd10 ("cfg80211: basic support for PBSS network type") Signed-off-by: Purushottam Kushwaha Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d7599014055d..7d72283901a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3487,16 +3487,16 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) params.smps_mode = NL80211_SMPS_OFF; } + params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); + if (params.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) + return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_ACL_POLICY]) { params.acl = parse_acl_data(&rdev->wiphy, info); if (IS_ERR(params.acl)) return PTR_ERR(params.acl); } - params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); - if (params.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) - return -EOPNOTSUPP; - wdev_lock(wdev); err = rdev_start_ap(rdev, dev, &params); if (!err) { -- cgit From 16a910a6722b7a8680409e634c7c0dac073c01e4 Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Tue, 5 Jul 2016 15:23:10 +0300 Subject: cfg80211: handle failed skb allocation Handle the case when dev_alloc_skb returns NULL. Cc: stable@vger.kernel.org Fixes: 2b67f944f88c2 ("cfg80211: reuse existing page fragments in A-MSDU rx") Signed-off-by: Gregory Greenman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index 2443ee30ba5b..b7d1592bd5b8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -721,6 +721,8 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, * alignment since sizeof(struct ethhdr) is 14. 
*/ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); + if (!frame) + return NULL; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); -- cgit From d1fe176ca51fa3cb35f70c1d876d9a090e9befce Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Jun 2016 10:43:19 +0200 Subject: batman-adv: Fix speedy join in gateway client mode Speedy join only works when the received packet is either broadcast or an 4addr unicast packet. Thus packets converted from broadcast to unicast via the gateway handling code have to be converted to 4addr packets to allow the receiving gateway server to add the sender address as temporary entry to the translation table. Not doing it will make the batman-adv gateway server drop the DHCP response in many situations because it doesn't yet have the TT entry for the destination of the DHCP response. Fixes: 371351731e9c ("batman-adv: change interface_rx to get orig node") Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index f2f125684ed9..010397650fa5 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -424,8 +424,8 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node *orig_node; orig_node = batadv_gw_get_selected_orig(bat_priv); - return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, - orig_node, vid); + return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, + BATADV_P_DATA, orig_node, vid); } void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface) -- cgit From 3777ed688fba82d0bd43f9fc1ebbc6abe788576d Mon Sep 17 00:00:00 2001 From: Quentin Armitage Date: Thu, 16 Jun 2016 08:00:14 +0100 Subject: ipvs: fix bind to link-local mcast IPv6 address in backup When 
using HEAD from https://git.kernel.org/cgit/utils/kernel/ipvsadm/ipvsadm.git/, the command: ipvsadm --start-daemon backup --mcast-interface eth0.60 \ --mcast-group ff02::1:81 fails with the error message: Argument list too long whereas both: ipvsadm --start-daemon master --mcast-interface eth0.60 \ --mcast-group ff02::1:81 and: ipvsadm --start-daemon backup --mcast-interface eth0.60 \ --mcast-group 224.0.0.81 are successful. The error message "Argument list too long" isn't helpful. The error occurs because an IPv6 address is given in backup mode. The error is in make_receive_sock() in net/netfilter/ipvs/ip_vs_sync.c, since it fails to set the interface on the address or the socket before calling inet6_bind() (via sock->ops->bind), where the test 'if (!sk->sk_bound_dev_if)' failed. Setting sock->sk->sk_bound_dev_if on the socket before calling inet6_bind() resolves the issue. Fixes: d33288172e72 ("ipvs: add more mcast parameters for the sync daemon") Signed-off-by: Quentin Armitage Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sync.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 803001a45aa1..1b07578bedf3 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1545,7 +1545,8 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) +static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, + int ifindex) { /* multicast addr */ union ipvs_sockaddr mcast_addr; @@ -1566,6 +1567,7 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) set_sock_size(sock->sk, 0, result); get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + sock->sk->sk_bound_dev_if = ifindex; result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast 
addr\n"); @@ -1868,7 +1870,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, if (state == IP_VS_STATE_MASTER) sock = make_send_sock(ipvs, id); else - sock = make_receive_sock(ipvs, id); + sock = make_receive_sock(ipvs, id, dev->ifindex); if (IS_ERR(sock)) { result = PTR_ERR(sock); goto outtinfo; -- cgit From c8607e020014cf11a61601a0005270bad81cabdf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Jul 2016 14:53:06 +0200 Subject: netfilter: nft_ct: fix expiration getter We need to compute timeout.expires - jiffies, not the other way around. Add a helper, another patch can then later change more places in conntrack code where we currently open-code this. Will allow us to only change one place later when we remove per-ct timer. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 8 ++++++++ net/netfilter/nft_ct.c | 6 +----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index dd78bea227c8..b6083c34ef0d 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -284,6 +284,14 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } +/* jiffies until ct expires, 0 if already expired */ +static inline unsigned long nf_ct_expires(const struct nf_conn *ct) +{ + long timeout = (long)ct->timeout.expires - (long)jiffies; + + return timeout > 0 ? 
timeout : 0; +} + struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 137e308d5b24..81fbb450783e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -54,7 +54,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, const struct nf_conn_help *help; const struct nf_conntrack_tuple *tuple; const struct nf_conntrack_helper *helper; - long diff; unsigned int state; ct = nf_ct_get(pkt->skb, &ctinfo); @@ -94,10 +93,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; #endif case NFT_CT_EXPIRATION: - diff = (long)jiffies - (long)ct->timeout.expires; - if (diff < 0) - diff = 0; - *dest = jiffies_to_msecs(diff); + *dest = jiffies_to_msecs(nf_ct_expires(ct)); return; case NFT_CT_HELPER: if (ct->master == NULL) -- cgit From 06708f81528725148473c0869d6af5f809c6824b Mon Sep 17 00:00:00 2001 From: Dmitri Epshtein Date: Wed, 6 Jul 2016 04:18:58 +0200 Subject: net: mvneta: set real interrupt per packet for tx_done Commit aebea2ba0f74 ("net: mvneta: fix Tx interrupt delay") intended to set coalescing threshold to a value guaranteeing interrupt generation per each sent packet, so that buffers can be released with no delay. In fact setting threshold to '1' was wrong, because it causes interrupt every two packets. According to the documentation a reason behind it is following - interrupt occurs once sent buffers counter reaches a value, which is higher than one specified in MVNETA_TXQ_SIZE_REG(q). This behavior was confirmed during tests. Also when testing the SoC working as a NAS device, better performance was observed with int-per-packet, as it strongly depends on the fact that all transmitted packets are released immediately. This commit enables NETA controller work in interrupt per sent packet mode by setting coalescing threshold to 0. 
Signed-off-by: Dmitri Epshtein Signed-off-by: Marcin Wojtas Cc: # v3.10+ Fixes: aebea2ba0f74 ("net: mvneta: fix Tx interrupt delay") Acked-by: Willy Tarreau Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index d5d263bda333..f92018b13d28 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -244,7 +244,7 @@ /* Various constants */ /* Coalescing */ -#define MVNETA_TXDONE_COAL_PKTS 1 +#define MVNETA_TXDONE_COAL_PKTS 0 /* interrupt per packet */ #define MVNETA_RX_COAL_PKTS 32 #define MVNETA_RX_COAL_USEC 100 -- cgit From 205e1e255c479f3fd77446415706463b282f94e4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 5 Jul 2016 22:12:36 -0700 Subject: ppp: defer netns reference release for ppp channel Matt reported that we have a NULL pointer dereference in ppp_pernet() from ppp_connect_channel(), i.e. pch->chan_net is NULL. This is because a parallel ppp_unregister_channel() could happen while we are in ppp_connect_channel(), during which pch->chan_net is set to NULL. Since we need a reference to net per channel, it makes sense to sync the refcnt with the lifetime of the channel, therefore we should release this reference when we destroy it. Fixes: 1f461dcdd296 ("ppp: take reference on channels netns") Reported-by: Matt Bennett Cc: Paul Mackerras Cc: linux-ppp@vger.kernel.org Cc: Guillaume Nault Cc: Cyrill Gorcunov Signed-off-by: Cong Wang Reviewed-by: Cyrill Gorcunov Signed-off-by: David S.
Miller --- drivers/net/ppp/ppp_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 8dedafa1a95d..a30ee427efab 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2601,8 +2601,6 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); - put_net(pch->chan_net); - pch->chan_net = NULL; pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); @@ -3136,6 +3134,9 @@ ppp_disconnect_channel(struct channel *pch) */ static void ppp_destroy_channel(struct channel *pch) { + put_net(pch->chan_net); + pch->chan_net = NULL; + atomic_dec(&channel_count); if (!pch->file.dead) { -- cgit From 92f7d07d68c1dfcbb80b3259f29dad8efe890803 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Wed, 6 Jul 2016 17:35:59 +0800 Subject: r8152: remove the setting of LAN_WAKE_EN The LAN_WAKE_EN is not used to determine if the device could support WOL. It is used to signal a GPIO pin when a WOL event occurs. The WOL still works even though it is disabled. Signed-off-by: Hayes Wang Signed-off-by: David S. 
Miller --- drivers/net/usb/r8152.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 0da72d39b4f9..419f4cee432b 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2296,10 +2296,6 @@ static u32 __rtl_get_wol(struct r8152 *tp) u32 ocp_data; u32 wolopts = 0; - ocp_data = ocp_read_byte(tp, MCU_TYPE_PLA, PLA_CONFIG5); - if (!(ocp_data & LAN_WAKE_EN)) - return 0; - ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_CONFIG34); if (ocp_data & LINK_ON_WAKE_EN) wolopts |= WAKE_PHY; @@ -2332,15 +2328,13 @@ static void __rtl_set_wol(struct r8152 *tp, u32 wolopts) ocp_write_word(tp, MCU_TYPE_PLA, PLA_CONFIG34, ocp_data); ocp_data = ocp_read_word(tp, MCU_TYPE_PLA, PLA_CONFIG5); - ocp_data &= ~(UWF_EN | BWF_EN | MWF_EN | LAN_WAKE_EN); + ocp_data &= ~(UWF_EN | BWF_EN | MWF_EN); if (wolopts & WAKE_UCAST) ocp_data |= UWF_EN; if (wolopts & WAKE_BCAST) ocp_data |= BWF_EN; if (wolopts & WAKE_MCAST) ocp_data |= MWF_EN; - if (wolopts & WAKE_ANY) - ocp_data |= LAN_WAKE_EN; ocp_write_word(tp, MCU_TYPE_PLA, PLA_CONFIG5, ocp_data); ocp_write_byte(tp, MCU_TYPE_PLA, PLA_CRWECR, CRWECR_NORAML); -- cgit From b8efb894e672bd0080126c68a076ddcacfcbc0ef Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Wed, 6 Jul 2016 15:35:15 -0500 Subject: ibmvnic: properly start and stop tx queues Since ibmvnic uses multiple tx queues, start and stop all queues when opening and closing devices. Signed-off-by: Thomas Falcon Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index ecdb6854a898..f04830e237d9 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -469,7 +469,8 @@ static int ibmvnic_open(struct net_device *netdev) crq.logical_link_state.link_state = IBMVNIC_LOGICAL_LNK_UP; ibmvnic_send_crq(adapter, &crq); - netif_start_queue(netdev); + netif_tx_start_all_queues(netdev); + return 0; bounce_map_failed: @@ -519,7 +520,7 @@ static int ibmvnic_close(struct net_device *netdev) for (i = 0; i < adapter->req_rx_queues; i++) napi_disable(&adapter->napi[i]); - netif_stop_queue(netdev); + netif_tx_stop_all_queues(netdev); if (adapter->bounce_buffer) { if (!dma_mapping_error(dev, adapter->bounce_buffer_dma)) { -- cgit From 88eb98a0178219e1d6e9037b71d293f19b89eef2 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Wed, 6 Jul 2016 15:35:16 -0500 Subject: ibmvnic: dispose irq mappings IRQ mappings were not being properly disposed of when releasing sub-CRQs. Signed-off-by: Thomas Falcon Signed-off-by: David S.
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index f04830e237d9..79d2ab360805 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1257,6 +1257,7 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) if (adapter->tx_scrq[i]) { free_irq(adapter->tx_scrq[i]->irq, adapter->tx_scrq[i]); + irq_dispose_mapping(adapter->tx_scrq[i]->irq); release_sub_crq_queue(adapter, adapter->tx_scrq[i]); } @@ -1268,6 +1269,7 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) if (adapter->rx_scrq[i]) { free_irq(adapter->rx_scrq[i]->irq, adapter->rx_scrq[i]); + irq_dispose_mapping(adapter->rx_scrq[i]->irq); release_sub_crq_queue(adapter, adapter->rx_scrq[i]); } -- cgit From ea22d51a7831b062978fcf07c3c5ac7ecbb6cbeb Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Wed, 6 Jul 2016 15:35:17 -0500 Subject: ibmvnic: simplify and improve driver probe function This patch creates a function that handles sub-CRQ IRQ creation separately from sub-CRQ initialization. Another function is then needed to release sub-CRQ resources prior to sub-CRQ IRQ creation. These additions allow the driver probe function to be simplified, specifically during the VNIC Server login process. A timeout is also included while waiting for completion of the login process in case the VNIC Server is not available or some other error occurs. Signed-off-by: Thomas Falcon Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 159 ++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 56 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 79d2ab360805..52b0c07d3ca4 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -89,6 +89,7 @@ MODULE_VERSION(IBMVNIC_DRIVER_VERSION); static int ibmvnic_version = IBMVNIC_INITIAL_VERSION; static int ibmvnic_remove(struct vio_dev *); static void release_sub_crqs(struct ibmvnic_adapter *); +static void release_sub_crqs_no_irqs(struct ibmvnic_adapter *); static int ibmvnic_reset_crq(struct ibmvnic_adapter *); static int ibmvnic_send_crq_init(struct ibmvnic_adapter *); static int ibmvnic_reenable_crq_queue(struct ibmvnic_adapter *); @@ -1213,12 +1214,6 @@ static struct ibmvnic_sub_crq_queue *init_sub_crq_queue(struct ibmvnic_adapter goto reg_failed; } - scrq->irq = irq_create_mapping(NULL, scrq->hw_irq); - if (scrq->irq == NO_IRQ) { - dev_err(dev, "Error mapping irq\n"); - goto map_irq_failed; - } - scrq->adapter = adapter; scrq->size = 4 * PAGE_SIZE / sizeof(*scrq->msgs); scrq->cur = 0; @@ -1231,12 +1226,6 @@ static struct ibmvnic_sub_crq_queue *init_sub_crq_queue(struct ibmvnic_adapter return scrq; -map_irq_failed: - do { - rc = plpar_hcall_norets(H_FREE_SUB_CRQ, - adapter->vdev->unit_address, - scrq->crq_num); - } while (rc == H_BUSY || H_IS_LONG_BUSY(rc)); reg_failed: dma_unmap_single(dev, scrq->msg_token, 4 * PAGE_SIZE, DMA_BIDIRECTIONAL); @@ -1279,6 +1268,29 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) adapter->requested_caps = 0; } +static void release_sub_crqs_no_irqs(struct ibmvnic_adapter *adapter) +{ + int i; + + if (adapter->tx_scrq) { + for (i = 0; i < adapter->req_tx_queues; i++) + if (adapter->tx_scrq[i]) + release_sub_crq_queue(adapter, + adapter->tx_scrq[i]); + adapter->tx_scrq = NULL; + } + + if (adapter->rx_scrq) { + for (i = 0; i < adapter->req_rx_queues; i++) + 
if (adapter->rx_scrq[i]) + release_sub_crq_queue(adapter, + adapter->rx_scrq[i]); + adapter->rx_scrq = NULL; + } + + adapter->requested_caps = 0; +} + static int disable_scrq_irq(struct ibmvnic_adapter *adapter, struct ibmvnic_sub_crq_queue *scrq) { @@ -1398,6 +1410,66 @@ static irqreturn_t ibmvnic_interrupt_rx(int irq, void *instance) return IRQ_HANDLED; } +static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter) +{ + struct device *dev = &adapter->vdev->dev; + struct ibmvnic_sub_crq_queue *scrq; + int i = 0, j = 0; + int rc = 0; + + for (i = 0; i < adapter->req_tx_queues; i++) { + scrq = adapter->tx_scrq[i]; + scrq->irq = irq_create_mapping(NULL, scrq->hw_irq); + + if (scrq->irq == NO_IRQ) { + rc = -EINVAL; + dev_err(dev, "Error mapping irq\n"); + goto req_tx_irq_failed; + } + + rc = request_irq(scrq->irq, ibmvnic_interrupt_tx, + 0, "ibmvnic_tx", scrq); + + if (rc) { + dev_err(dev, "Couldn't register tx irq 0x%x. rc=%d\n", + scrq->irq, rc); + irq_dispose_mapping(scrq->irq); + goto req_rx_irq_failed; + } + } + + for (i = 0; i < adapter->req_rx_queues; i++) { + scrq = adapter->rx_scrq[i]; + scrq->irq = irq_create_mapping(NULL, scrq->hw_irq); + if (scrq->irq == NO_IRQ) { + rc = -EINVAL; + dev_err(dev, "Error mapping irq\n"); + goto req_rx_irq_failed; + } + rc = request_irq(scrq->irq, ibmvnic_interrupt_rx, + 0, "ibmvnic_rx", scrq); + if (rc) { + dev_err(dev, "Couldn't register rx irq 0x%x. 
rc=%d\n", + scrq->irq, rc); + irq_dispose_mapping(scrq->irq); + goto req_rx_irq_failed; + } + } + return rc; + +req_rx_irq_failed: + for (j = 0; j < i; j++) + free_irq(adapter->rx_scrq[j]->irq, adapter->rx_scrq[j]); + irq_dispose_mapping(adapter->rx_scrq[j]->irq); + i = adapter->req_tx_queues; +req_tx_irq_failed: + for (j = 0; j < i; j++) + free_irq(adapter->tx_scrq[j]->irq, adapter->tx_scrq[j]); + irq_dispose_mapping(adapter->rx_scrq[j]->irq); + release_sub_crqs_no_irqs(adapter); + return rc; +} + static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry) { struct device *dev = &adapter->vdev->dev; @@ -1406,8 +1478,7 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry) union ibmvnic_crq crq; int total_queues; int more = 0; - int i, j; - int rc; + int i; if (!retry) { /* Sub-CRQ entries are 32 byte long */ @@ -1486,13 +1557,6 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry) for (i = 0; i < adapter->req_tx_queues; i++) { adapter->tx_scrq[i] = allqueues[i]; adapter->tx_scrq[i]->pool_index = i; - rc = request_irq(adapter->tx_scrq[i]->irq, ibmvnic_interrupt_tx, - 0, "ibmvnic_tx", adapter->tx_scrq[i]); - if (rc) { - dev_err(dev, "Couldn't register tx irq 0x%x. rc=%d\n", - adapter->tx_scrq[i]->irq, rc); - goto req_tx_irq_failed; - } } adapter->rx_scrq = kcalloc(adapter->req_rx_queues, @@ -1503,13 +1567,6 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry) for (i = 0; i < adapter->req_rx_queues; i++) { adapter->rx_scrq[i] = allqueues[i + adapter->req_tx_queues]; adapter->rx_scrq[i]->scrq_num = i; - rc = request_irq(adapter->rx_scrq[i]->irq, ibmvnic_interrupt_rx, - 0, "ibmvnic_rx", adapter->rx_scrq[i]); - if (rc) { - dev_err(dev, "Couldn't register rx irq 0x%x. 
rc=%d\n", - adapter->rx_scrq[i]->irq, rc); - goto req_rx_irq_failed; - } } memset(&crq, 0, sizeof(crq)); @@ -1562,15 +1619,6 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry) return; -req_rx_irq_failed: - for (j = 0; j < i; j++) - free_irq(adapter->rx_scrq[j]->irq, adapter->rx_scrq[j]); - i = adapter->req_tx_queues; -req_tx_irq_failed: - for (j = 0; j < i; j++) - free_irq(adapter->tx_scrq[j]->irq, adapter->tx_scrq[j]); - kfree(adapter->rx_scrq); - adapter->rx_scrq = NULL; rx_failed: kfree(adapter->tx_scrq); adapter->tx_scrq = NULL; @@ -2351,9 +2399,9 @@ static void handle_request_cap_rsp(union ibmvnic_crq *crq, *req_value, (long int)be32_to_cpu(crq->request_capability_rsp. number), name); - release_sub_crqs(adapter); + release_sub_crqs_no_irqs(adapter); *req_value = be32_to_cpu(crq->request_capability_rsp.number); - complete(&adapter->init_done); + init_sub_crqs(adapter, 1); return; default: dev_err(dev, "Error %d in request cap rsp\n", @@ -2662,7 +2710,7 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq, out: if (atomic_read(&adapter->running_cap_queries) == 0) - complete(&adapter->init_done); + init_sub_crqs(adapter, 0); /* We're done querying the capabilities, initialize sub-crqs */ } @@ -3560,6 +3608,7 @@ static const struct file_operations ibmvnic_dump_ops = { static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) { + unsigned long timeout = msecs_to_jiffies(30000); struct ibmvnic_adapter *adapter; struct net_device *netdev; unsigned char *mac_addr_p; @@ -3638,30 +3687,26 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) ibmvnic_send_crq_init(adapter); init_completion(&adapter->init_done); - wait_for_completion(&adapter->init_done); + if (!wait_for_completion_timeout(&adapter->init_done, timeout)) + return 0; do { - adapter->renegotiate = false; - - init_sub_crqs(adapter, 0); - reinit_completion(&adapter->init_done); - wait_for_completion(&adapter->init_done); - if 
(adapter->renegotiate) { - release_sub_crqs(adapter); + adapter->renegotiate = false; + release_sub_crqs_no_irqs(adapter); send_cap_queries(adapter); reinit_completion(&adapter->init_done); - wait_for_completion(&adapter->init_done); + if (!wait_for_completion_timeout(&adapter->init_done, + timeout)) + return 0; } } while (adapter->renegotiate); - /* if init_sub_crqs is partially successful, retry */ - while (!adapter->tx_scrq || !adapter->rx_scrq) { - init_sub_crqs(adapter, 1); - - reinit_completion(&adapter->init_done); - wait_for_completion(&adapter->init_done); + rc = init_sub_crq_irqs(adapter); + if (rc) { + dev_err(&dev->dev, "failed to initialize sub crq irqs\n"); + goto free_debugfs; } netdev->real_num_tx_queues = adapter->req_tx_queues; @@ -3669,12 +3714,14 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) rc = register_netdev(netdev); if (rc) { dev_err(&dev->dev, "failed to register netdev rc=%d\n", rc); - goto free_debugfs; + goto free_sub_crqs; } dev_info(&dev->dev, "ibmvnic registered\n"); return 0; +free_sub_crqs: + release_sub_crqs(adapter); free_debugfs: if (adapter->debugfs_dir && !IS_ERR(adapter->debugfs_dir)) debugfs_remove_recursive(adapter->debugfs_dir); -- cgit From 65dc689182ec5117896d876cc03405ac51427314 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Wed, 6 Jul 2016 15:35:18 -0500 Subject: ibmvnic: Fix passive VNIC server login process In some cases, if there is no VNIC server available during the driver probe, the driver should wait until it receives an initialization request from the VNIC Server to start the login process. Recent testing has shown that this is incorrectly handled in the current driver. The proposed solution handles this initialization request by scheduling a task in the shared workqueue that completes the login process and registers the net device. Signed-off-by: Thomas Falcon Signed-off-by: David S.
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 61 ++++++++++++++++++++++++++++++++++++-- drivers/net/ethernet/ibm/ibmvnic.h | 2 ++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 52b0c07d3ca4..88f3c85fb04a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -75,6 +75,7 @@ #include #include #include +#include #include "ibmvnic.h" @@ -3253,8 +3254,8 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, dev_info(dev, "Partner initialized\n"); /* Send back a response */ rc = ibmvnic_send_crq_init_complete(adapter); - if (rc == 0) - send_version_xchg(adapter); + if (!rc) + schedule_work(&adapter->vnic_crq_init); else dev_err(dev, "Can't send initrsp rc=%ld\n", rc); break; @@ -3606,6 +3607,60 @@ static const struct file_operations ibmvnic_dump_ops = { .release = single_release, }; +static void handle_crq_init_rsp(struct work_struct *work) +{ + struct ibmvnic_adapter *adapter = container_of(work, + struct ibmvnic_adapter, + vnic_crq_init); + struct device *dev = &adapter->vdev->dev; + struct net_device *netdev = adapter->netdev; + unsigned long timeout = msecs_to_jiffies(30000); + int rc; + + send_version_xchg(adapter); + reinit_completion(&adapter->init_done); + if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { + dev_err(dev, "Passive init timeout\n"); + goto task_failed; + } + + do { + if (adapter->renegotiate) { + adapter->renegotiate = false; + release_sub_crqs_no_irqs(adapter); + send_cap_queries(adapter); + + reinit_completion(&adapter->init_done); + if (!wait_for_completion_timeout(&adapter->init_done, + timeout)) { + dev_err(dev, "Passive init timeout\n"); + goto task_failed; + } + } + } while (adapter->renegotiate); + rc = init_sub_crq_irqs(adapter); + + if (rc) + goto task_failed; + + netdev->real_num_tx_queues = adapter->req_tx_queues; + + rc = register_netdev(netdev); + if (rc) { + dev_err(dev, + "failed to 
register netdev rc=%d\n", rc); + goto register_failed; + } + dev_info(dev, "ibmvnic registered\n"); + + return; + +register_failed: + release_sub_crqs(adapter); +task_failed: + dev_err(dev, "Passive initialization was not successful\n"); +} + static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) { unsigned long timeout = msecs_to_jiffies(30000); @@ -3645,6 +3700,8 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) netdev->ethtool_ops = &ibmvnic_ethtool_ops; SET_NETDEV_DEV(netdev, &dev->dev); + INIT_WORK(&adapter->vnic_crq_init, handle_crq_init_rsp); + spin_lock_init(&adapter->stats_lock); rc = ibmvnic_init_crq_queue(adapter); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 0b66a506a4e4..e82898fd518e 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -1045,4 +1045,6 @@ struct ibmvnic_adapter { u64 opt_rxba_entries_per_subcrq; __be64 tx_rx_desc_req; u8 map_id; + + struct work_struct vnic_crq_init; }; -- cgit From 95556a883834122c616bbeb942654d745ceb9712 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jul 2016 11:03:57 +0200 Subject: dccp: avoid deadlock in dccp_v4_ctl_send_reset In the prep work I did before enabling BH while handling socket backlog, I missed two points in DCCP : 1) dccp_v4_ctl_send_reset() uses bh_lock_sock(), assuming BH were blocked. It is not anymore always true. 2) dccp_v4_route_skb() was using __IP_INC_STATS() instead of IP_INC_STATS() A similar fix was done for TCP, in commit 47dcc20a39d0 ("ipv4: tcp: ip_send_unicast_reply() is not BH safe") Fixes: 7309f8821fd6 ("dccp: do not assume DCCP code is non preemptible") Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. 
Miller --- net/dccp/ipv4.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 5c7e413a3ae4..25dd25b47d41 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -462,7 +462,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { - __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } @@ -527,17 +527,19 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) rxiph->daddr); skb_dst_set(skb, dst_clone(dst)); + local_bh_disable(); bh_lock_sock(ctl_sk); err = ip_build_and_send_pkt(skb, ctl_sk, rxiph->daddr, rxiph->saddr, NULL); bh_unlock_sock(ctl_sk); if (net_xmit_eval(err) == 0) { - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS(DCCP_MIB_OUTRSTS); + __DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + __DCCP_INC_STATS(DCCP_MIB_OUTRSTS); } + local_bh_enable(); out: - dst_release(dst); + dst_release(dst); } static void dccp_v4_reqsk_destructor(struct request_sock *req) -- cgit From f3ea3119ad75dde0ba3e8da4653dbd5a189688e5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Jul 2016 16:42:48 +0100 Subject: bnxt_en: initialize rc to zero to avoid returning garbage rc is not initialized so it can contain garbage if it is not set by the call to bnxt_read_sfp_module_eeprom_info. Ensure garbage is not returned by initializing rc to 0. Signed-off-by: Colin Ian King Acked-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index a38cb047b540..1b0ae4a72e9e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1591,7 +1591,7 @@ static int bnxt_get_module_eeprom(struct net_device *dev, { struct bnxt *bp = netdev_priv(dev); u16 start = eeprom->offset, length = eeprom->len; - int rc; + int rc = 0; memset(data, 0, eeprom->len); -- cgit From a612769774a30e4fc143c4cb6395c12573415660 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Fri, 8 Jul 2016 17:52:33 +0200 Subject: udp: prevent bugcheck if filter truncates packet too much If socket filter truncates an udp packet below the length of UDP header in udpv6_queue_rcv_skb() or udp_queue_rcv_skb(), it will trigger a BUG_ON in skb_pull_rcsum(). This BUG_ON (and therefore a system crash if kernel is configured that way) can be easily enforced by an unprivileged user which was reported as CVE-2016-6162. For a reproducer, see http://seclists.org/oss-sec/2016/q3/8 Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing") Reported-by: Marco Grassi Signed-off-by: Michal Kubecek Acked-by: Eric Dumazet Acked-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/ipv4/udp.c | 2 ++ net/ipv6/udp.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ca5e8ea29538..4aed8fc23d32 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1583,6 +1583,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (sk_filter(sk, skb)) goto drop; + if (unlikely(skb->len < sizeof(struct udphdr))) + goto drop; udp_csum_pull_header(skb); if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 005dc82c2138..acc09705618b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -620,6 +620,8 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (sk_filter(sk, skb)) goto drop; + if (unlikely(skb->len < sizeof(struct udphdr))) + goto drop; udp_csum_pull_header(skb); if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { -- cgit From 75ff39ccc1bd5d3c455b6822ab09e533c551f758 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Jul 2016 10:04:02 +0200 Subject: tcp: make challenge acks less predictable Yue Cao claims that current host rate limiting of challenge ACKS (RFC 5961) could leak enough information to allow a patient attacker to hijack TCP sessions. He will soon provide details in an academic paper. This patch increases the default limit from 100 to 1000, and adds some randomization so that the attacker can no longer hijack sessions without spending a considerable amount of probes. Based on initial analysis and patch from Linus. Note that we also have per socket rate limiting, so it is tempting to remove the host limit in the future. v2: randomize the count of challenge acks per second, not the period. Fixes: 282f23c6ee34 ("tcp: implement RFC 5961 3.2") Reported-by: Yue Cao Signed-off-by: Eric Dumazet Suggested-by: Linus Torvalds Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6c8f4cd0800..91868bb17818 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -87,7 +87,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 100; +int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -3458,7 +3458,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); - u32 now; + u32 count, now; /* First check our per-socket dupack rate limit. */ if (tcp_oow_rate_limited(sock_net(sk), skb, @@ -3466,13 +3466,18 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) &tp->last_oow_ack_time)) return; - /* Then check the check host-wide RFC 5961 rate limit. */ + /* Then check host-wide RFC 5961 rate limit. 
*/ now = jiffies / HZ; if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + challenge_timestamp = now; - challenge_count = 0; + WRITE_ONCE(challenge_count, half + + prandom_u32_max(sysctl_tcp_challenge_ack_limit)); } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + count = READ_ONCE(challenge_count); + if (count > 0) { + WRITE_ONCE(challenge_count, count - 1); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } -- cgit From 80610229ef7b26615dbb6cb6e873709a60bacc9f Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 10 Jul 2016 21:11:55 +0300 Subject: ipv4: reject RTNH_F_DEAD and RTNH_F_LINKDOWN from user space Vegard Nossum is reporting for a crash in fib_dump_info when nh_dev = NULL and fib_nhs == 1: Pid: 50, comm: netlink.exe Not tainted 4.7.0-rc5+ RIP: 0033:[<00000000602b3d18>] RSP: 0000000062623890 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 000000006261b800 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000024 RDI: 000000006245ba00 RBP: 00000000626238f0 R08: 000000000000029c R09: 0000000000000000 R10: 0000000062468038 R11: 000000006245ba00 R12: 000000006245ba00 R13: 00000000625f96c0 R14: 00000000601e16f0 R15: 0000000000000000 Kernel panic - not syncing: Kernel mode fault at addr 0x2e0, ip 0x602b3d18 CPU: 0 PID: 50 Comm: netlink.exe Not tainted 4.7.0-rc5+ #581 Stack: 626238f0 960226a02 00000400 000000fe 62623910 600afca7 62623970 62623a48 62468038 00000018 00000000 00000000 Call Trace: [<602b3e93>] rtmsg_fib+0xd3/0x190 [<602b6680>] fib_table_insert+0x260/0x500 [<602b0e5d>] inet_rtm_newroute+0x4d/0x60 [<60250def>] rtnetlink_rcv_msg+0x8f/0x270 [<60267079>] netlink_rcv_skb+0xc9/0xe0 [<60250d4b>] rtnetlink_rcv+0x3b/0x50 [<60265400>] netlink_unicast+0x1a0/0x2c0 [<60265e47>] netlink_sendmsg+0x3f7/0x470 [<6021dc9a>] sock_sendmsg+0x3a/0x90 [<6021e0d0>] ___sys_sendmsg+0x300/0x360 [<6021fa64>] __sys_sendmsg+0x54/0xa0 [<6021fac0>] SyS_sendmsg+0x10/0x20 [<6001ea68>] 
handle_syscall+0x88/0x90 [<600295fd>] userspace+0x3fd/0x500 [<6001ac55>] fork_handler+0x85/0x90 $ addr2line -e vmlinux -i 0x602b3d18 include/linux/inetdevice.h:222 net/ipv4/fib_semantics.c:1264 Problem happens when RTNH_F_LINKDOWN is provided from user space when creating routes that do not use the flag, caught with a netlink fuzzer. Currently, the kernel allows user space to set both flags to nh_flags and fib_flags but this is not intentional; the assumption was that they are not set. Fix this by rejecting both flags with EINVAL. Reported-by: Vegard Nossum Fixes: 0eeb075fad73 ("net: ipv4 sysctl option to ignore routes when nexthop link is down") Signed-off-by: Julian Anastasov Cc: Andy Gospodarek Cc: Dinesh Dutt Cc: Scott Feldman Reviewed-by: Andy Gospodarek Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d09173bf9500..539fa264e67d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -479,6 +479,9 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; + if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + return -EINVAL; + nexthop_nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; nexthop_nh->nh_oif = rtnh->rtnh_ifindex; @@ -1003,6 +1006,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (fib_props[cfg->fc_type].scope > cfg->fc_scope) goto err_inval; + if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + goto err_inval; + #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); -- cgit From 779f1edec664a7b32b71f7b4702e085a08d60592 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 11 Jul 2016 16:51:26 -0400 Subject: sock: ignore SCM_RIGHTS and SCM_CREDENTIALS in __sock_cmsg_send Sergei Trofimovich reported that pulse audio sends SCM_CREDENTIALS as a control message to TCP. 
Since __sock_cmsg_send does not support SCM_RIGHTS and SCM_CREDENTIALS, it returns an error and hence breaks pulse audio over TCP. SCM_RIGHTS and SCM_CREDENTIALS are sent on the SOL_SOCKET layer but they semantically belong to SOL_UNIX. Since all cmsg-processing functions including sock_cmsg_send ignore control messages of other layers, it is best to ignore SCM_RIGHTS and SCM_CREDENTIALS for consistency (and also for fixing pulse audio over TCP). Fixes: c14ac9451c34 ("sock: enable timestamping using control messages") Signed-off-by: Soheil Hassas Yeganeh Reported-by: Sergei Trofimovich Tested-by: Sergei Trofimovich Cc: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index 08bf97eceeb3..b7f12639c26a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1938,6 +1938,10 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ + case SCM_RIGHTS: + case SCM_CREDENTIALS: + break; default: return -EINVAL; } -- cgit From 34ee32c9a5696247be405bb0c21f3d1fc6cb5729 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 11 Jul 2016 19:58:04 -0500 Subject: r8152: Add support for setting pass through MAC address on RTL8153-AD The RTL8153-AD supports a persistent system specific MAC address. This means a device plugged into two different systems with host side support will show different (but persistent) MAC addresses. This information for the system's persistent MAC address is burned in when the system HW is built and available under \_SB.AMAC in the DSDT at runtime. This technology is currently implemented in the Dell TB15 and WD15 Type-C docks. 
More information is available here: http://www.dell.com/support/article/us/en/04/SLN301147 Signed-off-by: Mario Limonciello Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 419f4cee432b..63f4018293bc 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -26,6 +26,7 @@ #include #include #include +#include /* Information for net-next */ #define NETNEXT_VERSION "08" @@ -460,6 +461,11 @@ /* SRAM_IMPEDANCE */ #define RX_DRIVING_MASK 0x6000 +/* MAC PASSTHRU */ +#define AD_MASK 0xfee0 +#define EFUSE 0xcfdb +#define PASS_THRU_MASK 0x1 + enum rtl_register_content { _1000bps = 0x10, _100bps = 0x08, @@ -1036,6 +1042,65 @@ out1: return ret; } +/* Devices containing RTL8153-AD can support a persistent + * host system provided MAC address. + * Examples of this are Dell TB15 and Dell WD15 docks + */ +static int vendor_mac_passthru_addr_read(struct r8152 *tp, struct sockaddr *sa) +{ + acpi_status status; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + int ret = -EINVAL; + u32 ocp_data; + unsigned char buf[6]; + + /* test for -AD variant of RTL8153 */ + ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_MISC_0); + if ((ocp_data & AD_MASK) != 0x1000) + return -ENODEV; + + /* test for MAC address pass-through bit */ + ocp_data = ocp_read_byte(tp, MCU_TYPE_USB, EFUSE); + if ((ocp_data & PASS_THRU_MASK) != 1) + return -ENODEV; + + /* returns _AUXMAC_#AABBCCDDEEFF# */ + status = acpi_evaluate_object(NULL, "\\_SB.AMAC", NULL, &buffer); + obj = (union acpi_object *)buffer.pointer; + if (!ACPI_SUCCESS(status)) + return -ENODEV; + if (obj->type != ACPI_TYPE_BUFFER || obj->string.length != 0x17) { + netif_warn(tp, probe, tp->netdev, + "Invalid buffer when reading pass-thru MAC addr: " + "(%d, %d)\n", + obj->type, obj->string.length); + goto amacout; + } + if 
(strncmp(obj->string.pointer, "_AUXMAC_#", 9) != 0 || + strncmp(obj->string.pointer + 0x15, "#", 1) != 0) { + netif_warn(tp, probe, tp->netdev, + "Invalid header when reading pass-thru MAC addr\n"); + goto amacout; + } + ret = hex2bin(buf, obj->string.pointer + 9, 6); + if (!(ret == 0 && is_valid_ether_addr(buf))) { + netif_warn(tp, probe, tp->netdev, + "Invalid MAC when reading pass-thru MAC addr: " + "%d, %pM\n", ret, buf); + ret = -EINVAL; + goto amacout; + } + memcpy(sa->sa_data, buf, 6); + ether_addr_copy(tp->netdev->dev_addr, sa->sa_data); + netif_info(tp, probe, tp->netdev, + "Using pass-thru MAC addr %pM\n", sa->sa_data); + +amacout: + kfree(obj); + return ret; +} + static int set_ethernet_addr(struct r8152 *tp) { struct net_device *dev = tp->netdev; @@ -1044,8 +1109,15 @@ static int set_ethernet_addr(struct r8152 *tp) if (tp->version == RTL_VER_01) ret = pla_ocp_read(tp, PLA_IDR, 8, sa.sa_data); - else - ret = pla_ocp_read(tp, PLA_BACKUP, 8, sa.sa_data); + else { + /* if this is not an RTL8153-AD, no eFuse mac pass thru set, + * or system doesn't provide valid _SB.AMAC this will be + * be expected to non-zero + */ + ret = vendor_mac_passthru_addr_read(tp, &sa); + if (ret < 0) + ret = pla_ocp_read(tp, PLA_BACKUP, 8, sa.sa_data); + } if (ret < 0) { netif_err(tp, probe, dev, "Get ether addr fail\n"); -- cgit From 2d18ac4ba7454a4260473e68be7e485ae71e7948 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 11 Jul 2016 16:08:35 -0400 Subject: tipc: extend broadcast link initialization criteria At first contact between two nodes, an endpoint might sometimes have time to send out a LINK_PROTOCOL/STATE packet before it has received the broadcast initialization packet from the peer, i.e., before it has received a valid broadcast packet number to add to the 'bc_ack' field of the protocol message. This means that the peer endpoint will receive a protocol packet with an invalid broadcast acknowledge value of 0. 
Under unlucky circumstances this may lead to the original, already received acknowledge value being overwritten, so that the whole broadcast link goes stale after a while. We fix this by delaying the setting of the link field 'bc_peer_is_up' until we know that the peer really has received our own broadcast initialization message. The latter is always sent out as the first unicast message on a link, and always with sequence number 1. Because of this, we only need to look for a non-zero unicast acknowledge value in the arriving STATE messages, and once that is confirmed we know we are safe and can set the mentioned field. Before this moment, we must ignore all broadcast acknowledges from the peer. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index 67b6ab9f4c8d..6483dc4333fb 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1559,7 +1559,12 @@ void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (!msg_peer_node_is_up(hdr)) return; - l->bc_peer_is_up = true; + /* Open when peer ackowledges our bcast init msg (pkt #1) */ + if (msg_ack(hdr)) + l->bc_peer_is_up = true; + + if (!l->bc_peer_is_up) + return; /* Ignore if peers_snd_nxt goes beyond receive window */ if (more(peers_snd_nxt, l->rcv_nxt + l->window)) -- cgit From a71eb720355c28eaeb2de0c4d960247c69bb2c6f Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 11 Jul 2016 16:08:36 -0400 Subject: tipc: ensure correct broadcast send buffer release when peer is lost After a new receiver peer has been added to the broadcast transmission link, we allow immediate transmission of new broadcast packets, trusting that the new peer will not accept the packets until it has received the previously sent unicast broadcast initialization message. 
In the same way, the sender must not accept any acknowledges until it has itself received the broadcast initialization from the peer, as well as confirmation of the reception of its own initialization message. Furthermore, when a receiver peer goes down, the sender has to produce the missing acknowledges from the lost peer locally, in order to ensure correct release of the buffers that were expected to be acknowledged by the said peer. In a highly stressed system we have observed that contact with a peer may come up and be lost before the above mentioned broadcast initialization and confirmation have been received. This leads to the locally produced acknowledges being rejected, and the non-acknowledged buffers to linger in the broadcast link transmission queue until it fills up and the link goes into permanent congestion. In this commit, we remedy this by temporarily setting the corresponding broadcast receive link state to ESTABLISHED and the 'bc_peer_is_up' state to true before we issue the local acknowledges. This ensures that those acknowledges will always be accepted. The mentioned state values are restored immediately afterwards when the link is reset. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/link.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/link.c b/net/tipc/link.c index 6483dc4333fb..7d89f8713d49 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -349,6 +349,8 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, u16 ack = snd_l->snd_nxt - 1; snd_l->ackers--; + rcv_l->bc_peer_is_up = true; + rcv_l->state = LINK_ESTABLISHED; tipc_link_bc_ack_rcv(rcv_l, ack, xmitq); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; -- cgit From 1fc07f3e1541cc49cc159beb3fdefc5013570eda Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 11 Jul 2016 16:08:37 -0400 Subject: tipc: reset all unicast links when broadcast send link fails In test situations with many nodes and a heavily stressed system we have observed that the transmission broadcast link may fail due to an excessive number of retransmissions of the same packet. In such situations we need to reset all unicast links to all peers, in order to reset and re-synchronize the broadcast link. In this commit, we add a new function tipc_bearer_reset_all() to be used in such situations. The function scans across all bearers and resets all their pertaining links. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/bearer.c | 15 +++++++++++++++ net/tipc/bearer.h | 1 + net/tipc/node.c | 15 +++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index bf8f05c3eb82..a597708ae381 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -330,6 +330,21 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) return 0; } +/* tipc_bearer_reset_all - reset all links on all bearers + */ +void tipc_bearer_reset_all(struct net *net) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_bearer *b; + int i; + + for (i = 0; i < MAX_BEARERS; i++) { + b = rcu_dereference_rtnl(tn->bearer_list[i]); + if (b) + tipc_reset_bearer(net, b); + } +} + /** * bearer_disable * diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index f686e41b5abb..60e49c3be19c 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -198,6 +198,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest); void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest); struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name); struct tipc_media *tipc_media_find(const char *name); +void tipc_bearer_reset_all(struct net *net); int tipc_bearer_setup(void); void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); diff --git a/net/tipc/node.c b/net/tipc/node.c index e01e2c71b5a1..23d4761842a0 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1297,10 +1297,6 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id rc = tipc_bcast_rcv(net, be->link, skb); - /* Broadcast link reset may happen at reassembly failure */ - if (rc & TIPC_LINK_DOWN_EVT) - tipc_node_reset_links(n); - /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_BC_ACK) { tipc_node_read_lock(n); @@ -1320,6 +1316,17 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(net, &be->arrvq, 
&be->inputq2); } + + if (rc & TIPC_LINK_DOWN_EVT) { + /* Reception reassembly failure => reset all links to peer */ + if (!tipc_link_is_up(be->link)) + tipc_node_reset_links(n); + + /* Retransmission failure => reset all links to all peers */ + if (!tipc_link_is_up(tipc_bc_sndlink(net))) + tipc_bearer_reset_all(net); + } + tipc_node_put(n); } -- cgit From 590b52e10d410e1439ae86be9fe19d75fdab628b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 11 Jul 2016 17:28:54 +0200 Subject: netfilter: conntrack: skip clash resolution if nat is in place The clash resolution is not easy to apply if the NAT table is registered. Even if no NAT rules are installed, the nul-binding ensures that a unique tuple is used, thus, the packet that loses race gets a different source port number, as described by: http://marc.info/?l=netfilter-devel&m=146818011604484&w=2 Clash resolution with NAT is also problematic if addresses/port range ports are used since the conntrack that wins race may describe a different mangling that we may have earlier applied to the packet via nf_nat_setup_info(). 
Fixes: 71d8c47fc653 ("netfilter: conntrack: introduce clash resolution on insertion race") Signed-off-by: Pablo Neira Ayuso Tested-by: Marc Dionne --- net/netfilter/nf_conntrack_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 62c42e970c89..9f530adad10d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -646,6 +646,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto->allow_clash && + !nfct_nat(ct) && !nf_ct_is_dying(ct) && atomic_inc_not_zero(&ct->ct_general.use)) { nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct); -- cgit From 136ab0d0e10f29bdac3ee04bd0e9661073e15c80 Mon Sep 17 00:00:00 2001 From: Noam Camus Date: Tue, 12 Jul 2016 16:01:11 +0300 Subject: net: nps_enet: Fix PCS reset During commit b54b8c2d6e3c ("net: ezchip: adapt driver to little endian architecture") adapting to little endian architecture, zeroing of controller was left out. Signed-off-by: Elad Kanfi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ezchip/nps_enet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index 06f031715b57..9b7a3f5a2818 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -285,6 +285,7 @@ static void nps_enet_hw_reset(struct net_device *ndev) ge_rst_value |= NPS_ENET_ENABLE << RST_GMAC_0_SHIFT; nps_enet_reg_set(priv, NPS_ENET_REG_GE_RST, ge_rst_value); usleep_range(10, 20); + ge_rst_value = 0; nps_enet_reg_set(priv, NPS_ENET_REG_GE_RST, ge_rst_value); /* Tx fifo reset sequence */ -- cgit From 386512d18b268c6182903239f9f3390f03ce4c7b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 12 Jul 2016 16:04:35 -0700 Subject: net: ethoc: Fix early error paths In case any operation fails before we can successfully go the point where we would register a MDIO bus, we would be going to an error label which involves unregistering then freeing this yet to be created MDIO bus. Update all error paths to go to label free which is the only one valid until either the clock is enabled, or the MDIO bus is allocated and registered. This fixes kernel oops observed while trying to dereference the MDIO bus structure which is not yet allocated. Fixes: a1702857724f ("net: Add support for the OpenCores 10/100 Mbps Ethernet MAC.") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ethoc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index 4edb98c3c6c7..06ae14a8e946 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -1086,7 +1086,7 @@ static int ethoc_probe(struct platform_device *pdev) if (!priv->iobase) { dev_err(&pdev->dev, "cannot remap I/O memory space\n"); ret = -ENXIO; - goto error; + goto free; } if (netdev->mem_end) { @@ -1095,7 +1095,7 @@ static int ethoc_probe(struct platform_device *pdev) if (!priv->membase) { dev_err(&pdev->dev, "cannot remap memory space\n"); ret = -ENXIO; - goto error; + goto free; } } else { /* Allocate buffer memory */ @@ -1106,7 +1106,7 @@ static int ethoc_probe(struct platform_device *pdev) dev_err(&pdev->dev, "cannot allocate %dB buffer\n", buffer_size); ret = -ENOMEM; - goto error; + goto free; } netdev->mem_end = netdev->mem_start + buffer_size; priv->dma_alloc = buffer_size; @@ -1120,7 +1120,7 @@ static int ethoc_probe(struct platform_device *pdev) 128, (netdev->mem_end - netdev->mem_start + 1) / ETHOC_BUFSIZ); if (num_bd < 4) { ret = -ENODEV; - goto error; + goto free; } priv->num_bd = num_bd; /* num_tx must be a power of two */ @@ -1133,7 +1133,7 @@ static int ethoc_probe(struct platform_device *pdev) priv->vma = devm_kzalloc(&pdev->dev, num_bd*sizeof(void *), GFP_KERNEL); if (!priv->vma) { ret = -ENOMEM; - goto error; + goto free; } /* Allow the platform setup code to pass in a MAC address. */ -- cgit From ee6c21b9c11ad96318160f9a504a3fac2114ddca Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 12 Jul 2016 16:04:36 -0700 Subject: net: ethoc: Correctly pad short packets Even though the hardware can be doing zero padding, we want the SKB to be going out on the wire with the appropriate size. This fixes packet truncations observed with e.g: ARP packets. Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ethoc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index 06ae14a8e946..4466a1187110 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -860,6 +860,11 @@ static netdev_tx_t ethoc_start_xmit(struct sk_buff *skb, struct net_device *dev) unsigned int entry; void *dest; + if (skb_put_padto(skb, ETHOC_ZLEN)) { + dev->stats.tx_errors++; + goto out_no_free; + } + if (unlikely(skb->len > ETHOC_BUFSIZ)) { dev->stats.tx_errors++; goto out; @@ -894,6 +899,7 @@ static netdev_tx_t ethoc_start_xmit(struct sk_buff *skb, struct net_device *dev) skb_tx_timestamp(skb); out: dev_kfree_skb(skb); +out_no_free: return NETDEV_TX_OK; } -- cgit From 2c1ccc993707ecb0830ef0aebb7c8061f7704aa3 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Wed, 13 Jul 2016 00:06:59 +0300 Subject: net/mlx5e: Fix TX Timeout to detect queues stuck on BQL Change netif_tx_queue_stopped to netif_xmit_stopped. This will show when queues are stopped due to byte queue limits. Fixes: 3947ca185999 ('net/mlx5e: Implement ndo_tx_timeout callback') Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 7a0dca29c642..0cebc7e44307 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2656,7 +2656,7 @@ static void mlx5e_tx_timeout(struct net_device *dev) for (i = 0; i < priv->params.num_channels * priv->params.num_tc; i++) { struct mlx5e_sq *sq = priv->txq_to_sq_map[i]; - if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i))) + if (!netif_xmit_stopped(netdev_get_tx_queue(dev, i))) continue; sched_work = true; set_bit(MLX5E_SQ_STATE_TX_TIMEOUT, &sq->state); -- cgit From c3b7c5c9504348e0c22fa47629c419d82c963bc2 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Wed, 13 Jul 2016 00:07:00 +0300 Subject: net/mlx5e: start/stop all tx queues upon open/close netdev Start all tx queues (including inactive ones) when opening the netdev. Stop all tx queues (including inactive ones) when closing the netdev. This is a workaround for the tx timeout watchdog false alarm issue in which the netdev watchdog is polling all the tx queues which may include inactive queues and thus once lowering the real tx queues number (ethtool -L) it will generate tx timeout watchdog false alarms. Fixes: 3947ca185999 ('net/mlx5e: Implement ndo_tx_timeout callback') Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 0cebc7e44307..5a4d88c2cdb2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1348,6 +1348,11 @@ static int mlx5e_open_channels(struct mlx5e_priv *priv) goto err_close_channels; } + /* FIXME: This is a W/A for tx timeout watch dog false alarm when + * polling for inactive tx queues. + */ + netif_tx_start_all_queues(priv->netdev); + kfree(cparam); return 0; @@ -1367,6 +1372,12 @@ static void mlx5e_close_channels(struct mlx5e_priv *priv) { int i; + /* FIXME: This is a W/A only for tx timeout watch dog false alarm when + * polling for inactive tx queues. + */ + netif_tx_stop_all_queues(priv->netdev); + netif_tx_disable(priv->netdev); + for (i = 0; i < priv->params.num_channels; i++) mlx5e_close_channel(priv->channel[i]); -- cgit From f4979fcea7fd36d8e2f556abef86f80e0d5af1ba Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 Jul 2016 18:18:56 -0400 Subject: rose: limit sk_filter trim to payload Sockets can have a filter program attached that drops or trims incoming packets based on the filter program return value. Rose requires data packets to have at least ROSE_MIN_LEN bytes. It verifies this on arrival in rose_route_frame and unconditionally pulls the bytes in rose_recvmsg. The filter can trim packets to below this value in-between, causing pull to fail, leaving the partial header at the time of skb_copy_datagram_msg. Place a lower bound on the size to which sk_filter may trim packets by introducing sk_filter_trim_cap and call this for rose packets. Signed-off-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/filter.h | 6 +++++- net/core/filter.c | 10 +++++----- net/rose/rose_in.c | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 6fc31ef1da2d..8f74f3d61894 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -467,7 +467,11 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) } #endif /* CONFIG_DEBUG_SET_MODULE_RONX */ -int sk_filter(struct sock *sk, struct sk_buff *skb); +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); +static inline int sk_filter(struct sock *sk, struct sk_buff *skb) +{ + return sk_filter_trim_cap(sk, skb, 1); +} struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); diff --git a/net/core/filter.c b/net/core/filter.c index c4b330c85c02..e759d90e8cef 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -53,9 +53,10 @@ #include /** - * sk_filter - run a packet through a socket filter + * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter + * @cap: limit on how short the eBPF program may trim the packet * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller @@ -64,7 +65,7 @@ * be accepted or -EPERM if the packet should be tossed. * */ -int sk_filter(struct sock *sk, struct sk_buff *skb) +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) { int err; struct sk_filter *filter; @@ -85,14 +86,13 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) filter = rcu_dereference(sk->sk_filter); if (filter) { unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); - - err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; + err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); return err; } -EXPORT_SYMBOL(sk_filter); +EXPORT_SYMBOL(sk_filter_trim_cap); static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 79c4abcfa6b4..0a6394754e81 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -164,7 +164,8 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_frames_acked(sk, nr); if (ns == rose->vr) { rose_start_idletimer(sk); - if (sock_queue_rcv_skb(sk, skb) == 0) { + if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 && + __sock_queue_rcv_skb(sk, skb) == 0) { rose->vr = (rose->vr + 1) % ROSE_MODULUS; queued = 1; } else { -- cgit From 4f0c40d94461cfd23893a17335b2ab78ecb333c8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 Jul 2016 18:18:57 -0400 Subject: dccp: limit sk_filter trim to payload Dccp verifies packet integrity, including length, at initial rcv in dccp_invalid_packet, later pulls headers in dccp_enqueue_skb. A call to sk_filter in-between can cause __skb_pull to wrap skb->len. skb_copy_datagram_msg interprets this as a negative value, so (correctly) fails with EFAULT. The negative length is reported in ioctl SIOCINQ or possibly in a DCCP_WARN in dccp_close. Introduce an sk_receive_skb variant that caps how small a filter program can trim packets, and call this in dccp with the header length. Excessively trimmed packets are now processed normally and queued for reception as 0B payloads. Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Signed-off-by: Willem de Bruijn Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/net/sock.h | 8 +++++++- net/core/sock.c | 7 ++++--- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 649d2a8c17fc..ff5be7e8ddea 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1576,7 +1576,13 @@ static inline void sock_put(struct sock *sk) */ void sock_gen_put(struct sock *sk); -int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested); +int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, + unsigned int trim_cap); +static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, + const int nested) +{ + return __sk_receive_skb(sk, skb, nested, 1); +} static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { diff --git a/net/core/sock.c b/net/core/sock.c index b7f12639c26a..25dab8b60223 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -452,11 +452,12 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_rcv_skb); -int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) +int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, + const int nested, unsigned int trim_cap) { int rc = NET_RX_SUCCESS; - if (sk_filter(sk, skb)) + if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; @@ -492,7 +493,7 @@ discard_and_relse: kfree_skb(skb); goto out; } -EXPORT_SYMBOL(sk_receive_skb); +EXPORT_SYMBOL(__sk_receive_skb); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 25dd25b47d41..345a3aeb8c7e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -868,7 +868,7 @@ lookup: goto discard_and_relse; nf_reset(skb); - return sk_receive_skb(sk, skb, 1); + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 
d176f4e66369..3ff137d9471d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -732,7 +732,7 @@ lookup: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - return sk_receive_skb(sk, skb, 1) ? -1 : 0; + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4) ? -1 : 0; no_dccp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) -- cgit From 005db31d5f5f7c31cfdc43505d77eb3ca5cf8ec6 Mon Sep 17 00:00:00 2001 From: Beniamino Galvani Date: Wed, 13 Jul 2016 18:25:08 +0200 Subject: bonding: set carrier off for devices created through netlink Commit e826eafa65c6 ("bonding: Call netif_carrier_off after register_netdevice") moved netif_carrier_off() from bond_init() to bond_create(), but the latter is called only for initial default devices and ones created through sysfs: $ modprobe bonding $ echo +bond1 > /sys/class/net/bonding_masters $ ip link add bond2 type bond $ grep "MII Status" /proc/net/bonding/* /proc/net/bonding/bond0:MII Status: down /proc/net/bonding/bond1:MII Status: down /proc/net/bonding/bond2:MII Status: up Ensure that carrier is initially off also for devices created through netlink. Signed-off-by: Beniamino Galvani Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_netlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index db760e84119f..b8df0f5e8c25 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -446,7 +446,11 @@ static int bond_newlink(struct net *src_net, struct net_device *bond_dev, if (err < 0) return err; - return register_netdevice(bond_dev); + err = register_netdevice(bond_dev); + + netif_carrier_off(bond_dev); + + return err; } static size_t bond_get_size(const struct net_device *bond_dev) -- cgit From 858296c8784bf98450765cbc6b1bc2e44175cc01 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 14 Jun 2016 15:45:42 -0700 Subject: i40e/i40evf: Fix i40e_rx_checksum There are a couple of issues I found in i40e_rx_checksum while doing some recent testing. As a result I have found the Rx checksum logic is pretty much broken and returning that the checksum is valid for tunnels in cases where it is not. First the inner types are not the correct values to use to test for if a tunnel is present or not. In addition the inner protocol types are not a bitmask as such performing an OR of the values doesn't make sense. I have instead changed the code so that the inner protocol types are used to determine if we report CHECKSUM_UNNECESSARY or not. For anything that does not end in UDP, TCP, or SCTP it doesn't make much sense to report a checksum offload since it won't contain a checksum anyway. This leaves us with the need to set the csum_level based on some value. For that purpose I am using the tunnel_type field. If the tunnel type is GRENAT or greater then this means we have a GRE or UDP tunnel with an inner header. In the case of GRE or UDP we will have a possible checksum present so for this reason it should be safe to set the csum_level to 1 to indicate that we are reporting the state of the inner header. 
Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 30 +++++++++++++++------------ drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 30 +++++++++++++++------------ 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 55f151fca1dc..a8868e1bf832 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1280,8 +1280,8 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, union i40e_rx_desc *rx_desc) { struct i40e_rx_ptype_decoded decoded; - bool ipv4, ipv6, tunnel = false; u32 rx_error, rx_status; + bool ipv4, ipv6; u8 ptype; u64 qword; @@ -1336,19 +1336,23 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, if (rx_error & BIT(I40E_RX_DESC_ERROR_PPRS_SHIFT)) return; - /* The hardware supported by this driver does not validate outer - * checksums for tunneled VXLAN or GENEVE frames. I don't agree - * with it but the specification states that you "MAY validate", it - * doesn't make it a hard requirement so if we have validated the - * inner checksum report CHECKSUM_UNNECESSARY. + /* If there is an outer header present that might contain a checksum + * we need to bump the checksum level by 1 to reflect the fact that + * we are indicating we validated the inner checksum. */ - if (decoded.inner_prot & (I40E_RX_PTYPE_INNER_PROT_TCP | - I40E_RX_PTYPE_INNER_PROT_UDP | - I40E_RX_PTYPE_INNER_PROT_SCTP)) - tunnel = true; - - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = tunnel ? 
1 : 0; + if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT) + skb->csum_level = 1; + + /* Only report checksum unnecessary for TCP, UDP, or SCTP */ + switch (decoded.inner_prot) { + case I40E_RX_PTYPE_INNER_PROT_TCP: + case I40E_RX_PTYPE_INNER_PROT_UDP: + case I40E_RX_PTYPE_INNER_PROT_SCTP: + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* fall though */ + default: + break; + } return; diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index be99189da925..79d99cd91b24 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -752,8 +752,8 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, union i40e_rx_desc *rx_desc) { struct i40e_rx_ptype_decoded decoded; - bool ipv4, ipv6, tunnel = false; u32 rx_error, rx_status; + bool ipv4, ipv6; u8 ptype; u64 qword; @@ -808,19 +808,23 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, if (rx_error & BIT(I40E_RX_DESC_ERROR_PPRS_SHIFT)) return; - /* The hardware supported by this driver does not validate outer - * checksums for tunneled VXLAN or GENEVE frames. I don't agree - * with it but the specification states that you "MAY validate", it - * doesn't make it a hard requirement so if we have validated the - * inner checksum report CHECKSUM_UNNECESSARY. + /* If there is an outer header present that might contain a checksum + * we need to bump the checksum level by 1 to reflect the fact that + * we are indicating we validated the inner checksum. */ - if (decoded.inner_prot & (I40E_RX_PTYPE_INNER_PROT_TCP | - I40E_RX_PTYPE_INNER_PROT_UDP | - I40E_RX_PTYPE_INNER_PROT_SCTP)) - tunnel = true; - - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = tunnel ? 
1 : 0; + if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT) + skb->csum_level = 1; + + /* Only report checksum unnecessary for TCP, UDP, or SCTP */ + switch (decoded.inner_prot) { + case I40E_RX_PTYPE_INNER_PROT_TCP: + case I40E_RX_PTYPE_INNER_PROT_UDP: + case I40E_RX_PTYPE_INNER_PROT_SCTP: + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* fall though */ + default: + break; + } return; -- cgit From f6bd09625ba66446821d55c61891bea9e2cdc5b3 Mon Sep 17 00:00:00 2001 From: Kiran Patil Date: Mon, 20 Jun 2016 09:10:34 -0700 Subject: i40e: enable VSI broadcast promiscuous mode instead of adding broadcast filter This patch sets VSI broadcast promiscuous mode during VSI add sequence and prevents adding MAC filter if specified MAC address is broadcast. Change-ID: Ia62251fca095bc449d0497fc44bec3a5a0136773 Signed-off-by: Kiran Patil Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 32 ++++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 5ea22008d721..1592dcbed790 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1344,6 +1344,13 @@ struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi, if (!vsi || !macaddr) return NULL; + /* Do not allow broadcast filter to be added since broadcast filter + * is added as part of add VSI for any newly created VSI except + * FDIR VSI + */ + if (is_broadcast_ether_addr(macaddr)) + return NULL; + f = i40e_find_filter(vsi, macaddr, vlan, is_vf, is_netdev); if (!f) { f = kzalloc(sizeof(*f), GFP_ATOMIC); @@ -2151,18 +2158,6 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi) aq_ret, pf->hw.aq.asq_last_status); } } - aq_ret = i40e_aq_set_vsi_broadcast(&vsi->back->hw, - vsi->seid, - cur_promisc, NULL); - if (aq_ret) { - retval = i40e_aq_rc_to_posix(aq_ret, - pf->hw.aq.asq_last_status); - 
dev_info(&pf->pdev->dev, - "set brdcast promisc failed, err %s, aq_err %s\n", - i40e_stat_str(&pf->hw, aq_ret), - i40e_aq_str(&pf->hw, - pf->hw.aq.asq_last_status)); - } } out: /* if something went wrong then set the changed flag so we try again */ @@ -9224,6 +9219,7 @@ int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi) static int i40e_add_vsi(struct i40e_vsi *vsi) { int ret = -ENODEV; + i40e_status aq_ret = 0; u8 laa_macaddr[ETH_ALEN]; bool found_laa_mac_filter = false; struct i40e_pf *pf = vsi->back; @@ -9413,6 +9409,18 @@ static int i40e_add_vsi(struct i40e_vsi *vsi) vsi->seid = ctxt.seid; vsi->id = ctxt.vsi_number; } + /* Except FDIR VSI, for all othet VSI set the broadcast filter */ + if (vsi->type != I40E_VSI_FDIR) { + aq_ret = i40e_aq_set_vsi_broadcast(hw, vsi->seid, true, NULL); + if (aq_ret) { + ret = i40e_aq_rc_to_posix(aq_ret, + hw->aq.asq_last_status); + dev_info(&pf->pdev->dev, + "set brdcast promisc failed, err %s, aq_err %s\n", + i40e_stat_str(hw, aq_ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + } + } spin_lock_bh(&vsi->mac_filter_list_lock); /* If macvlan filters already exist, force them to get loaded */ -- cgit From 4b732cd4bb6006ad7fd4d5cdba27fcb751cdf4b7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 15 Jun 2016 15:37:59 +0200 Subject: ixgbe: napi_poll must return the work done Currently the function ixgbe_poll() returns 0 when it completely cleans the rx rings, but this fouls budget accounting in core code.
Fix this returning the actual work done, capped to weight - 1, since the core doesn't allow to return the full budget when the driver modifies the napi status Signed-off-by: Paolo Abeni Reviewed-by: Venkatesh Srinivas Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 088c47cf27d9..8bebd862a54c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2887,7 +2887,7 @@ int ixgbe_poll(struct napi_struct *napi, int budget) if (!test_bit(__IXGBE_DOWN, &adapter->state)) ixgbe_irq_enable_queues(adapter, BIT_ULL(q_vector->v_idx)); - return 0; + return min(work_done, budget - 1); } /** -- cgit From 7f6c553902bfa1c4e3f6cfa955c5ea036c7fe8e4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Mon, 27 Jun 2016 12:16:43 -0300 Subject: i40e: use valid online CPU on q_vector initialization Currently, the q_vector initialization routine sets the affinity_mask of a q_vector based on v_idx value. Meaning a loop iterates on v_idx, which is an incremental value, and the cpumask is created based on this value. This is a problem in systems with multiple logical CPUs per core (like in SMT scenarios). If we disable some logical CPUs, by turning SMT off for example, we will end up with a sparse cpu_online_mask, i.e., only the first CPU in a core is online, and incremental filling in q_vector cpumask might lead to multiple offline CPUs being assigned to q_vectors. Example: if we have a system with 8 cores each one containing 8 logical CPUs (SMT == 8 in this case), we have 64 CPUs in total. But if SMT is disabled, only the 1st CPU in each core remains online, so the cpu_online_mask in this case would have only 8 bits set, in a sparse way. 
In general case, when SMT is off the cpu_online_mask has only C bits set: 0, 1*N, 2*N, ..., C*(N-1) where C == # of cores; N == # of logical CPUs per core. In our example, only bits 0, 8, 16, 24, 32, 40, 48, 56 would be set. This patch changes the way q_vector's affinity_mask is created: it iterates on v_idx, but consumes the CPU index from the cpu_online_mask instead of just using the v_idx incremental value. No functional changes were introduced. Signed-off-by: Guilherme G Piccoli Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1592dcbed790..501f15d9f4d6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7721,10 +7721,11 @@ static int i40e_init_msix(struct i40e_pf *pf) * i40e_vsi_alloc_q_vector - Allocate memory for a single interrupt vector * @vsi: the VSI being configured * @v_idx: index of the vector in the vsi struct + * @cpu: cpu to be used on affinity_mask * * We allocate one q_vector. If allocation fails we return -ENOMEM. 
**/ -static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx) +static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx, int cpu) { struct i40e_q_vector *q_vector; @@ -7735,7 +7736,8 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx) q_vector->vsi = vsi; q_vector->v_idx = v_idx; - cpumask_set_cpu(v_idx, &q_vector->affinity_mask); + cpumask_set_cpu(cpu, &q_vector->affinity_mask); + if (vsi->netdev) netif_napi_add(vsi->netdev, &q_vector->napi, i40e_napi_poll, NAPI_POLL_WEIGHT); @@ -7759,8 +7761,7 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx) static int i40e_vsi_alloc_q_vectors(struct i40e_vsi *vsi) { struct i40e_pf *pf = vsi->back; - int v_idx, num_q_vectors; - int err; + int err, v_idx, num_q_vectors, current_cpu; /* if not MSIX, give the one vector only to the LAN VSI */ if (pf->flags & I40E_FLAG_MSIX_ENABLED) @@ -7770,10 +7771,15 @@ static int i40e_vsi_alloc_q_vectors(struct i40e_vsi *vsi) else return -EINVAL; + current_cpu = cpumask_first(cpu_online_mask); + for (v_idx = 0; v_idx < num_q_vectors; v_idx++) { - err = i40e_vsi_alloc_q_vector(vsi, v_idx); + err = i40e_vsi_alloc_q_vector(vsi, v_idx, current_cpu); if (err) goto err_out; + current_cpu = cpumask_next(current_cpu, cpu_online_mask); + if (unlikely(current_cpu >= nr_cpu_ids)) + current_cpu = cpumask_first(cpu_online_mask); } return 0; -- cgit From 083ae308280d13d187512b9babe3454342a7987e Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 14 Jul 2016 11:38:40 -0400 Subject: tcp: enable per-socket rate limiting of all 'challenge acks' The per-socket rate limit for 'challenge acks' was introduced in the context of limiting ack loops: commit f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock") And I think it can be extended to rate limit all 'challenge acks' on a per-socket basis. 
Since we have the global tcp_challenge_ack_limit, this patch allows for tcp_challenge_ack_limit to be set to a large value and effectively rely on the per-socket limit, or set tcp_challenge_ack_limit to a lower value and still prevent a single connection from consuming the entire challenge ack quota. It further moves in the direction of eliminating the global limit at some point, as Eric Dumazet has suggested. This is a follow-up to: Subject: tcp: make challenge acks less predictable Cc: Eric Dumazet Cc: David S. Miller Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Yue Cao Signed-off-by: Jason Baron Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 91868bb17818..42bf89aaf6a5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3421,6 +3421,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } +static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, + u32 *last_oow_ack_time) +{ + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS @@ -3434,21 +3451,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, /* Data packets without SYNs are not likely part of an ACK loop.
*/ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && !tcp_hdr(skb)->syn) - goto not_rate_limited; - - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); - - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { - NET_INC_STATS(net, mib_idx); - return true; /* rate-limited: don't send yet! */ - } - } - - *last_oow_ack_time = tcp_time_stamp; + return false; -not_rate_limited: - return false; /* not rate-limited: go ahead, send dupack now! */ + return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } /* RFC 5961 7 [ACK Throttling] */ @@ -3461,9 +3466,9 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) u32 count, now; /* First check our per-socket dupack rate limit. */ - if (tcp_oow_rate_limited(sock_net(sk), skb, - LINUX_MIB_TCPACKSKIPPEDCHALLENGE, - &tp->last_oow_ack_time)) + if (__tcp_oow_rate_limited(sock_net(sk), + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) return; /* Then check host-wide RFC 5961 rate limit. */ -- cgit From c961e877cff4b669788900a7e12386f67efbe2d3 Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Thu, 14 Jul 2016 11:27:16 -0700 Subject: r8152: add MODULE_VERSION ethtool -i provides a driver version that is hard coded. Export the same value via "modinfo". Signed-off-by: Grant Grundler Signed-off-by: David S. 
Miller --- drivers/net/usb/r8152.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 63f4018293bc..e9654a685381 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -4425,3 +4425,4 @@ module_usb_driver(rtl8152_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); +MODULE_VERSION(DRIVER_VERSION); -- cgit From 6277d46b10a0b35c83656f085cf8e32ded6fdd60 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Jul 2016 11:14:58 +0200 Subject: mlxsw: spectrum: Force link training according to admin state When setting a new speed we need to disable and enable the port for the changes to take effect. We currently only do that if the operational state of the port is up. However, setting a new speed following link training failure will require us to explicitly set the port down and then up. Instead, disable and enable the port based on its administrative state. Fixes: 56ade8fe3fe1 ("mlxsw: spectrum: Add initial support for Spectrum ASIC") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 660429ebfbe1..5ccdf22822ac 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -171,23 +171,6 @@ static int mlxsw_sp_port_admin_status_set(struct mlxsw_sp_port *mlxsw_sp_port, return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(paos), paos_pl); } -static int mlxsw_sp_port_oper_status_get(struct mlxsw_sp_port *mlxsw_sp_port, - bool *p_is_up) -{ - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - char paos_pl[MLXSW_REG_PAOS_LEN]; - u8 oper_status; - int err; - - mlxsw_reg_paos_pack(paos_pl, mlxsw_sp_port->local_port, 0); - err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(paos), paos_pl); - if (err) - return err; - oper_status = mlxsw_reg_paos_oper_status_get(paos_pl); - *p_is_up = oper_status == MLXSW_PORT_ADMIN_STATUS_UP ? 
true : false; - return 0; -} - static int mlxsw_sp_port_dev_addr_set(struct mlxsw_sp_port *mlxsw_sp_port, unsigned char *addr) { @@ -1493,7 +1476,6 @@ static int mlxsw_sp_port_set_settings(struct net_device *dev, u32 eth_proto_new; u32 eth_proto_cap; u32 eth_proto_admin; - bool is_up; int err; speed = ethtool_cmd_speed(cmd); @@ -1525,12 +1507,7 @@ static int mlxsw_sp_port_set_settings(struct net_device *dev, return err; } - err = mlxsw_sp_port_oper_status_get(mlxsw_sp_port, &is_up); - if (err) { - netdev_err(dev, "Failed to get oper status"); - return err; - } - if (!is_up) + if (!netif_running(dev)) return 0; err = mlxsw_sp_port_admin_status_set(mlxsw_sp_port, false); -- cgit From c3f1576810affced47684e04a08c1ffa845144c9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Jul 2016 11:14:59 +0200 Subject: mlxsw: spectrum: Indicate support for autonegotiation The device supports link autonegotiation, so let the user know about it by indicating support via ethtool ops. Fixes: 56ade8fe3fe1 ("mlxsw: spectrum: Add initial support for Spectrum ASIC") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 5ccdf22822ac..374080027b2f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1417,7 +1417,8 @@ static int mlxsw_sp_port_get_settings(struct net_device *dev, cmd->supported = mlxsw_sp_from_ptys_supported_port(eth_proto_cap) | mlxsw_sp_from_ptys_supported_link(eth_proto_cap) | - SUPPORTED_Pause | SUPPORTED_Asym_Pause; + SUPPORTED_Pause | SUPPORTED_Asym_Pause | + SUPPORTED_Autoneg; cmd->advertising = mlxsw_sp_from_ptys_advert_link(eth_proto_admin); mlxsw_sp_from_ptys_speed_duplex(netif_carrier_ok(dev), eth_proto_oper, cmd); -- cgit From 7347180dcaed9c5582732f9372ac940b9b1a907d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Jul 2016 11:15:00 +0200 Subject: mlxsw: spectrum: Don't emit errors when PFC is disabled We can't have PAUSE frames and PFC both enabled on the same port, but the fact that ieee_setpfc() was called doesn't necessarily mean PFC is enabled. Only emit errors when PAUSE frames and PFC are enabled simultaneously. Fixes: d81a6bdb87ce ("mlxsw: spectrum: Add IEEE 802.1Qbb PFC support") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c index 0b323661c0b6..5d4b1e7f59f9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c @@ -351,7 +351,8 @@ static int mlxsw_sp_dcbnl_ieee_setpfc(struct net_device *dev, struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); int err; - if (mlxsw_sp_port->link.tx_pause || mlxsw_sp_port->link.rx_pause) { + if ((mlxsw_sp_port->link.tx_pause || mlxsw_sp_port->link.rx_pause) && + pfc->pfc_en) { netdev_err(dev, "PAUSE frames already enabled on port\n"); return -EINVAL; } -- cgit From 28f5275e4aab97680c8243ec26e202e44c99e5bf Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Jul 2016 11:15:01 +0200 Subject: mlxsw: spectrum: Prevent overwrite of DCB capability fields The number of supported traffic classes that can have ETS and PFC simultaneously enabled is not subject to user configuration, so make sure we always initialize them to the correct values following a set operation. Fixes: 8e8dfe9fdf06 ("mlxsw: spectrum: Add IEEE 802.1Qaz ETS support") Fixes: d81a6bdb87ce ("mlxsw: spectrum: Add IEEE 802.1Qbb PFC support") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c index 5d4b1e7f59f9..4af3f2728e47 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c @@ -249,6 +249,7 @@ static int mlxsw_sp_dcbnl_ieee_setets(struct net_device *dev, return err; memcpy(mlxsw_sp_port->dcb.ets, ets, sizeof(*ets)); + mlxsw_sp_port->dcb.ets->ets_cap = IEEE_8021QAZ_MAX_TCS; return 0; } @@ -372,6 +373,7 @@ static int mlxsw_sp_dcbnl_ieee_setpfc(struct net_device *dev, } memcpy(mlxsw_sp_port->dcb.pfc, pfc, sizeof(*pfc)); + mlxsw_sp_port->dcb.pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS; return 0; -- cgit From 11719a58bdf7724c463db54ea2abcec54a87b69c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Jul 2016 11:15:02 +0200 Subject: mlxsw: spectrum: Prevent invalid ingress buffer mapping Packets entering the switch are mapped to a Switch Priority (SP) according to their PCP value (untagged frames are mapped to SP 0). The packets are classified to a priority group (PG) buffer in the port's headroom according to their SP. The switch maintains another mapping (SP to IEEE priority), which is used to generate PFC frames for lossless PGs. This mapping is initialized to IEEE = SP % 8. Therefore, when mapping SP 'x' to PG 'y' we create a situation in which an IEEE priority is mapped to two different PGs: IEEE 'x' ---> SP 'x' ---> PG 'y' IEEE 'x' ---> SP 'x + 8' ---> PG '0' (default) Which is invalid, as a flow can use only one PG buffer. Fix this by mapping both SP 'x' and 'x + 8' to the same PG buffer. Fixes: 8e8dfe9fdf06 ("mlxsw: spectrum: Add IEEE 802.1Qaz ETS support") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 17 ++++++++++++++++- drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c | 3 ++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 1977e7a5c530..57d48da709fb 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -2718,7 +2718,7 @@ static inline void mlxsw_reg_ppcnt_pack(char *payload, u8 local_port, * Configures the switch priority to buffer table. */ #define MLXSW_REG_PPTB_ID 0x500B -#define MLXSW_REG_PPTB_LEN 0x0C +#define MLXSW_REG_PPTB_LEN 0x10 static const struct mlxsw_reg_info mlxsw_reg_pptb = { .id = MLXSW_REG_PPTB_ID, @@ -2784,6 +2784,13 @@ MLXSW_ITEM32(reg, pptb, pm_msb, 0x08, 24, 8); */ MLXSW_ITEM32(reg, pptb, untagged_buff, 0x08, 0, 4); +/* reg_pptb_prio_to_buff_msb + * Mapping of switch priority to one of the allocated receive port + * buffers. 
+ * Access: RW + */ +MLXSW_ITEM_BIT_ARRAY(reg, pptb, prio_to_buff_msb, 0x0C, 0x04, 4); + #define MLXSW_REG_PPTB_ALL_PRIO 0xFF static inline void mlxsw_reg_pptb_pack(char *payload, u8 local_port) @@ -2792,6 +2799,14 @@ static inline void mlxsw_reg_pptb_pack(char *payload, u8 local_port) mlxsw_reg_pptb_mm_set(payload, MLXSW_REG_PPTB_MM_UM); mlxsw_reg_pptb_local_port_set(payload, local_port); mlxsw_reg_pptb_pm_set(payload, MLXSW_REG_PPTB_ALL_PRIO); + mlxsw_reg_pptb_pm_msb_set(payload, MLXSW_REG_PPTB_ALL_PRIO); +} + +static inline void mlxsw_reg_pptb_prio_to_buff_pack(char *payload, u8 prio, + u8 buff) +{ + mlxsw_reg_pptb_prio_to_buff_set(payload, prio, buff); + mlxsw_reg_pptb_prio_to_buff_msb_set(payload, prio, buff); } /* PBMC - Port Buffer Management Control Register diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c index a3720a0fad7d..074cdda7b6f3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c @@ -194,7 +194,7 @@ static int mlxsw_sp_port_pb_prio_init(struct mlxsw_sp_port *mlxsw_sp_port) mlxsw_reg_pptb_pack(pptb_pl, mlxsw_sp_port->local_port); for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) - mlxsw_reg_pptb_prio_to_buff_set(pptb_pl, i, 0); + mlxsw_reg_pptb_prio_to_buff_pack(pptb_pl, i, 0); return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pptb), pptb_pl); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c index 4af3f2728e47..01cfb7512827 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c @@ -103,7 +103,8 @@ static int mlxsw_sp_port_pg_prio_map(struct mlxsw_sp_port *mlxsw_sp_port, mlxsw_reg_pptb_pack(pptb_pl, mlxsw_sp_port->local_port); for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) - mlxsw_reg_pptb_prio_to_buff_set(pptb_pl, i, prio_tc[i]); + 
mlxsw_reg_pptb_prio_to_buff_pack(pptb_pl, i, prio_tc[i]); + return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pptb), pptb_pl); } -- cgit From e86663c475d384ab5f46cb5637e9b7ad08c5c505 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Jul 2016 15:42:52 -0700 Subject: net: bgmac: Fix infinite loop in bgmac_dma_tx_add() Nothing is decrementing the index "i" while we are cleaning up the fragments we could not successfully transmit. Fixes: 9cde94506eacf ("bgmac: implement scatter/gather support") Reported-by: coverity (CID 1352048) Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bgmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index a6333d38ecc0..25bbae5928d4 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -231,7 +231,7 @@ err_dma: dma_unmap_single(dma_dev, slot->dma_addr, skb_headlen(skb), DMA_TO_DEVICE); - while (i > 0) { + while (i-- > 0) { int index = (ring->end + i) % BGMAC_TX_RING_SLOTS; struct bgmac_slot_info *slot = &ring->slots[index]; u32 ctl1 = le32_to_cpu(ring->cpu_base[index].ctl1); -- cgit From 18d3df3eab23796d7f852f9c6bb60962b8372ced Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 14 Jul 2016 18:00:10 +0200 Subject: vlan: use a valid default mtu value for vlan over macsec macsec can't cope with mtu frames which need vlan tag insertion, and vlan devices set the default mtu equal to the underlying dev's one. By default vlan over macsec devices use invalid mtu, dropping all the large packets. This patch adds a netif helper to check if an upper vlan device needs mtu reduction. The helper is used during vlan devices initialization to set a valid default and during mtu updating to forbid invalid, too big, mtu values.
The helper currently only check if the lower dev is a macsec device, if we get more users, we need to update only the helper (possibly reserving an additional IFF bit). Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ net/8021q/vlan_dev.c | 10 ++++++---- net/8021q/vlan_netlink.c | 7 +++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f45929ce8157..da4b33bea982 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4145,6 +4145,13 @@ static inline void netif_keep_dst(struct net_device *dev) dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM); } +/* return true if dev can't cope with mtu frames that need vlan tag insertion */ +static inline bool netif_reduces_vlan_mtu(struct net_device *dev) +{ + /* TODO: reserve and use an additional IFF bit, if we get more users */ + return dev->priv_flags & IFF_MACSEC; +} + extern struct pernet_operations __net_initdata loopback_net_ops; /* Logging, debugging and troubleshooting/diagnostic helpers. */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 86ae75b77390..516b0e73263c 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -146,10 +146,12 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) { - /* TODO: gotta make sure the underlying layer can handle it, - * maybe an IFF_VLAN_CAPABLE flag for devices? 
- */ - if (vlan_dev_priv(dev)->real_dev->mtu < new_mtu) + struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + unsigned int max_mtu = real_dev->mtu; + + if (netif_reduces_vlan_mtu(real_dev)) + max_mtu -= VLAN_HLEN; + if (max_mtu < new_mtu) return -ERANGE; dev->mtu = new_mtu; diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index c92b52f37d38..1270207f3d7c 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -118,6 +118,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev; + unsigned int max_mtu; __be16 proto; int err; @@ -144,9 +145,11 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; + max_mtu = netif_reduces_vlan_mtu(real_dev) ? real_dev->mtu - VLAN_HLEN : + real_dev->mtu; if (!tb[IFLA_MTU]) - dev->mtu = real_dev->mtu; - else if (dev->mtu > real_dev->mtu) + dev->mtu = max_mtu; + else if (dev->mtu > max_mtu) return -EINVAL; err = vlan_changelink(dev, tb, data); -- cgit From de702da7a823ab0c4a1e53ed79a2695f0d453855 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Jul 2016 16:40:22 -0700 Subject: et131x: Fix logical vs bitwise check in et131x_tx_timeout() We should be using a logical check here instead of a bitwise operation to check if the device is closed already in et131x_tx_timeout(). Reported-by: coverity (CID 146498) Fixes: 38df6492eb511 ("et131x: Add PCIe gigabit ethernet driver et131x to drivers/net") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/ethernet/agere/et131x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/agere/et131x.c b/drivers/net/ethernet/agere/et131x.c index 30defe6c81f2..821d86c38ab2 100644 --- a/drivers/net/ethernet/agere/et131x.c +++ b/drivers/net/ethernet/agere/et131x.c @@ -3851,7 +3851,7 @@ static void et131x_tx_timeout(struct net_device *netdev) unsigned long flags; /* If the device is closed, ignore the timeout */ - if (~(adapter->flags & FMP_ADAPTER_INTERRUPT_IN_USE)) + if (!(adapter->flags & FMP_ADAPTER_INTERRUPT_IN_USE)) return; /* Any nonrecoverable hardware error? -- cgit From ea6ff112b095dce2060c304195904d859c3e2625 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Jul 2016 16:41:16 -0700 Subject: net: nb8800: Fix SKB leak in nb8800_receive() In case nb8800_receive() fails to allocate a fragment, we would leak the SKB freshly allocated and just return, instead, free it. Reported-by: coverity (CID 1341750) Signed-off-by: Florian Fainelli Acked-by: Mans Rullgard Signed-off-by: David S. Miller --- drivers/net/ethernet/aurora/nb8800.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c index 08a23e6b60e9..1a3555d03a96 100644 --- a/drivers/net/ethernet/aurora/nb8800.c +++ b/drivers/net/ethernet/aurora/nb8800.c @@ -259,6 +259,7 @@ static void nb8800_receive(struct net_device *dev, unsigned int i, if (err) { netdev_err(dev, "rx buffer allocation failed\n"); dev->stats.rx_dropped++; + dev_kfree_skb(skb); return; } -- cgit From 8e6ce7ebeb34f0992f56de078c3744fb383657fa Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Jul 2016 16:42:16 -0700 Subject: net: cavium: liquidio: Avoid dma_unmap_single on uninitialized ndata The label lio_xmit_failed is used 3 times through liquidio_xmit() but it always makes a call to dma_unmap_single() using potentially uninitialized variables from "ndata" variable. 
Out of the 3 gotos, 2 run after ndata has been initialized, and had a prior dma_map_single() call. Fix this by adding a new error label: lio_xmit_dma_failed which does this dma_unmap_single() and then proceeds with the lio_xmit_failed fallthrough. Fixes: f21fb3ed364bb ("Add support of Cavium Liquidio ethernet adapters") Reported-by: coverity (CID 1309740) Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 8de79ae63231..0e7e7da8d201 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -2821,7 +2821,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) if (!g) { netif_info(lio, tx_err, lio->netdev, "Transmit scatter gather: glist null!\n"); - goto lio_xmit_failed; + goto lio_xmit_dma_failed; } cmdsetup.s.gather = 1; @@ -2892,7 +2892,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) else status = octnet_send_nic_data_pkt(oct, &ndata, xmit_more); if (status == IQ_SEND_FAILED) - goto lio_xmit_failed; + goto lio_xmit_dma_failed; netif_info(lio, tx_queued, lio->netdev, "Transmit queued successfully\n"); @@ -2906,12 +2906,13 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) return NETDEV_TX_OK; +lio_xmit_dma_failed: + dma_unmap_single(&oct->pci_dev->dev, ndata.cmd.dptr, + ndata.datasize, DMA_TO_DEVICE); lio_xmit_failed: stats->tx_dropped++; netif_info(lio, tx_err, lio->netdev, "IQ%d Transmit dropped:%llu\n", iq_no, stats->tx_dropped); - dma_unmap_single(&oct->pci_dev->dev, ndata.cmd.dptr, - ndata.datasize, DMA_TO_DEVICE); recv_buffer_free(skb); return NETDEV_TX_OK; } -- cgit From 0564bf0afae443deeb16f36e2c39fefff89d05f2 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 16 Jul 
2016 17:08:56 +0300 Subject: net/sched/sch_htb: clamp xstats tokens to fit into 32-bit int In kernel HTB keeps tokens in signed 64-bit in nanoseconds. In netlink protocol these values are converted into psched ticks (64ns for now) and truncated to 32-bit. In struct tc_htb_xstats fields "tokens" and "ctokens" are declared as unsigned 32-bit but they could be negative thus tool 'tc' prints them as signed. Big values lose higher bits and/or become negative. This patch clamps tokens in xstat into range from INT_MIN to INT_MAX. In this way it's easier to understand what's going on here. Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 62f9d8100c6e..052f84d6cc23 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1140,8 +1140,10 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (!cl->level && cl->un.leaf.q) qlen = cl->un.leaf.q->q.qlen; - cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); - cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); + cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens), + INT_MIN, INT_MAX); + cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens), + INT_MIN, INT_MAX); if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || -- cgit From c74bfbdba0e8d056e4ba579a666b5cdb8ec3cd35 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 16 Jul 2016 17:33:15 -0400 Subject: sctp: load transport header after sk_filter Do not cache pointers into the skb linear segment across sk_filter. The function call can trigger pskb_expand_head. Signed-off-by: Willem de Bruijn Acked-by: Daniel Borkmann Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sctp/input.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index a701527a9480..47cf4604d19c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -112,7 +112,6 @@ int sctp_rcv(struct sk_buff *skb) struct sctp_ep_common *rcvr; struct sctp_transport *transport = NULL; struct sctp_chunk *chunk; - struct sctphdr *sh; union sctp_addr src; union sctp_addr dest; int family; @@ -127,8 +126,6 @@ int sctp_rcv(struct sk_buff *skb) if (skb_linearize(skb)) goto discard_it; - sh = sctp_hdr(skb); - /* Pull up the IP and SCTP headers. */ __skb_pull(skb, skb_transport_offset(skb)); if (skb->len < sizeof(struct sctphdr)) @@ -230,7 +227,7 @@ int sctp_rcv(struct sk_buff *skb) chunk->rcvr = rcvr; /* Remember the SCTP header. */ - chunk->sctp_hdr = sh; + chunk->sctp_hdr = sctp_hdr(skb); /* Set the source and destination addresses of the incoming chunk. */ sctp_init_addrs(chunk, &src, &dest); -- cgit From 30f56e3ced0f4966e8a84ece1acceccbbb73d365 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Mon, 18 Jul 2016 18:35:11 +0300 Subject: net/mlx4_en: Move filters cleanup to a proper location Filters cleanup should be done once before destroying net device, since filters list is contained in the private data. Fixes: 1eb8c695bda9 ('net/mlx4_en: Add accelerated RFS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 4 ++++ drivers/net/ethernet/mellanox/mlx4/en_rx.c | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 0c0dfd6cdca6..5d809c8c27c4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2080,6 +2080,10 @@ void mlx4_en_destroy_netdev(struct net_device *dev) mdev->upper[priv->port] = NULL; mutex_unlock(&mdev->state_lock); +#ifdef CONFIG_RFS_ACCEL + mlx4_en_cleanup_filters(priv); +#endif + mlx4_en_free_resources(priv); kfree(priv->tx_ring); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index c1b3a9c8cf3b..99b5407f2278 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -514,9 +514,6 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, ring->rx_info = NULL; kfree(ring); *pring = NULL; -#ifdef CONFIG_RFS_ACCEL - mlx4_en_cleanup_filters(priv); -#endif } void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, -- cgit From ec25bc04ed8e12947738468cbe2191f1529f9e39 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Mon, 18 Jul 2016 18:35:12 +0300 Subject: net/mlx4_en: Add resilience in low memory systems This patch fixes the loss of the Ethernet port on low memory systems, when the driver frees its resources and fails to allocate new resources. Issue could happen while changing number of channels, rings size or changing the timestamp configuration. This fix is necessary because of removing vmap use in the code. When vmap was in use driver could allocate non-contiguous memory and make it contiguous with vmap. Now it could fail to allocate a large chunk of contiguous memory and lose the port. Current code tries to allocate new resources and then upon success frees the old resources. 
Fixes: 73898db04301 ('net/mlx4: Avoid wrong virtual mappings') Signed-off-by: Eugenia Emantayev Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 54 +++++++----- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 106 +++++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 9 +- 3 files changed, 132 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index fc95affaf76b..44cf16d01f42 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -1042,6 +1042,8 @@ static int mlx4_en_set_ringparam(struct net_device *dev, { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_port_profile new_prof; + struct mlx4_en_priv *tmp; u32 rx_size, tx_size; int port_up = 0; int err = 0; @@ -1061,22 +1063,25 @@ static int mlx4_en_set_ringparam(struct net_device *dev, tx_size == priv->tx_ring[0]->size) return 0; + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + mutex_lock(&mdev->state_lock); + memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); + new_prof.tx_ring_size = tx_size; + new_prof.rx_ring_size = rx_size; + err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof); + if (err) + goto out; + if (priv->port_up) { port_up = 1; mlx4_en_stop_port(dev, 1); } - mlx4_en_free_resources(priv); - - priv->prof->tx_ring_size = tx_size; - priv->prof->rx_ring_size = rx_size; + mlx4_en_safe_replace_resources(priv, tmp); - err = mlx4_en_alloc_resources(priv); - if (err) { - en_err(priv, "Failed reallocating port resources\n"); - goto out; - } if (port_up) { err = mlx4_en_start_port(dev); if (err) @@ -1084,8 +1089,8 @@ static int mlx4_en_set_ringparam(struct net_device *dev, } err = mlx4_en_moderation_update(priv); - out: + kfree(tmp); mutex_unlock(&mdev->state_lock); return err; } @@ 
-1714,6 +1719,8 @@ static int mlx4_en_set_channels(struct net_device *dev, { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_port_profile new_prof; + struct mlx4_en_priv *tmp; int port_up = 0; int err = 0; @@ -1723,23 +1730,26 @@ static int mlx4_en_set_channels(struct net_device *dev, !channel->tx_count || !channel->rx_count) return -EINVAL; + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + mutex_lock(&mdev->state_lock); + memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); + new_prof.num_tx_rings_p_up = channel->tx_count; + new_prof.tx_ring_num = channel->tx_count * MLX4_EN_NUM_UP; + new_prof.rx_ring_num = channel->rx_count; + + err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof); + if (err) + goto out; + if (priv->port_up) { port_up = 1; mlx4_en_stop_port(dev, 1); } - mlx4_en_free_resources(priv); - - priv->num_tx_rings_p_up = channel->tx_count; - priv->tx_ring_num = channel->tx_count * MLX4_EN_NUM_UP; - priv->rx_ring_num = channel->rx_count; - - err = mlx4_en_alloc_resources(priv); - if (err) { - en_err(priv, "Failed reallocating port resources\n"); - goto out; - } + mlx4_en_safe_replace_resources(priv, tmp); netif_set_real_num_tx_queues(dev, priv->tx_ring_num); netif_set_real_num_rx_queues(dev, priv->rx_ring_num); @@ -1757,8 +1767,8 @@ static int mlx4_en_set_channels(struct net_device *dev, } err = mlx4_en_moderation_update(priv); - out: + kfree(tmp); mutex_unlock(&mdev->state_lock); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 5d809c8c27c4..8359e9e51b3b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1954,7 +1954,7 @@ static int mlx4_en_close(struct net_device *dev) return 0; } -void mlx4_en_free_resources(struct mlx4_en_priv *priv) +static void mlx4_en_free_resources(struct mlx4_en_priv *priv) { int i; @@ 
-1979,7 +1979,7 @@ void mlx4_en_free_resources(struct mlx4_en_priv *priv) } -int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) +static int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) { struct mlx4_en_port_profile *prof = priv->prof; int i; @@ -2044,6 +2044,77 @@ static void mlx4_en_shutdown(struct net_device *dev) rtnl_unlock(); } +static int mlx4_en_copy_priv(struct mlx4_en_priv *dst, + struct mlx4_en_priv *src, + struct mlx4_en_port_profile *prof) +{ + memcpy(&dst->hwtstamp_config, &prof->hwtstamp_config, + sizeof(dst->hwtstamp_config)); + dst->num_tx_rings_p_up = src->mdev->profile.num_tx_rings_p_up; + dst->tx_ring_num = prof->tx_ring_num; + dst->rx_ring_num = prof->rx_ring_num; + dst->flags = prof->flags; + dst->mdev = src->mdev; + dst->port = src->port; + dst->dev = src->dev; + dst->prof = prof; + dst->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + + DS_SIZE * MLX4_EN_MAX_RX_FRAGS); + + dst->tx_ring = kzalloc(sizeof(struct mlx4_en_tx_ring *) * MAX_TX_RINGS, + GFP_KERNEL); + if (!dst->tx_ring) + return -ENOMEM; + + dst->tx_cq = kzalloc(sizeof(struct mlx4_en_cq *) * MAX_TX_RINGS, + GFP_KERNEL); + if (!dst->tx_cq) { + kfree(dst->tx_ring); + return -ENOMEM; + } + return 0; +} + +static void mlx4_en_update_priv(struct mlx4_en_priv *dst, + struct mlx4_en_priv *src) +{ + memcpy(dst->rx_ring, src->rx_ring, + sizeof(struct mlx4_en_rx_ring *) * src->rx_ring_num); + memcpy(dst->rx_cq, src->rx_cq, + sizeof(struct mlx4_en_cq *) * src->rx_ring_num); + memcpy(&dst->hwtstamp_config, &src->hwtstamp_config, + sizeof(dst->hwtstamp_config)); + dst->tx_ring_num = src->tx_ring_num; + dst->rx_ring_num = src->rx_ring_num; + dst->tx_ring = src->tx_ring; + dst->tx_cq = src->tx_cq; + memcpy(dst->prof, src->prof, sizeof(struct mlx4_en_port_profile)); +} + +int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv, + struct mlx4_en_priv *tmp, + struct mlx4_en_port_profile *prof) +{ + mlx4_en_copy_priv(tmp, priv, prof); + + if (mlx4_en_alloc_resources(tmp)) { + 
en_warn(priv, + "%s: Resource allocation failed, using previous configuration\n", + __func__); + kfree(tmp->tx_ring); + kfree(tmp->tx_cq); + return -ENOMEM; + } + return 0; +} + +void mlx4_en_safe_replace_resources(struct mlx4_en_priv *priv, + struct mlx4_en_priv *tmp) +{ + mlx4_en_free_resources(priv); + mlx4_en_update_priv(priv, tmp); +} + void mlx4_en_destroy_netdev(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -3128,6 +3199,8 @@ int mlx4_en_reset_config(struct net_device *dev, { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_port_profile new_prof; + struct mlx4_en_priv *tmp; int port_up = 0; int err = 0; @@ -3144,19 +3217,29 @@ int mlx4_en_reset_config(struct net_device *dev, return -EINVAL; } + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + mutex_lock(&mdev->state_lock); + + memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); + memcpy(&new_prof.hwtstamp_config, &ts_config, sizeof(ts_config)); + + err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof); + if (err) + goto out; + if (priv->port_up) { port_up = 1; mlx4_en_stop_port(dev, 1); } - mlx4_en_free_resources(priv); - en_warn(priv, "Changing device configuration rx filter(%x) rx vlan(%x)\n", - ts_config.rx_filter, !!(features & NETIF_F_HW_VLAN_CTAG_RX)); + ts_config.rx_filter, + !!(features & NETIF_F_HW_VLAN_CTAG_RX)); - priv->hwtstamp_config.tx_type = ts_config.tx_type; - priv->hwtstamp_config.rx_filter = ts_config.rx_filter; + mlx4_en_safe_replace_resources(priv, tmp); if (DEV_FEATURE_CHANGED(dev, features, NETIF_F_HW_VLAN_CTAG_RX)) { if (features & NETIF_F_HW_VLAN_CTAG_RX) @@ -3190,11 +3273,6 @@ int mlx4_en_reset_config(struct net_device *dev, dev->features &= ~NETIF_F_HW_VLAN_CTAG_RX; } - err = mlx4_en_alloc_resources(priv); - if (err) { - en_err(priv, "Failed reallocating port resources\n"); - goto out; - } if (port_up) { err = mlx4_en_start_port(dev); if (err) @@ -3203,6 
+3281,8 @@ int mlx4_en_reset_config(struct net_device *dev, out: mutex_unlock(&mdev->state_lock); - netdev_features_change(dev); + kfree(tmp); + if (!err) + netdev_features_change(dev); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 467d47ed2c39..13d297ee34bb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -353,12 +353,14 @@ struct mlx4_en_port_profile { u32 rx_ring_num; u32 tx_ring_size; u32 rx_ring_size; + u8 num_tx_rings_p_up; u8 rx_pause; u8 rx_ppp; u8 tx_pause; u8 tx_ppp; int rss_rings; int inline_thold; + struct hwtstamp_config hwtstamp_config; }; struct mlx4_en_profile { @@ -623,8 +625,11 @@ void mlx4_en_set_stats_bitmap(struct mlx4_dev *dev, u8 rx_ppp, u8 rx_pause, u8 tx_ppp, u8 tx_pause); -void mlx4_en_free_resources(struct mlx4_en_priv *priv); -int mlx4_en_alloc_resources(struct mlx4_en_priv *priv); +int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv, + struct mlx4_en_priv *tmp, + struct mlx4_en_port_profile *prof); +void mlx4_en_safe_replace_resources(struct mlx4_en_priv *priv, + struct mlx4_en_priv *tmp); int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq, int entries, int ring, enum cq_type mode, int node); -- cgit From 97b041971e1c70da077fcfa7a50b362e22cd241e Mon Sep 17 00:00:00 2001 From: Douglas Miller Date: Mon, 18 Jul 2016 12:28:45 -0500 Subject: Update maintainer for EHEA driver. Since Thadeu left IBM, EHEA has gone mostly unmaintained, since his email address doesn't work anymore. I'm stepping up to help maintain this driver upstream. I'm adding Thadeu's personal e-mail address in Cc, hoping that we can get his ack. CC: Thadeu Lima de Souza Cascardo Signed-off-by: Douglas Miller Acked-by: Thadeu Lima de Souza Cascardo Signed-off-by: David S. 
Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1209323b7e43..bda882585648 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4477,7 +4477,7 @@ S: Orphan F: fs/efs/ EHEA (IBM pSeries eHEA 10Gb ethernet adapter) DRIVER -M: Thadeu Lima de Souza Cascardo +M: Douglas Miller L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/ibm/ehea/ -- cgit From eabfdda93477f6ee5e153f560560e9cb1c617fd7 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 18 Jul 2016 15:02:06 -0400 Subject: net: switchdev: change ageing_time type to clock_t The switchdev value for the SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME attribute is a clock_t and requires to use helpers such as clock_t_to_jiffies() to convert to milliseconds. Change ageing_time type from u32 to clock_t to make it explicit. Fixes: f55ac58ae64c ("switchdev: add bridge ageing_time attribute") Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 985619a59323..1d8e158241da 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -60,7 +60,7 @@ struct switchdev_attr { struct netdev_phys_item_id ppid; /* PORT_PARENT_ID */ u8 stp_state; /* PORT_STP_STATE */ unsigned long brport_flags; /* PORT_BRIDGE_FLAGS */ - u32 ageing_time; /* BRIDGE_AGEING_TIME */ + clock_t ageing_time; /* BRIDGE_AGEING_TIME */ bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ } u; }; -- cgit From edbe77462302ec0b11a90244de13f9012118c538 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 19 Jul 2016 14:40:51 +0900 Subject: packet: fix second argument of sock_tx_timestamp() This patch fixes an issue that a syscall (e.g. sendto syscall) cannot work correctly. 
Since the sendto syscall doesn't have a msg_control buffer, the sock_tx_timestamp() in packet_snd() cannot work correctly because the sockc.tsflags is set to 0. So, this patch sets the sockc.tsflags to sk->sk_tsflags as default. Fixes: c14ac9451c34 ("sock: enable timestamping using control messages") Reported-by: Kazuya Mizuguchi Reported-by: Keita Kobayashi Signed-off-by: Yoshihiro Shimoda Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9f0983fa4d52..53e87ceb26e7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1927,7 +1927,7 @@ retry: goto out_unlock; } - sockc.tsflags = 0; + sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { @@ -2678,7 +2678,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); } - sockc.tsflags = 0; + sockc.tsflags = po->sk.sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) @@ -2881,7 +2881,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; - sockc.tsflags = 0; + sockc.tsflags = sk->sk_tsflags; sockc.mark = sk->sk_mark; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); -- cgit From 882b0f2fba83374149f0a5869d95aa8b44dad31e Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 21 Jul 2016 00:39:53 +0300 Subject: net/mlx5e: Fix del vxlan port command buffer memset memset the command buffers rather than the pointers to them. Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling") Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c index 05de77267d58..e25a73ed2981 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c @@ -72,8 +72,8 @@ static int mlx5e_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port) u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)]; u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)]; - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); MLX5_SET(delete_vxlan_udp_dport_in, in, opcode, MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); -- cgit From f8e7718cc0445587fe8530fc2d240d9aac2c9072 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Wed, 20 Jul 2016 18:01:18 -0400 Subject: packet: propagate sock_cmsg_send() error sock_cmsg_send() can return different error codes and not only -EINVAL, and we should properly propagate them. Fixes: c14ac9451c34 ("sock: enable timestamping using control messages") Signed-off-by: Soheil Hassas Yeganeh Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 53e87ceb26e7..b43c4015b2f7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1930,10 +1930,8 @@ retry: sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); - if (unlikely(err)) { - err = -EINVAL; + if (unlikely(err)) goto out_unlock; - } } skb->protocol = proto; -- cgit