From 1f4f554a72be0d8c164c2f5bc6ba939a1c624fb4 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 9 Sep 2017 11:58:03 +0300
Subject: net: qualcomm: rmnet: Fix a double free

There is a typo here so we accidentally free "skb" instead of "skbn".
It leads to a double free and a leak.  After discussing with Subash,
it's better to just move the check before the allocation and avoid the
need to free.

Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index 557c9bf1a469..86b8c758f94e 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -84,6 +84,10 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb)
 	if (((int)skb->len - (int)packet_len) < 0)
 		return NULL;
 
+	/* Some hardware can send us empty frames. Catch them */
+	if (ntohs(maph->pkt_len) == 0)
+		return NULL;
+
 	skbn = alloc_skb(packet_len + RMNET_MAP_DEAGGR_SPACING, GFP_ATOMIC);
 	if (!skbn)
 		return NULL;
@@ -94,11 +98,5 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb)
 	memcpy(skbn->data, skb->data, packet_len);
 	skb_pull(skb, packet_len);
 
-	/* Some hardware can send us empty frames. Catch them */
-	if (ntohs(maph->pkt_len) == 0) {
-		kfree_skb(skb);
-		return NULL;
-	}
-
 	return skbn;
 }
-- 
cgit 


From 4400081b631af69abc63cea3352680e3d85e0c39 Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Mon, 11 Sep 2017 09:42:26 +0200
Subject: mlxsw: spectrum: Fix EEPROM access in case of SFP/SFP+

The current code does not handle correctly the access to the upper page
in case of SFP/SFP+ EEPROM. In that case the offset should be local
and the I2C address should be changed.

Fixes: 2ea109039cd3 ("mlxsw: spectrum: Add support for access cable info via ethtool")
Reported-by: Florian Klink <flokli@flokli.de>
Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index ed7cd6c48019..e0804599fcae 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -2545,7 +2545,9 @@ out:
 	return err;
 }
 
-#define MLXSW_SP_QSFP_I2C_ADDR 0x50
+#define MLXSW_SP_I2C_ADDR_LOW 0x50
+#define MLXSW_SP_I2C_ADDR_HIGH 0x51
+#define MLXSW_SP_EEPROM_PAGE_LENGTH 256
 
 static int mlxsw_sp_query_module_eeprom(struct mlxsw_sp_port *mlxsw_sp_port,
 					u16 offset, u16 size, void *data,
@@ -2554,12 +2556,25 @@ static int mlxsw_sp_query_module_eeprom(struct mlxsw_sp_port *mlxsw_sp_port,
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	char eeprom_tmp[MLXSW_SP_REG_MCIA_EEPROM_SIZE];
 	char mcia_pl[MLXSW_REG_MCIA_LEN];
+	u16 i2c_addr;
 	int status;
 	int err;
 
 	size = min_t(u16, size, MLXSW_SP_REG_MCIA_EEPROM_SIZE);
+
+	if (offset < MLXSW_SP_EEPROM_PAGE_LENGTH &&
+	    offset + size > MLXSW_SP_EEPROM_PAGE_LENGTH)
+		/* Cross pages read, read until offset 256 in low page */
+		size = MLXSW_SP_EEPROM_PAGE_LENGTH - offset;
+
+	i2c_addr = MLXSW_SP_I2C_ADDR_LOW;
+	if (offset >= MLXSW_SP_EEPROM_PAGE_LENGTH) {
+		i2c_addr = MLXSW_SP_I2C_ADDR_HIGH;
+		offset -= MLXSW_SP_EEPROM_PAGE_LENGTH;
+	}
+
 	mlxsw_reg_mcia_pack(mcia_pl, mlxsw_sp_port->mapping.module,
-			    0, 0, offset, size, MLXSW_SP_QSFP_I2C_ADDR);
+			    0, 0, offset, size, i2c_addr);
 
 	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcia), mcia_pl);
 	if (err)
-- 
cgit 


From 8195b1396ec86dddbba443c74b2188b423556c74 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Wed, 6 Sep 2017 13:53:05 -0700
Subject: hv_netvsc: fix deadlock on hotplug

When a virtual device is added dynamically (via host console), then
the vmbus sends an offer message for the primary channel. The processing
of this message for networking causes the network device to then
initialize the sub channels.

The problem is that setting up the sub channels needs to wait until
the subsequent subchannel offers have been processed. These offers
come in on the same ring buffer and work queue as where the primary
offer is being processed; leading to a deadlock.

This did not happen in older kernels, because the sub channel waiting
logic was broken (it wasn't really waiting).

The solution is to do the sub channel setup in its own work queue
context that is scheduled by the primary channel setup; and then
happens later.

Fixes: 732e49850c5e ("netvsc: fix race on sub channel creation")
Reported-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h   |   3 +
 drivers/net/hyperv/netvsc.c       |   3 +
 drivers/net/hyperv/netvsc_drv.c   |  11 +---
 drivers/net/hyperv/rndis_filter.c | 122 ++++++++++++++++++++++++++------------
 4 files changed, 94 insertions(+), 45 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index ec546da86683..d98cdfb1536b 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -204,6 +204,8 @@ int netvsc_recv_callback(struct net_device *net,
 			 const struct ndis_pkt_8021q_info *vlan);
 void netvsc_channel_cb(void *context);
 int netvsc_poll(struct napi_struct *napi, int budget);
+
+void rndis_set_subchannel(struct work_struct *w);
 bool rndis_filter_opened(const struct netvsc_device *nvdev);
 int rndis_filter_open(struct netvsc_device *nvdev);
 int rndis_filter_close(struct netvsc_device *nvdev);
@@ -782,6 +784,7 @@ struct netvsc_device {
 	u32 num_chn;
 
 	atomic_t open_chn;
+	struct work_struct subchan_work;
 	wait_queue_head_t subchan_open;
 
 	struct rndis_device *extension;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 0062b802676f..a5511b7326af 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -81,6 +81,7 @@ static struct netvsc_device *alloc_net_device(void)
 
 	init_completion(&net_device->channel_init_wait);
 	init_waitqueue_head(&net_device->subchan_open);
+	INIT_WORK(&net_device->subchan_work, rndis_set_subchannel);
 
 	return net_device;
 }
@@ -557,6 +558,8 @@ void netvsc_device_remove(struct hv_device *device)
 		= rtnl_dereference(net_device_ctx->nvdev);
 	int i;
 
+	cancel_work_sync(&net_device->subchan_work);
+
 	netvsc_disconnect_vsp(device);
 
 	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 165ba4b3b423..c538a4f15f3b 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -853,10 +853,7 @@ static int netvsc_set_channels(struct net_device *net,
 	rndis_filter_device_remove(dev, nvdev);
 
 	nvdev = rndis_filter_device_add(dev, &device_info);
-	if (!IS_ERR(nvdev)) {
-		netif_set_real_num_tx_queues(net, nvdev->num_chn);
-		netif_set_real_num_rx_queues(net, nvdev->num_chn);
-	} else {
+	if (IS_ERR(nvdev)) {
 		ret = PTR_ERR(nvdev);
 		device_info.num_chn = orig;
 		nvdev = rndis_filter_device_add(dev, &device_info);
@@ -1954,9 +1951,6 @@ static int netvsc_probe(struct hv_device *dev,
 		NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
 	net->vlan_features = net->features;
 
-	netif_set_real_num_tx_queues(net, nvdev->num_chn);
-	netif_set_real_num_rx_queues(net, nvdev->num_chn);
-
 	netdev_lockdep_set_classes(net);
 
 	/* MTU range: 68 - 1500 or 65521 */
@@ -2012,9 +2006,10 @@ static int netvsc_remove(struct hv_device *dev)
 	if (vf_netdev)
 		netvsc_unregister_vf(vf_netdev);
 
+	unregister_netdevice(net);
+
 	rndis_filter_device_remove(dev,
 				   rtnl_dereference(ndev_ctx->nvdev));
-	unregister_netdevice(net);
 	rtnl_unlock();
 
 	hv_set_drvdata(dev, NULL);
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 69c40b8fccc3..731bc7cc6f43 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1039,8 +1039,6 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 
 	/* Set the channel before opening.*/
 	nvchan->channel = new_sc;
-	netif_napi_add(ndev, &nvchan->napi,
-		       netvsc_poll, NAPI_POLL_WEIGHT);
 
 	ret = vmbus_open(new_sc, nvscdev->ring_size * PAGE_SIZE,
 			 nvscdev->ring_size * PAGE_SIZE, NULL, 0,
@@ -1048,12 +1046,88 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 	if (ret == 0)
 		napi_enable(&nvchan->napi);
 	else
-		netif_napi_del(&nvchan->napi);
+		netdev_notice(ndev, "sub channel open failed: %d\n", ret);
 
 	atomic_inc(&nvscdev->open_chn);
 	wake_up(&nvscdev->subchan_open);
 }
 
+/* Open sub-channels after completing the handling of the device probe.
+ * This breaks overlap of processing the host message for the
+ * new primary channel with the initialization of sub-channels.
+ */
+void rndis_set_subchannel(struct work_struct *w)
+{
+	struct netvsc_device *nvdev
+		= container_of(w, struct netvsc_device, subchan_work);
+	struct nvsp_message *init_packet = &nvdev->channel_init_pkt;
+	struct net_device_context *ndev_ctx;
+	struct rndis_device *rdev;
+	struct net_device *ndev;
+	struct hv_device *hv_dev;
+	int i, ret;
+
+	if (!rtnl_trylock()) {
+		schedule_work(w);
+		return;
+	}
+
+	rdev = nvdev->extension;
+	if (!rdev)
+		goto unlock;	/* device was removed */
+
+	ndev = rdev->ndev;
+	ndev_ctx = netdev_priv(ndev);
+	hv_dev = ndev_ctx->device_ctx;
+
+	memset(init_packet, 0, sizeof(struct nvsp_message));
+	init_packet->hdr.msg_type = NVSP_MSG5_TYPE_SUBCHANNEL;
+	init_packet->msg.v5_msg.subchn_req.op = NVSP_SUBCHANNEL_ALLOCATE;
+	init_packet->msg.v5_msg.subchn_req.num_subchannels =
+						nvdev->num_chn - 1;
+	ret = vmbus_sendpacket(hv_dev->channel, init_packet,
+			       sizeof(struct nvsp_message),
+			       (unsigned long)init_packet,
+			       VM_PKT_DATA_INBAND,
+			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+	if (ret) {
+		netdev_err(ndev, "sub channel allocate send failed: %d\n", ret);
+		goto failed;
+	}
+
+	wait_for_completion(&nvdev->channel_init_wait);
+	if (init_packet->msg.v5_msg.subchn_comp.status != NVSP_STAT_SUCCESS) {
+		netdev_err(ndev, "sub channel request failed\n");
+		goto failed;
+	}
+
+	nvdev->num_chn = 1 +
+		init_packet->msg.v5_msg.subchn_comp.num_subchannels;
+
+	/* wait for all sub channels to open */
+	wait_event(nvdev->subchan_open,
+		   atomic_read(&nvdev->open_chn) == nvdev->num_chn);
+
+	/* ignore failues from setting rss parameters, still have channels */
+	rndis_filter_set_rss_param(rdev, netvsc_hash_key);
+
+	netif_set_real_num_tx_queues(ndev, nvdev->num_chn);
+	netif_set_real_num_rx_queues(ndev, nvdev->num_chn);
+
+	rtnl_unlock();
+	return;
+
+failed:
+	/* fallback to only primary channel */
+	for (i = 1; i < nvdev->num_chn; i++)
+		netif_napi_del(&nvdev->chan_table[i].napi);
+
+	nvdev->max_chn = 1;
+	nvdev->num_chn = 1;
+unlock:
+	rtnl_unlock();
+}
+
 struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 				      struct netvsc_device_info *device_info)
 {
@@ -1063,7 +1137,6 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 	struct rndis_device *rndis_device;
 	struct ndis_offload hwcaps;
 	struct ndis_offload_params offloads;
-	struct nvsp_message *init_packet;
 	struct ndis_recv_scale_cap rsscap;
 	u32 rsscap_size = sizeof(struct ndis_recv_scale_cap);
 	unsigned int gso_max_size = GSO_MAX_SIZE;
@@ -1215,9 +1288,7 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 							net_device->num_chn);
 
 	atomic_set(&net_device->open_chn, 1);
-
-	if (net_device->num_chn == 1)
-		return net_device;
+	vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open);
 
 	for (i = 1; i < net_device->num_chn; i++) {
 		ret = netvsc_alloc_recv_comp_ring(net_device, i);
@@ -1228,38 +1299,15 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 		}
 	}
 
-	vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open);
+	for (i = 1; i < net_device->num_chn; i++)
+		netif_napi_add(net, &net_device->chan_table[i].napi,
+			       netvsc_poll, NAPI_POLL_WEIGHT);
 
-	init_packet = &net_device->channel_init_pkt;
-	memset(init_packet, 0, sizeof(struct nvsp_message));
-	init_packet->hdr.msg_type = NVSP_MSG5_TYPE_SUBCHANNEL;
-	init_packet->msg.v5_msg.subchn_req.op = NVSP_SUBCHANNEL_ALLOCATE;
-	init_packet->msg.v5_msg.subchn_req.num_subchannels =
-						net_device->num_chn - 1;
-	ret = vmbus_sendpacket(dev->channel, init_packet,
-			       sizeof(struct nvsp_message),
-			       (unsigned long)init_packet,
-			       VM_PKT_DATA_INBAND,
-			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
-	if (ret)
-		goto out;
-
-	wait_for_completion(&net_device->channel_init_wait);
-	if (init_packet->msg.v5_msg.subchn_comp.status != NVSP_STAT_SUCCESS) {
-		ret = -ENODEV;
-		goto out;
-	}
+	if (net_device->num_chn > 1)
+		schedule_work(&net_device->subchan_work);
 
-	net_device->num_chn = 1 +
-		init_packet->msg.v5_msg.subchn_comp.num_subchannels;
-
-	/* wait for all sub channels to open */
-	wait_event(net_device->subchan_open,
-		   atomic_read(&net_device->open_chn) == net_device->num_chn);
-
-	/* ignore failues from setting rss parameters, still have channels */
-	rndis_filter_set_rss_param(rndis_device, netvsc_hash_key);
 out:
+	/* if unavailable, just proceed with one queue */
 	if (ret) {
 		net_device->max_chn = 1;
 		net_device->num_chn = 1;
@@ -1280,10 +1328,10 @@ void rndis_filter_device_remove(struct hv_device *dev,
 	/* Halt and release the rndis device */
 	rndis_filter_halt_device(rndis_dev);
 
-	kfree(rndis_dev);
 	net_dev->extension = NULL;
 
 	netvsc_device_remove(dev);
+	kfree(rndis_dev);
 }
 
 int rndis_filter_open(struct netvsc_device *nvdev)
-- 
cgit 


From 8f2bb1de73344dbedd4195016b782bee7bf3598f Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Wed, 6 Sep 2017 13:53:06 -0700
Subject: hv_netvsc: avoid unnecessary wakeups on subchannel creation

Only need to wakeup the initiator after all sub-channels
are opened.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/rndis_filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 731bc7cc6f43..065b204d8e17 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1048,8 +1048,8 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 	else
 		netdev_notice(ndev, "sub channel open failed: %d\n", ret);
 
-	atomic_inc(&nvscdev->open_chn);
-	wake_up(&nvscdev->subchan_open);
+	if (atomic_inc_return(&nvscdev->open_chn) == nvscdev->num_chn)
+		wake_up(&nvscdev->subchan_open);
 }
 
 /* Open sub-channels after completing the handling of the device probe.
-- 
cgit 


From c6644d07eff6588b2dedf881279fb0d1c7783970 Mon Sep 17 00:00:00 2001
From: Kosuke Tatsukawa <tatsu@ab.jp.nec.com>
Date: Wed, 6 Sep 2017 22:47:59 +0000
Subject: net: bonding: Fix transmit load balancing in balance-alb mode if
 specified by sysfs

Commit cbf5ecb30560 ("net: bonding: Fix transmit load balancing in
balance-alb mode") tried to fix transmit dynamic load balancing in
balance-alb mode, which wasn't working after commit 8b426dc54cf4
("bonding: remove hardcoded value").

It turned out that my previous patch only fixed the case when
balance-alb was specified as bonding module parameter, and not when
balance-alb mode was set using /sys/class/net/*/bonding/mode (the most
common usage).  In the latter case, tlb_dynamic_lb was set up according
to the default mode of the bonding interface, which happens to be
balance-rr.

This additional patch addresses this issue by setting up tlb_dynamic_lb
to 1 if "mode" is set to balance-alb through the sysfs interface.

I didn't add code to change tlb_balance_lb back to the default value for
other modes, because "mode" is usually set up only once during
initialization, and it's not worthwhile to change the static variable
bonding_defaults in bond_main.c to a global variable just for this
purpose.

Commit 8b426dc54cf4 also changes the value of tlb_dynamic_lb for
balance-tlb mode if it is set up using the sysfs interface.  I didn't
change that behavior, because the value of tlb_balance_lb can be changed
using the sysfs interface for balance-tlb, and I didn't like changing
the default value back and forth for balance-tlb.

As for balance-alb, /sys/class/net/*/bonding/tlb_balance_lb cannot be
written to.  However, I think balance-alb with tlb_dynamic_lb set to 0
is not an intended usage, so there is little use making it writable at
this moment.

Fixes: 8b426dc54cf4 ("bonding: remove hardcoded value")
Reported-by: Reinis Rozitis <r@roze.lv>
Signed-off-by: Kosuke Tatsukawa <tatsu@ab.jp.nec.com>
Cc: stable@vger.kernel.org  # v4.12+
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_options.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index a12d603d41c6..5931aa2fe997 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -754,6 +754,9 @@ static int bond_option_mode_set(struct bonding *bond,
 			   bond->params.miimon);
 	}
 
+	if (newval->value == BOND_MODE_ALB)
+		bond->params.tlb_dynamic_lb = 1;
+
 	/* don't cache arp_validate between modes */
 	bond->params.arp_validate = BOND_ARP_VALIDATE_NONE;
 	bond->params.mode = newval->value;
-- 
cgit 


From 609320c8a22715b74b39796930c3542719f8ab62 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 7 Sep 2017 18:36:15 -0700
Subject: perf/bpf: fix a clang compilation issue

clang does not support variable length array for structure member.
It has the following error during compilation:

kernel/trace/trace_syscalls.c:568:17: error: fields must have a constant size:
'variable length array in structure' extension will never be supported
                unsigned long args[sys_data->nb_args];
                              ^

The fix is to use a fixed array length instead.

Reported-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/syscalls.h      | 2 ++
 kernel/trace/trace_syscalls.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 88951b795ee3..95606a2d556f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -200,6 +200,8 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
 #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
 #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
 
+#define SYSCALL_DEFINE_MAXARGS	6
+
 #define SYSCALL_DEFINEx(x, sname, ...)				\
 	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
 	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9c4eef20301c..696afe72d3b1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -565,7 +565,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
 	struct syscall_tp_t {
 		unsigned long long regs;
 		unsigned long syscall_nr;
-		unsigned long args[sys_data->nb_args];
+		unsigned long args[SYSCALL_DEFINE_MAXARGS];
 	} param;
 	int i;
 
-- 
cgit 


From 96c5508e3012ed0984ab93821d64ac1ff3279c09 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Sun, 10 Sep 2017 09:47:02 +0200
Subject: xdp: implement xdp_redirect_map for generic XDP

Using bpf_redirect_map is allowed for generic XDP programs, but the
appropriate map lookup was never performed in xdp_do_generic_redirect().

Instead the map-index is directly used as the ifindex.  For the
xdp_redirect_map sample in SKB-mode '-S', this resulted in trying
sending on ifindex 0 which isn't valid, resulting in getting SKB
packets dropped.  Thus, the reported performance numbers are wrong in
commit 24251c264798 ("samples/bpf: add option for native and skb mode
for redirect apps") for the 'xdp_redirect_map -S' case.

Before commit 109980b894e9 ("bpf: don't select potentially stale
ri->map from buggy xdp progs") it could crash the kernel.  Like this
commit also check that the map_owner owner is correct before
dereferencing the map pointer.  But make sure that this API misusage
can be caught by a tracepoint. Thus, allowing userspace via
tracepoints to detect misbehaving bpf_progs.

Fixes: 6103aa96ec07 ("net: implement XDP_REDIRECT for xdp generic")
Fixes: 24251c264798 ("samples/bpf: add option for native and skb mode for redirect apps")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/xdp.h |  4 ++--
 net/core/filter.c          | 38 ++++++++++++++++++++++++++------------
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 862575ac8da9..4e16c43fba10 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -138,11 +138,11 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
 	 trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0,	\
-				0, map, idx);
+				0, map, idx)
 
 #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err)	\
 	 trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0,	\
-				    err, map, idx);
+				    err, map, idx)
 
 #endif /* _TRACE_XDP_H */
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 3a50a9b021e2..24dd33dd9f04 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2506,21 +2506,19 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	const struct bpf_prog *map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
+	struct net_device *fwd = NULL;
 	u32 index = ri->ifindex;
-	struct net_device *fwd;
 	int err;
 
 	ri->ifindex = 0;
 	ri->map = NULL;
 	ri->map_owner = NULL;
 
-	/* This is really only caused by a deliberately crappy
-	 * BPF program, normally we would never hit that case,
-	 * so no need to inform someone via tracepoints either,
-	 * just bail out.
-	 */
-	if (unlikely(map_owner != xdp_prog))
-		return -EINVAL;
+	if (unlikely(map_owner != xdp_prog)) {
+		err = -EFAULT;
+		map = NULL;
+		goto err;
+	}
 
 	fwd = __dev_map_lookup_elem(map, index);
 	if (!fwd) {
@@ -2576,13 +2574,27 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 			    struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	const struct bpf_prog *map_owner = ri->map_owner;
+	struct bpf_map *map = ri->map;
+	struct net_device *fwd = NULL;
 	u32 index = ri->ifindex;
-	struct net_device *fwd;
 	unsigned int len;
 	int err = 0;
 
-	fwd = dev_get_by_index_rcu(dev_net(dev), index);
 	ri->ifindex = 0;
+	ri->map = NULL;
+	ri->map_owner = NULL;
+
+	if (map) {
+		if (unlikely(map_owner != xdp_prog)) {
+			err = -EFAULT;
+			map = NULL;
+			goto err;
+		}
+		fwd = __dev_map_lookup_elem(map, index);
+	} else {
+		fwd = dev_get_by_index_rcu(dev_net(dev), index);
+	}
 	if (unlikely(!fwd)) {
 		err = -EINVAL;
 		goto err;
@@ -2600,10 +2612,12 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 	}
 
 	skb->dev = fwd;
-	_trace_xdp_redirect(dev, xdp_prog, index);
+	map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index)
+		: _trace_xdp_redirect(dev, xdp_prog, index);
 	return 0;
 err:
-	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
+	map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err)
+		: _trace_xdp_redirect_err(dev, xdp_prog, index, err);
 	return err;
 }
 EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
-- 
cgit 


From 33e34e735fd4227b49735f1fc059dc9646abd1c6 Mon Sep 17 00:00:00 2001
From: David Lebrun <dlebrun@google.com>
Date: Sun, 10 Sep 2017 14:22:01 +0100
Subject: ipv6: sr: remove duplicate routing header type check

As seg6_validate_srh() already checks that the Routing Header type is
correct, it is not necessary to do it again in get_srh().

Fixes: 5829d70b ("ipv6: sr: fix get_srh() to comply with IPv6 standard "RFC 8200")
Signed-off-by: David Lebrun <dlebrun@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_local.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 7ff54db73a48..825b8e01f947 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -72,10 +72,6 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb)
 
 	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
 
-	/* make sure it's a Segment Routing header (Routing Type 4) */
-	if (srh->type != IPV6_SRCRT_TYPE_4)
-		return NULL;
-
 	len = (srh->hdrlen + 1) << 3;
 
 	if (!pskb_may_pull(skb, srhoff + len))
-- 
cgit 


From 230cfd2dbc228a6992287d31c5d93bc6c2552024 Mon Sep 17 00:00:00 2001
From: Josh Hunt <johunt@akamai.com>
Date: Sun, 10 Sep 2017 15:48:50 -0400
Subject: net/sched: fix pointer check in gen_handle

Fixes sparse warning about pointer in gen_handle:
net/sched/cls_rsvp.h:392:40: warning: Using plain integer as NULL pointer

Fixes: 8113c095672f6 ("net_sched: use void pointer for filter handle")
Signed-off-by: Josh Hunt <johunt@akamai.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_rsvp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 98c05db85bcb..b1f6ed48bc72 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -389,7 +389,7 @@ static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
 		if ((data->hgenerator += 0x10000) == 0)
 			data->hgenerator = 0x10000;
 		h = data->hgenerator|salt;
-		if (rsvp_get(tp, h) == 0)
+		if (!rsvp_get(tp, h))
 			return h;
 	}
 	return 0;
-- 
cgit 


From 9c0827317f235865ae421293f8aecf6cb327a63e Mon Sep 17 00:00:00 2001
From: Nisar Sayed <Nisar.Sayed@microchip.com>
Date: Mon, 11 Sep 2017 17:43:11 +0000
Subject: smsc95xx: Configure pause time to 0xffff when tx flow control enabled

Configure pause time to 0xffff when tx flow control enabled

Set pause time to 0xffff in the pause frame to indicate the
partner to stop sending the packets. When RX buffer frees up,
the device sends pause frame with pause time zero for partner to
resume transmission.

Fixes: 2f7ca802bdae ("Add SMSC LAN9500 USB2.0 10/100 ethernet adapter driver")
Signed-off-by: Nisar Sayed <Nisar.Sayed@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/smsc95xx.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index 340c13484e5c..309b88acd3d0 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -526,7 +526,7 @@ static void smsc95xx_set_multicast(struct net_device *netdev)
 static int smsc95xx_phy_update_flowcontrol(struct usbnet *dev, u8 duplex,
 					   u16 lcladv, u16 rmtadv)
 {
-	u32 flow, afc_cfg = 0;
+	u32 flow = 0, afc_cfg;
 
 	int ret = smsc95xx_read_reg(dev, AFC_CFG, &afc_cfg);
 	if (ret < 0)
@@ -537,20 +537,19 @@ static int smsc95xx_phy_update_flowcontrol(struct usbnet *dev, u8 duplex,
 
 		if (cap & FLOW_CTRL_RX)
 			flow = 0xFFFF0002;
-		else
-			flow = 0;
 
-		if (cap & FLOW_CTRL_TX)
+		if (cap & FLOW_CTRL_TX) {
 			afc_cfg |= 0xF;
-		else
+			flow |= 0xFFFF0000;
+		} else {
 			afc_cfg &= ~0xF;
+		}
 
 		netif_dbg(dev, link, dev->net, "rx pause %s, tx pause %s\n",
 				   cap & FLOW_CTRL_RX ? "enabled" : "disabled",
 				   cap & FLOW_CTRL_TX ? "enabled" : "disabled");
 	} else {
 		netif_dbg(dev, link, dev->net, "half duplex\n");
-		flow = 0;
 		afc_cfg |= 0xF;
 	}
 
-- 
cgit 


From 5829e62ac17a40ab08c1b905565604a4b5fa7af6 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Mon, 11 Sep 2017 21:56:20 +0200
Subject: openvswitch: Fix an error handling path in
 'ovs_nla_init_match_and_action()'

All other error handling paths in this function go through the 'error'
label. This one should do the same.

Fixes: 9cc9a5cb176c ("datapath: Avoid using stack larger than 1024.")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 76cf273a56c7..c3aec6227c91 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1112,7 +1112,8 @@ static int ovs_nla_init_match_and_action(struct net *net,
 		if (!a[OVS_FLOW_ATTR_KEY]) {
 			OVS_NLERR(log,
 				  "Flow key attribute not present in set flow.");
-			return -EINVAL;
+			error = -EINVAL;
+			goto error;
 		}
 
 		*acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key,
-- 
cgit 


From da8ab57863ed7e912d10b179b6bdc652f635bd19 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 Sep 2017 15:58:38 -0700
Subject: tcp/dccp: remove reqsk_put() from inet_child_forget()

Back in linux-4.4, I inadvertently put a call to reqsk_put() in
inet_child_forget(), forgetting it could be called from two different
points.

In the case it is called from inet_csk_reqsk_queue_add(), we want to
keep the reference on the request socket, since it is released later by
the caller (tcp_v{4|6}_rcv())

This bug never showed up because atomic_dec_and_test() was not signaling
the underflow, and SLAB_DESTROY_BY RCU semantic for request sockets
prevented the request to be put in quarantine.

Recent conversion of socket refcount from atomic_t to refcount_t finally
exposed the bug.

So move the reqsk_put() to inet_csk_listen_stop() to fix this.

Thanks to Shankara Pailoor for using syzkaller and providing
a nice set of .config and C repro.

WARNING: CPU: 2 PID: 4277 at lib/refcount.c:186
refcount_sub_and_test+0x167/0x1b0 lib/refcount.c:186
Kernel panic - not syncing: panic_on_warn set ...

CPU: 2 PID: 4277 Comm: syz-executor0 Not tainted 4.13.0-rc7 #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
Ubuntu-1.8.2-1ubuntu1 04/01/2014
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0xf7/0x1aa lib/dump_stack.c:52
 panic+0x1ae/0x3a7 kernel/panic.c:180
 __warn+0x1c4/0x1d9 kernel/panic.c:541
 report_bug+0x211/0x2d0 lib/bug.c:183
 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190
 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline]
 do_trap+0x260/0x390 arch/x86/kernel/traps.c:273
 do_error_trap+0x118/0x340 arch/x86/kernel/traps.c:310
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323
 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:846
RIP: 0010:refcount_sub_and_test+0x167/0x1b0 lib/refcount.c:186
RSP: 0018:ffff88006e006b60 EFLAGS: 00010286
RAX: 0000000000000026 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000026 RSI: 1ffff1000dc00d2c RDI: ffffed000dc00d60
RBP: ffff88006e006bf0 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff1000dc00d6d
R13: 00000000ffffffff R14: 0000000000000001 R15: ffff88006ce9d340
 refcount_dec_and_test+0x1a/0x20 lib/refcount.c:211
 reqsk_put+0x71/0x2b0 include/net/request_sock.h:123
 tcp_v4_rcv+0x259e/0x2e20 net/ipv4/tcp_ipv4.c:1729
 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_local_deliver+0x1ce/0x6d0 net/ipv4/ip_input.c:257
 dst_input include/net/dst.h:477 [inline]
 ip_rcv_finish+0x8db/0x19c0 net/ipv4/ip_input.c:397
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_rcv+0xc3f/0x17d0 net/ipv4/ip_input.c:488
 __netif_receive_skb_core+0x1fb7/0x31f0 net/core/dev.c:4298
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4336
 process_backlog+0x1c5/0x6d0 net/core/dev.c:5102
 napi_poll net/core/dev.c:5499 [inline]
 net_rx_action+0x6d3/0x14a0 net/core/dev.c:5565
 __do_softirq+0x2cb/0xb2d kernel/softirq.c:284
 do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:898
 </IRQ>
 do_softirq.part.16+0x63/0x80 kernel/softirq.c:328
 do_softirq kernel/softirq.c:176 [inline]
 __local_bh_enable_ip+0x84/0x90 kernel/softirq.c:181
 local_bh_enable include/linux/bottom_half.h:31 [inline]
 rcu_read_unlock_bh include/linux/rcupdate.h:705 [inline]
 ip_finish_output2+0x8ad/0x1360 net/ipv4/ip_output.c:231
 ip_finish_output+0x74e/0xb80 net/ipv4/ip_output.c:317
 NF_HOOK_COND include/linux/netfilter.h:237 [inline]
 ip_output+0x1cc/0x850 net/ipv4/ip_output.c:405
 dst_output include/net/dst.h:471 [inline]
 ip_local_out+0x95/0x160 net/ipv4/ip_output.c:124
 ip_queue_xmit+0x8c6/0x1810 net/ipv4/ip_output.c:504
 tcp_transmit_skb+0x1963/0x3320 net/ipv4/tcp_output.c:1123
 tcp_send_ack.part.35+0x38c/0x620 net/ipv4/tcp_output.c:3575
 tcp_send_ack+0x49/0x60 net/ipv4/tcp_output.c:3545
 tcp_rcv_synsent_state_process net/ipv4/tcp_input.c:5795 [inline]
 tcp_rcv_state_process+0x4876/0x4b60 net/ipv4/tcp_input.c:5930
 tcp_v4_do_rcv+0x58a/0x820 net/ipv4/tcp_ipv4.c:1483
 sk_backlog_rcv include/net/sock.h:907 [inline]
 __release_sock+0x124/0x360 net/core/sock.c:2223
 release_sock+0xa4/0x2a0 net/core/sock.c:2715
 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline]
 __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643
 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682
 SYSC_connect+0x204/0x470 net/socket.c:1628
 SyS_connect+0x24/0x30 net/socket.c:1609
 entry_SYSCALL_64_fastpath+0x18/0xad
RIP: 0033:0x451e59
RSP: 002b:00007f474843fc08 EFLAGS: 00000216 ORIG_RAX: 000000000000002a
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 0000000000451e59
RDX: 0000000000000010 RSI: 0000000020002000 RDI: 0000000000000007
RBP: 0000000000000046 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000216 R12: 0000000000000000
R13: 00007ffc040a0f8f R14: 00007f47484409c0 R15: 0000000000000000

Fixes: ebb516af60e1 ("tcp/dccp: fix race at listener dismantle phase")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Shankara Pailoor <sp3485@columbia.edu>
Tested-by: Shankara Pailoor <sp3485@columbia.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4089c013cb03..b9c64b40a83a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -916,7 +916,6 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
 		tcp_sk(child)->fastopen_rsk = NULL;
 	}
 	inet_csk_destroy_sock(child);
-	reqsk_put(req);
 }
 
 struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
@@ -987,6 +986,7 @@ void inet_csk_listen_stop(struct sock *sk)
 		sock_hold(child);
 
 		inet_child_forget(sk, req, child);
+		reqsk_put(req);
 		bh_unlock_sock(child);
 		local_bh_enable();
 		sock_put(child);
-- 
cgit 


From d7fb60b9cafb982cb2e46a267646a8dfd4f2e5da Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 11 Sep 2017 16:33:30 -0700
Subject: net_sched: get rid of tcfa_rcu

gen estimator has been rewritten in commit 1c0d32fde5bd
("net_sched: gen_estimator: complete rewrite of rate estimators"),
the caller is no longer needed to wait for a grace period.
So this patch gets rid of it.

This also completely closes a race condition between action free
path and filter chain add/remove path for the following patch.
Because otherwise the nested RCU callback can't be caught by
rcu_barrier().

Please see also the comments in code.

Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h |  2 --
 net/sched/act_api.c   | 17 ++++++++---------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 8f3d5d8b5ae0..b944e0eb93be 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -34,7 +34,6 @@ struct tc_action {
 	struct gnet_stats_queue		tcfa_qstats;
 	struct net_rate_estimator __rcu *tcfa_rate_est;
 	spinlock_t			tcfa_lock;
-	struct rcu_head			tcfa_rcu;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue __percpu *cpu_qstats;
 	struct tc_cookie	*act_cookie;
@@ -50,7 +49,6 @@ struct tc_action {
 #define tcf_qstats	common.tcfa_qstats
 #define tcf_rate_est	common.tcfa_rate_est
 #define tcf_lock	common.tcfa_lock
-#define tcf_rcu		common.tcfa_rcu
 
 /* Update lastuse only if needed, to avoid dirtying a cache line.
  * We use a temp variable to avoid fetching jiffies twice.
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index a306974e2fb4..fcd7dc7b807a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -53,10 +53,13 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a,
 	res->goto_tp = rcu_dereference_bh(chain->filter_chain);
 }
 
-static void free_tcf(struct rcu_head *head)
+/* XXX: For standalone actions, we don't need a RCU grace period either, because
+ * actions are always connected to filters and filters are already destroyed in
+ * RCU callbacks, so after a RCU grace period actions are already disconnected
+ * from filters. Readers later can not find us.
+ */
+static void free_tcf(struct tc_action *p)
 {
-	struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu);
-
 	free_percpu(p->cpu_bstats);
 	free_percpu(p->cpu_qstats);
 
@@ -76,11 +79,7 @@ static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
 	idr_remove_ext(&idrinfo->action_idr, p->tcfa_index);
 	spin_unlock_bh(&idrinfo->lock);
 	gen_kill_estimator(&p->tcfa_rate_est);
-	/*
-	 * gen_estimator est_timer() might access p->tcfa_lock
-	 * or bstats, wait a RCU grace period before freeing p
-	 */
-	call_rcu(&p->tcfa_rcu, free_tcf);
+	free_tcf(p);
 }
 
 int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
@@ -259,7 +258,7 @@ void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est)
 {
 	if (est)
 		gen_kill_estimator(&a->tcfa_rate_est);
-	call_rcu(&a->tcfa_rcu, free_tcf);
+	free_tcf(a);
 }
 EXPORT_SYMBOL(tcf_idr_cleanup);
 
-- 
cgit 


From e2ef75445340ca7ec2c4558f84ae6c8c5d650fc8 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 11 Sep 2017 16:33:31 -0700
Subject: net_sched: fix reference counting of tc filter chain

This patch fixes the following ugliness of tc filter chain refcnt:

a) tp proto should hold a refcnt to the chain too. This significantly
   simplifies the logic.

b) Chain 0 is no longer special, it is created with refcnt=1 like any
   other chains. All the ugliness in tcf_chain_put() can be gone!

c) No need to handle the flushing oddly, because block still holds
   chain 0, it can not be released, this guarantees block is the last
   user.

d) The race condition with RCU callbacks is easier to handle with just
   a rcu_barrier(). Much easier to understand, nothing to hide. Thanks
   to the previous patch. Please see also the comments in code.

e) Make the code understandable by humans, much less error-prone.

Fixes: 744a4cf63e52 ("net: sched: fix use after free when tcf_chain_destroy is called multiple times")
Fixes: 5bc1701881e3 ("net: sched: introduce multichain support for filters")
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 47 ++++++++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index c743f03cfebd..d29e79d98a69 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -182,7 +182,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
 	list_add_tail(&chain->list, &block->chain_list);
 	chain->block = block;
 	chain->index = chain_index;
-	chain->refcnt = 0;
+	chain->refcnt = 1;
 	return chain;
 }
 
@@ -194,21 +194,20 @@ static void tcf_chain_flush(struct tcf_chain *chain)
 		RCU_INIT_POINTER(*chain->p_filter_chain, NULL);
 	while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
 		RCU_INIT_POINTER(chain->filter_chain, tp->next);
+		tcf_chain_put(chain);
 		tcf_proto_destroy(tp);
 	}
 }
 
 static void tcf_chain_destroy(struct tcf_chain *chain)
 {
-	/* May be already removed from the list by the previous call. */
-	if (!list_empty(&chain->list))
-		list_del_init(&chain->list);
+	list_del(&chain->list);
+	kfree(chain);
+}
 
-	/* There might still be a reference held when we got here from
-	 * tcf_block_put. Wait for the user to drop reference before free.
-	 */
-	if (!chain->refcnt)
-		kfree(chain);
+static void tcf_chain_hold(struct tcf_chain *chain)
+{
+	++chain->refcnt;
 }
 
 struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
@@ -217,24 +216,19 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
 	struct tcf_chain *chain;
 
 	list_for_each_entry(chain, &block->chain_list, list) {
-		if (chain->index == chain_index)
-			goto incref;
+		if (chain->index == chain_index) {
+			tcf_chain_hold(chain);
+			return chain;
+		}
 	}
-	chain = create ? tcf_chain_create(block, chain_index) : NULL;
 
-incref:
-	if (chain)
-		chain->refcnt++;
-	return chain;
+	return create ? tcf_chain_create(block, chain_index) : NULL;
 }
 EXPORT_SYMBOL(tcf_chain_get);
 
 void tcf_chain_put(struct tcf_chain *chain)
 {
-	/* Destroy unused chain, with exception of chain 0, which is the
-	 * default one and has to be always present.
-	 */
-	if (--chain->refcnt == 0 && !chain->filter_chain && chain->index != 0)
+	if (--chain->refcnt == 0)
 		tcf_chain_destroy(chain);
 }
 EXPORT_SYMBOL(tcf_chain_put);
@@ -279,10 +273,19 @@ void tcf_block_put(struct tcf_block *block)
 	if (!block)
 		return;
 
+	/* XXX: Standalone actions are not allowed to jump to any chain, and
+	 * bound actions should be all removed after flushing. However,
+	 * filters are destroyed in RCU callbacks, we have to flush and wait
+	 * for them inside the loop, otherwise we race with RCU callbacks on
+	 * this list.
+	 */
 	list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
 		tcf_chain_flush(chain);
-		tcf_chain_destroy(chain);
+		rcu_barrier();
 	}
+
+	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+		tcf_chain_put(chain);
 	kfree(block);
 }
 EXPORT_SYMBOL(tcf_block_put);
@@ -360,6 +363,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain,
 		rcu_assign_pointer(*chain->p_filter_chain, tp);
 	RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info));
 	rcu_assign_pointer(*chain_info->pprev, tp);
+	tcf_chain_hold(chain);
 }
 
 static void tcf_chain_tp_remove(struct tcf_chain *chain,
@@ -371,6 +375,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain,
 	if (chain->p_filter_chain && tp == chain->filter_chain)
 		RCU_INIT_POINTER(*chain->p_filter_chain, next);
 	RCU_INIT_POINTER(*chain_info->pprev, next);
+	tcf_chain_put(chain);
 }
 
 static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
-- 
cgit 


From 1697c4bb5245649a23f06a144cc38c06715e1b65 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 11 Sep 2017 16:33:32 -0700
Subject: net_sched: carefully handle tcf_block_put()

As pointed out by Jiri, there is still a race condition between
tcf_block_put() and tcf_chain_destroy() in a RCU callback. There
is no way to make it correct without proper locking or synchronization,
because both operate on a shared list.

Locking is hard, because the only lock we can pick here is a spinlock,
however, in tc_dump_tfilter() we iterate this list with a sleeping
function called (tcf_chain_dump()), which makes using a lock to protect
chain_list almost impossible.

Jiri suggested the idea of holding a refcnt before flushing, this works
because it guarantees us there would be no parallel tcf_chain_destroy()
during the loop, therefore the race condition is gone. But we have to
be very careful with proper synchronization with RCU callbacks.

Suggested-by: Jiri Pirko <jiri@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index d29e79d98a69..0b2219adf520 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -275,15 +275,27 @@ void tcf_block_put(struct tcf_block *block)
 
 	/* XXX: Standalone actions are not allowed to jump to any chain, and
 	 * bound actions should be all removed after flushing. However,
-	 * filters are destroyed in RCU callbacks, we have to flush and wait
-	 * for them inside the loop, otherwise we race with RCU callbacks on
-	 * this list.
+	 * filters are destroyed in RCU callbacks, we have to hold the chains
+	 * first, otherwise we would always race with RCU callbacks on this list
+	 * without proper locking.
 	 */
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
+
+	/* Wait for existing RCU callbacks to cool down. */
+	rcu_barrier();
+
+	/* Hold a refcnt for all chains, except 0, in case they are gone. */
+	list_for_each_entry(chain, &block->chain_list, list)
+		if (chain->index)
+			tcf_chain_hold(chain);
+
+	/* No race on the list, because no chain could be destroyed. */
+	list_for_each_entry(chain, &block->chain_list, list)
 		tcf_chain_flush(chain);
-		rcu_barrier();
-	}
 
+	/* Wait for RCU callbacks to release the reference count. */
+	rcu_barrier();
+
+	/* At this point, all the chains should have refcnt == 1. */
 	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
 		tcf_chain_put(chain);
 	kfree(block);
-- 
cgit 


From 6399ebcccffa12e65bc15eda039d37673264ebce Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Tue, 12 Sep 2017 08:50:53 +0200
Subject: mlxsw: spectrum: Prevent mirred-related crash on removal

When removing the offloading of mirred actions under
matchall classifiers, mlxsw would find the destination port
associated with the offloaded action and utilize it for undoing
the configuration.

Depending on the order by which ports are removed, it's possible that
the destination port would get removed before the source port.
In such a scenario, when actions would be flushed for the source port
mlxsw would perform an illegal dereference as the destination port is
no longer listed.

Since the only item necessary for undoing the configuration on the
destination side is the port-id and that in turn is already maintained
by mlxsw on the source-port, simply stop trying to access the
destination port and use the port-id directly instead.

Fixes: 763b4b70af ("mlxsw: spectrum: Add support in matchall mirror TC offloading")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index e0804599fcae..696b99e65a5a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -575,15 +575,14 @@ static void mlxsw_sp_span_entry_destroy(struct mlxsw_sp *mlxsw_sp,
 }
 
 static struct mlxsw_sp_span_entry *
-mlxsw_sp_span_entry_find(struct mlxsw_sp_port *port)
+mlxsw_sp_span_entry_find(struct mlxsw_sp *mlxsw_sp, u8 local_port)
 {
-	struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp;
 	int i;
 
 	for (i = 0; i < mlxsw_sp->span.entries_count; i++) {
 		struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i];
 
-		if (curr->used && curr->local_port == port->local_port)
+		if (curr->used && curr->local_port == local_port)
 			return curr;
 	}
 	return NULL;
@@ -594,7 +593,8 @@ static struct mlxsw_sp_span_entry
 {
 	struct mlxsw_sp_span_entry *span_entry;
 
-	span_entry = mlxsw_sp_span_entry_find(port);
+	span_entry = mlxsw_sp_span_entry_find(port->mlxsw_sp,
+					      port->local_port);
 	if (span_entry) {
 		/* Already exists, just take a reference */
 		span_entry->ref_count++;
@@ -783,12 +783,13 @@ err_port_bind:
 }
 
 static void mlxsw_sp_span_mirror_remove(struct mlxsw_sp_port *from,
-					struct mlxsw_sp_port *to,
+					u8 destination_port,
 					enum mlxsw_sp_span_type type)
 {
 	struct mlxsw_sp_span_entry *span_entry;
 
-	span_entry = mlxsw_sp_span_entry_find(to);
+	span_entry = mlxsw_sp_span_entry_find(from->mlxsw_sp,
+					      destination_port);
 	if (!span_entry) {
 		netdev_err(from->dev, "no span entry found\n");
 		return;
@@ -1563,14 +1564,12 @@ static void
 mlxsw_sp_port_del_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port,
 				      struct mlxsw_sp_port_mall_mirror_tc_entry *mirror)
 {
-	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	enum mlxsw_sp_span_type span_type;
-	struct mlxsw_sp_port *to_port;
 
-	to_port = mlxsw_sp->ports[mirror->to_local_port];
 	span_type = mirror->ingress ?
 			MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS;
-	mlxsw_sp_span_mirror_remove(mlxsw_sp_port, to_port, span_type);
+	mlxsw_sp_span_mirror_remove(mlxsw_sp_port, mirror->to_local_port,
+				    span_type);
 }
 
 static int
-- 
cgit 


From 833a8b405465e935a1ff7ab086b54a3ef90437ca Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Tue, 12 Sep 2017 17:47:56 +0800
Subject: ip_tunnel: fix ip tunnel lookup in collect_md mode

In collect_md mode, if the tun dev is down, it still can call
ip_tunnel_rcv to receive on packets, and the rx statistics increase
improperly.

When the md tunnel is down, it's not neccessary to increase RX drops
for the tunnel device, packets would be recieved on fallback tunnel,
and the RX drops on fallback device will be increased as expected.

Fixes: 2e15ea390e6f ("ip_gre: Add support to collect tunnel metadata.")
Cc: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index e1856bfa753d..e9805ad664ac 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -176,7 +176,7 @@ skip_key_lookup:
 		return cand;
 
 	t = rcu_dereference(itn->collect_md_tun);
-	if (t)
+	if (t && t->dev->flags & IFF_UP)
 		return t;
 
 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
-- 
cgit 


From 6c1cb4393cc7e7107e4e94a9a0744451296ca8a6 Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Tue, 12 Sep 2017 17:47:57 +0800
Subject: ip6_tunnel: fix ip6 tunnel lookup in collect_md mode

In collect_md mode, if the tun dev is down, it still can call
__ip6_tnl_rcv to receive on packets, and the rx statistics increase
improperly.

When the md tunnel is down, it's not neccessary to increase RX drops
for the tunnel device, packets would be recieved on fallback tunnel,
and the RX drops on fallback device will be increased as expected.

Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
Cc: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 10a693a19323..ae73164559d5 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -171,7 +171,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
 	}
 
 	t = rcu_dereference(ip6n->collect_md_tun);
-	if (t)
+	if (t && t->dev->flags & IFF_UP)
 		return t;
 
 	t = rcu_dereference(ip6n->tnls_wc[0]);
-- 
cgit 


From f13ad104b4e886a03e75f130daf579ef9bf33dfc Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 12 Sep 2017 15:10:05 +0300
Subject: net: bonding: fix tlb_dynamic_lb default value

Commit 8b426dc54cf4 ("bonding: remove hardcoded value") changed the
default value for tlb_dynamic_lb which lead to either broken ALB mode
(since tlb_dynamic_lb can be changed only in TLB) or setting TLB mode
with tlb_dynamic_lb equal to 0.
The first issue was recently fixed by setting tlb_dynamic_lb to 1 always
when switching to ALB mode, but the default value is still wrong and
we'll enter TLB mode with tlb_dynamic_lb equal to 0 if the mode is
changed via netlink or sysfs. In order to restore the previous behaviour
and default value simply remove the mode check around the default param
initialization for tlb_dynamic_lb which will always set it to 1 as
before.

Fixes: 8b426dc54cf4 ("bonding: remove hardcoded value")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index fc63992ab0e0..c99dc59d729b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4289,7 +4289,7 @@ static int bond_check_params(struct bond_params *params)
 	int bond_mode	= BOND_MODE_ROUNDROBIN;
 	int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
 	int lacp_fast = 0;
-	int tlb_dynamic_lb = 0;
+	int tlb_dynamic_lb;
 
 	/* Convert string parameters. */
 	if (mode) {
@@ -4601,16 +4601,13 @@ static int bond_check_params(struct bond_params *params)
 	}
 	ad_user_port_key = valptr->value;
 
-	if ((bond_mode == BOND_MODE_TLB) || (bond_mode == BOND_MODE_ALB)) {
-		bond_opt_initstr(&newval, "default");
-		valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB),
-					&newval);
-		if (!valptr) {
-			pr_err("Error: No tlb_dynamic_lb default value");
-			return -EINVAL;
-		}
-		tlb_dynamic_lb = valptr->value;
+	bond_opt_initstr(&newval, "default");
+	valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval);
+	if (!valptr) {
+		pr_err("Error: No tlb_dynamic_lb default value");
+		return -EINVAL;
 	}
+	tlb_dynamic_lb = valptr->value;
 
 	if (lp_interval == 0) {
 		pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n",
-- 
cgit 


From 854426ef359c52bdf7087bc20c8d9105d075ca29 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 12 Sep 2017 14:31:48 +0200
Subject: w90p910_ether: include linux/interrupt.h

A randconfig build caused a compile failure:

drivers/net/ethernet/nuvoton/w90p910_ether.c: In function 'w90p910_ether_close':
drivers/net/ethernet/nuvoton/w90p910_ether.c:580:2: error: implicit declaration of function 'free_irq'; did you mean 'free_uid'? [-Werror=implicit-function-declaration]

Adding the correct include fixes the problem.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/nuvoton/w90p910_ether.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/nuvoton/w90p910_ether.c b/drivers/net/ethernet/nuvoton/w90p910_ether.c
index 89ab786da25f..4a67c55aa9f1 100644
--- a/drivers/net/ethernet/nuvoton/w90p910_ether.c
+++ b/drivers/net/ethernet/nuvoton/w90p910_ether.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/mii.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
-- 
cgit 


From 822f8565c93949fb2d31502d595c8bc45629c9b7 Mon Sep 17 00:00:00 2001
From: Suresh Reddy <suresh.reddy@broadcom.com>
Date: Wed, 13 Sep 2017 11:12:42 -0400
Subject: be2net: fix TSO6/GSO issue causing TX-stall on Lancer/BEx

IPv6 TSO requests with extension hdrs are a problem to the
Lancer and BEx chips. Workaround is to disable TSO6 feature
for such packets.

Also in Lancer chips, MSS less than 256 was resulting in TX stall.
Fix this by disabling GSO when MSS less than 256.

Signed-off-by: Suresh Reddy <suresh.reddy@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/emulex/benet/be.h      |  8 ++++++++
 drivers/net/ethernet/emulex/benet/be_main.c | 14 ++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 674cf9d13b98..8984c4938881 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -930,6 +930,14 @@ static inline bool is_ipv4_pkt(struct sk_buff *skb)
 	return skb->protocol == htons(ETH_P_IP) && ip_hdr(skb)->version == 4;
 }
 
+static inline bool is_ipv6_ext_hdr(struct sk_buff *skb)
+{
+	if (ip_hdr(skb)->version == 6)
+		return ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr);
+	else
+		return false;
+}
+
 #define be_error_recovering(adapter)	\
 		(adapter->flags & BE_FLAGS_TRY_RECOVERY)
 
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 319eee36649b..0e3d9f39a807 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -5089,6 +5089,20 @@ static netdev_features_t be_features_check(struct sk_buff *skb,
 	struct be_adapter *adapter = netdev_priv(dev);
 	u8 l4_hdr = 0;
 
+	if (skb_is_gso(skb)) {
+		/* IPv6 TSO requests with extension hdrs are a problem
+		 * to Lancer and BE3 HW. Disable TSO6 feature.
+		 */
+		if (!skyhawk_chip(adapter) && is_ipv6_ext_hdr(skb))
+			features &= ~NETIF_F_TSO6;
+
+		/* Lancer cannot handle the packet with MSS less than 256.
+		 * Disable the GSO support in such cases
+		 */
+		if (lancer_chip(adapter) && skb_shinfo(skb)->gso_size < 256)
+			features &= ~NETIF_F_GSO_MASK;
+	}
+
 	/* The code below restricts offload features for some tunneled and
 	 * Q-in-Q packets.
 	 * Offload features for normal (non tunnel) packets are unchanged.
-- 
cgit 


From 255cd50f207ae8ec7b22663246c833407744e634 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 13 Sep 2017 17:32:37 +0200
Subject: net: sched: fix use-after-free in tcf_action_destroy and
 tcf_del_walker

Recent commit d7fb60b9cafb ("net_sched: get rid of tcfa_rcu") removed
freeing in call_rcu, which changed already existing hard-to-hit
race condition into 100% hit:

[  598.599825] BUG: unable to handle kernel NULL pointer dereference at 0000000000000030
[  598.607782] IP: tcf_action_destroy+0xc0/0x140

Or:

[   40.858924] BUG: unable to handle kernel NULL pointer dereference at 0000000000000030
[   40.862840] IP: tcf_generic_walker+0x534/0x820

Fix this by storing the ops and use them directly for module_put call.

Fixes: a85a970af265 ("net_sched: move tc_action into tcf_common")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index fcd7dc7b807a..da6fa82c98a8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -180,7 +180,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 	idr_for_each_entry_ext(idr, p, id) {
 		ret = __tcf_idr_release(p, false, true);
 		if (ret == ACT_P_DELETED) {
-			module_put(p->ops->owner);
+			module_put(ops->owner);
 			n_i++;
 		} else if (ret < 0) {
 			goto nla_put_failure;
@@ -514,13 +514,15 @@ EXPORT_SYMBOL(tcf_action_exec);
 
 int tcf_action_destroy(struct list_head *actions, int bind)
 {
+	const struct tc_action_ops *ops;
 	struct tc_action *a, *tmp;
 	int ret = 0;
 
 	list_for_each_entry_safe(a, tmp, actions, list) {
+		ops = a->ops;
 		ret = __tcf_idr_release(a, bind, true);
 		if (ret == ACT_P_DELETED)
-			module_put(a->ops->owner);
+			module_put(ops->owner);
 		else if (ret < 0)
 			return ret;
 	}
-- 
cgit 


From b95a2d831b815189618d18e3e89bcfa5072351a1 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Wed, 13 Sep 2017 10:15:58 -0700
Subject: nfp: add whitelist of supported flow dissector

Previously we did not check the flow dissector against a list of allowed
and supported flow key dissectors. This patch introduces such a list and
correctly rejects unsupported flow keys.

Fixes: 43f84b72c50d ("nfp: add metadata to each flow offload")
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/offload.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index d396183108f7..a18b4d2b1d3e 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -44,6 +44,16 @@
 #include "../nfp_net.h"
 #include "../nfp_port.h"
 
+#define NFP_FLOWER_WHITELIST_DISSECTOR \
+	(BIT(FLOW_DISSECTOR_KEY_CONTROL) | \
+	 BIT(FLOW_DISSECTOR_KEY_BASIC) | \
+	 BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | \
+	 BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | \
+	 BIT(FLOW_DISSECTOR_KEY_PORTS) | \
+	 BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | \
+	 BIT(FLOW_DISSECTOR_KEY_VLAN) | \
+	 BIT(FLOW_DISSECTOR_KEY_IP))
+
 static int
 nfp_flower_xmit_flow(struct net_device *netdev,
 		     struct nfp_fl_payload *nfp_flow, u8 mtype)
@@ -112,6 +122,9 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
 	u8 key_layer;
 	int key_size;
 
+	if (flow->dissector->used_keys & ~NFP_FLOWER_WHITELIST_DISSECTOR)
+		return -EOPNOTSUPP;
+
 	if (dissector_uses_key(flow->dissector,
 			       FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
 		struct flow_dissector_key_control *mask_enc_ctl =
-- 
cgit 


From 4cbe94f2af25bf8f4d5dea56c770937d896342bf Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 13 Sep 2017 10:15:59 -0700
Subject: nfp: wait for board state before talking to the NSP

Board state informs us which low-level initialization stages the card
has completed.  We should wait for the card to be fully initialized
before trying to communicate with it, not only before we configure
passing traffic.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c     | 43 +++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 23 ------------
 2 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index f055b1774d65..424707d41fbd 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -74,6 +74,45 @@ static const struct pci_device_id nfp_pci_device_ids[] = {
 };
 MODULE_DEVICE_TABLE(pci, nfp_pci_device_ids);
 
+static bool nfp_board_ready(struct nfp_pf *pf)
+{
+	const char *cp;
+	long state;
+	int err;
+
+	cp = nfp_hwinfo_lookup(pf->hwinfo, "board.state");
+	if (!cp)
+		return false;
+
+	err = kstrtol(cp, 0, &state);
+	if (err < 0)
+		return false;
+
+	return state == 15;
+}
+
+static int nfp_pf_board_state_wait(struct nfp_pf *pf)
+{
+	const unsigned long wait_until = jiffies + 10 * HZ;
+
+	while (!nfp_board_ready(pf)) {
+		if (time_is_before_eq_jiffies(wait_until)) {
+			nfp_err(pf->cpp, "NFP board initialization timeout\n");
+			return -EINVAL;
+		}
+
+		nfp_info(pf->cpp, "waiting for board initialization\n");
+		if (msleep_interruptible(500))
+			return -ERESTARTSYS;
+
+		/* Refresh cached information */
+		kfree(pf->hwinfo);
+		pf->hwinfo = nfp_hwinfo_read(pf->cpp);
+	}
+
+	return 0;
+}
+
 static int nfp_pcie_sriov_read_nfd_limit(struct nfp_pf *pf)
 {
 	int err;
@@ -425,6 +464,10 @@ static int nfp_pci_probe(struct pci_dev *pdev,
 		 nfp_hwinfo_lookup(pf->hwinfo, "assembly.revision"),
 		 nfp_hwinfo_lookup(pf->hwinfo, "cpld.version"));
 
+	err = nfp_pf_board_state_wait(pf);
+	if (err)
+		goto err_hwinfo_free;
+
 	err = devlink_register(devlink, &pdev->dev);
 	if (err)
 		goto err_hwinfo_free;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index 5abb9ba31e7d..ff373acd28f3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -64,23 +64,6 @@
 
 #define NFP_PF_CSR_SLICE_SIZE	(32 * 1024)
 
-static int nfp_is_ready(struct nfp_pf *pf)
-{
-	const char *cp;
-	long state;
-	int err;
-
-	cp = nfp_hwinfo_lookup(pf->hwinfo, "board.state");
-	if (!cp)
-		return 0;
-
-	err = kstrtol(cp, 0, &state);
-	if (err < 0)
-		return 0;
-
-	return state == 15;
-}
-
 /**
  * nfp_net_get_mac_addr() - Get the MAC address.
  * @pf:       NFP PF handle
@@ -725,12 +708,6 @@ int nfp_net_pci_probe(struct nfp_pf *pf)
 
 	INIT_WORK(&pf->port_refresh_work, nfp_net_refresh_vnics);
 
-	/* Verify that the board has completed initialization */
-	if (!nfp_is_ready(pf)) {
-		nfp_err(pf->cpp, "NFP is not ready for NIC operation.\n");
-		return -EINVAL;
-	}
-
 	if (!pf->rtbl) {
 		nfp_err(pf->cpp, "No %s, giving up.\n",
 			pf->fw_loaded ? "symbol table" : "firmware found");
-- 
cgit 


From 7dbd5b7517376c4395a9ed0b26cf6b4db80d8415 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 13 Sep 2017 10:16:00 -0700
Subject: nfp: wait for the NSP resource to appear on boot

The control process (NSP) may take some time to complete its
initialization.  This is not a problem on most servers, but
on very fast-booting machines it may not be ready for operation
when driver probes the device.  There is also a version of the
flash in the wild where NSP tries to train the links as part
of init.  To wait for NSP initialization we should make sure
its resource has already been added to the resource table.
NSP adds itself there as last step of init.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c      |  4 ++
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h   |  2 +
 .../ethernet/netronome/nfp/nfpcore/nfp_resource.c  | 45 ++++++++++++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index 424707d41fbd..f8fa63b66739 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -351,6 +351,10 @@ static int nfp_nsp_init(struct pci_dev *pdev, struct nfp_pf *pf)
 	struct nfp_nsp *nsp;
 	int err;
 
+	err = nfp_resource_wait(pf->cpp, NFP_RESOURCE_NSP, 30);
+	if (err)
+		return err;
+
 	nsp = nfp_nsp_open(pf->cpp);
 	if (IS_ERR(nsp)) {
 		err = PTR_ERR(nsp);
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h
index 1a8d04a1e113..3ce51f03126f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h
@@ -97,6 +97,8 @@ nfp_resource_acquire(struct nfp_cpp *cpp, const char *name);
 
 void nfp_resource_release(struct nfp_resource *res);
 
+int nfp_resource_wait(struct nfp_cpp *cpp, const char *name, unsigned int secs);
+
 u32 nfp_resource_cpp_id(struct nfp_resource *res);
 
 const char *nfp_resource_name(struct nfp_resource *res);
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c
index 072612263dab..b1dd13ff282b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c
@@ -249,6 +249,51 @@ void nfp_resource_release(struct nfp_resource *res)
 	kfree(res);
 }
 
+/**
+ * nfp_resource_wait() - Wait for resource to appear
+ * @cpp:	NFP CPP handle
+ * @name:	Name of the resource
+ * @secs:	Number of seconds to wait
+ *
+ * Wait for resource to appear in the resource table, grab and release
+ * its lock.  The wait is jiffies-based, don't expect fine granularity.
+ *
+ * Return: 0 on success, errno otherwise.
+ */
+int nfp_resource_wait(struct nfp_cpp *cpp, const char *name, unsigned int secs)
+{
+	unsigned long warn_at = jiffies + NFP_MUTEX_WAIT_FIRST_WARN * HZ;
+	unsigned long err_at = jiffies + secs * HZ;
+	struct nfp_resource *res;
+
+	while (true) {
+		res = nfp_resource_acquire(cpp, name);
+		if (!IS_ERR(res)) {
+			nfp_resource_release(res);
+			return 0;
+		}
+
+		if (PTR_ERR(res) != -ENOENT) {
+			nfp_err(cpp, "error waiting for resource %s: %ld\n",
+				name, PTR_ERR(res));
+			return PTR_ERR(res);
+		}
+		if (time_is_before_eq_jiffies(err_at)) {
+			nfp_err(cpp, "timeout waiting for resource %s\n", name);
+			return -ETIMEDOUT;
+		}
+		if (time_is_before_eq_jiffies(warn_at)) {
+			warn_at = jiffies + NFP_MUTEX_WAIT_NEXT_WARN * HZ;
+			nfp_info(cpp, "waiting for NFP resource %s\n", name);
+		}
+		if (msleep_interruptible(10)) {
+			nfp_err(cpp, "wait for resource %s interrupted\n",
+				name);
+			return -ERESTARTSYS;
+		}
+	}
+}
+
 /**
  * nfp_resource_cpp_id() - Return the cpp_id of a resource handle
  * @res:	NFP Resource handle
-- 
cgit 


From ca558e185972d8ecd308760abf972f5d408bcff0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 13 Sep 2017 11:16:45 -0700
Subject: net_sched: gen_estimator: fix scaling error in bytes/packets samples

Denys reported wrong rate estimations with HTB classes.

It appears the bug was added in linux-4.10, since my tests
where using intervals of one second only.

HTB using 4 sec default rate estimators, reported rates
were 4x higher.

We need to properly scale the bytes/packets samples before
integrating them in EWMA.

Tested:
 echo 1 >/sys/module/sch_htb/parameters/htb_rate_est

 Setup HTB with one class with a rate/cail of 5Gbit

 Generate traffic on this class

 tc -s -d cl sh dev eth0 classid 7002:11
class htb 7002:11 parent 7002:1 prio 5 quantum 200000 rate 5Gbit ceil
5Gbit linklayer ethernet burst 80000b/1 mpu 0b cburst 80000b/1 mpu 0b
level 0 rate_handle 1
 Sent 1488215421648 bytes 982969243 pkt (dropped 0, overlimits 0
requeues 0)
 rate 5Gbit 412814pps backlog 136260b 2p requeues 0
 TCP pkts/rtx 982969327/45 bytes 1488215557414/68130
 lended: 22732826 borrowed: 0 giants: 0
 tokens: -1684 ctokens: -1684

Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Denys Fedoryshchenko <nuclearcat@nuclearcat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/gen_estimator.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 0385dece1f6f..7c1ffd6f9501 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -83,10 +83,10 @@ static void est_timer(unsigned long arg)
 	u64 rate, brate;
 
 	est_fetch_counters(est, &b);
-	brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+	brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log);
 	brate -= (est->avbps >> est->ewma_log);
 
-	rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+	rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log);
 	rate -= (est->avpps >> est->ewma_log);
 
 	write_seqcount_begin(&est->seq);
-- 
cgit 


From 6fa9c623a03c560178e0dcec23f59dbfd29b21b9 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Wed, 13 Sep 2017 22:28:53 +0300
Subject: MAINTAINERS: review Renesas DT bindings as well

When adding myself  as a reviewer for  the Renesas  Ethernet drivers
I somehow forgot about the bindings -- I want to review them as well.

Fixes: 8e6569af3a1b ("MAINTAINERS: add myself as Renesas Ethernet drivers reviewer")
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 7f32b510fdea..c2985b7f188e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11379,6 +11379,8 @@ RENESAS ETHERNET DRIVERS
 R:	Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
 L:	netdev@vger.kernel.org
 L:	linux-renesas-soc@vger.kernel.org
+F:	Documentation/devicetree/bindings/net/renesas,*.txt
+F:	Documentation/devicetree/bindings/net/sh_eth.txt
 F:	drivers/net/ethernet/renesas/
 F:	include/linux/sh_eth.h
 
-- 
cgit 


From fa5f7b51fc3080c2b195fa87c7eca7c05e56f673 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 14 Sep 2017 02:00:54 +0300
Subject: sctp: potential read out of bounds in sctp_ulpevent_type_enabled()

This code causes a static checker warning because Smatch doesn't trust
anything that comes from skb->data.  I've reviewed this code and I do
think skb->data can be controlled by the user here.

The sctp_event_subscribe struct has 13 __u8 fields and we want to see
if ours is non-zero.  sn_type can be any value in the 0-USHRT_MAX range.
We're subtracting SCTP_SN_TYPE_BASE which is 1 << 15 so we could read
either before the start of the struct or after the end.

This is a very old bug and it's surprising that it would go undetected
for so long but my theory is that it just doesn't have a big impact so
it would be hard to notice.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/ulpevent.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 1060494ac230..b8c86ec1a8f5 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -153,8 +153,12 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event);
 static inline int sctp_ulpevent_type_enabled(__u16 sn_type,
 					     struct sctp_event_subscribe *mask)
 {
+	int offset = sn_type - SCTP_SN_TYPE_BASE;
 	char *amask = (char *) mask;
-	return amask[sn_type - SCTP_SN_TYPE_BASE];
+
+	if (offset >= sizeof(struct sctp_event_subscribe))
+		return 0;
+	return amask[offset];
 }
 
 /* Given an event subscription, is this event enabled? */
-- 
cgit 


From a5135676bbf18ab4caed9effd321bd126f9ee11f Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Thu, 14 Sep 2017 13:22:25 +0200
Subject: tls: make tls_sw_free_resources static

Make the needlessly global function tls_sw_free_resources static to fix
a gcc/sparse warning.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index fa596fa71ba7..7d80040a37b6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -639,7 +639,7 @@ sendpage_end:
 	return ret;
 }
 
-void tls_sw_free_resources(struct sock *sk)
+static void tls_sw_free_resources(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
-- 
cgit 


From 23f4822207e04c5f78924fe0e5193c14ba720b4c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 14 Sep 2017 17:01:25 +0100
Subject: tg3: clean up redundant initialization of tnapi

tnapi is being initialized and then immediately updated and
hence the initialiation is redundant.  Clean up the warning
by moving the declaration and initialization to the inside
of the for-loop.

Cleans up clang scan-build warning:
warning: Value stored to 'tnapi' during its initialization is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/tg3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index af33dc15c55f..656e6af70f0a 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -11536,11 +11536,11 @@ static int tg3_start(struct tg3 *tp, bool reset_phy, bool test_irq,
 	tg3_napi_enable(tp);
 
 	for (i = 0; i < tp->irq_cnt; i++) {
-		struct tg3_napi *tnapi = &tp->napi[i];
 		err = tg3_request_irq(tp, i);
 		if (err) {
 			for (i--; i >= 0; i--) {
-				tnapi = &tp->napi[i];
+				struct tg3_napi *tnapi = &tp->napi[i];
+
 				free_irq(tnapi->irq_vec, tnapi);
 			}
 			goto out_napi_fini;
-- 
cgit 


From 4739df6211911e6597602222b640e5c002563df6 Mon Sep 17 00:00:00 2001
From: Himanshu Jha <himanshujha199640@gmail.com>
Date: Tue, 12 Sep 2017 16:49:22 +0530
Subject: qed: remove unnecessary call to memset

call to memset to assign 0 value immediately after allocating
memory with kzalloc is unnecesaary as kzalloc allocates the memory
filled with 0 value.

Semantic patch used to resolve this issue:

@@
expression e,e2; constant c;
statement S;
@@

  e = kzalloc(e2, c);
  if(e == NULL) S
- memset(e, 0, e2);

Signed-off-by: Himanshu Jha <himanshujha199640@gmail.com>
Signed-off-by: Himanshu Jha <himanshujha199640@gmail.com>
Acked-by: Sudarsana Kalluru <sudarsana.kalluru@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index eaca4578435d..8f6ccc0c39e5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -1244,7 +1244,6 @@ int qed_dcbx_get_config_params(struct qed_hwfn *p_hwfn,
 	if (!dcbx_info)
 		return -ENOMEM;
 
-	memset(dcbx_info, 0, sizeof(*dcbx_info));
 	rc = qed_dcbx_query_params(p_hwfn, dcbx_info, QED_DCBX_OPERATIONAL_MIB);
 	if (rc) {
 		kfree(dcbx_info);
-- 
cgit 


From ecf091171b70787f92b18eeaa4ddc74f9221fa56 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 12 Sep 2017 22:10:53 +0200
Subject: net: vrf: avoid gcc-4.6 warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When building an allmodconfig kernel with gcc-4.6, we get a rather
odd warning:

drivers/net/vrf.c: In function ‘vrf_ip6_input_dst’:
drivers/net/vrf.c:964:3: error: initialized field with side-effects overwritten [-Werror]
drivers/net/vrf.c:964:3: error: (near initialization for ‘fl6’) [-Werror]

I have no idea what this warning is even trying to say, but it does
seem like a false positive. Reordering the initialization in to match
the structure definition gets rid of the warning, and might also avoid
whatever gcc thinks is wrong here.

Fixes: 9ff74384600a ("net: vrf: Handle ipv6 multicast and link-local addresses")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 7e19051f3230..9b243e6f3008 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -957,12 +957,12 @@ static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev,
 {
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct flowi6 fl6 = {
+		.flowi6_iif     = ifindex,
+		.flowi6_mark    = skb->mark,
+		.flowi6_proto   = iph->nexthdr,
 		.daddr          = iph->daddr,
 		.saddr          = iph->saddr,
 		.flowlabel      = ip6_flowinfo(iph),
-		.flowi6_mark    = skb->mark,
-		.flowi6_proto   = iph->nexthdr,
-		.flowi6_iif     = ifindex,
 	};
 	struct net *net = dev_net(vrf_dev);
 	struct rt6_info *rt6;
-- 
cgit 


From 7095c973453e56efa0903e863b59cd89c75e62dc Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 12 Sep 2017 13:14:26 -0700
Subject: net: systemport: Fix 64-bit stats deadlock

We can enter a deadlock situation because there is no sufficient protection
when ndo_get_stats64() runs in process context to guard against RX or TX NAPI
contexts running in softirq, this can lead to the following lockdep splat and
actual deadlock was experienced as well with an iperf session in the background
and a while loop doing ifconfig + ethtool.

[    5.780350] ================================
[    5.784679] WARNING: inconsistent lock state
[    5.789011] 4.13.0-rc7-02179-g32fae27c725d #70 Not tainted
[    5.794561] --------------------------------
[    5.798890] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[    5.804971] swapper/0/0 [HC0[0]:SC1[1]:HE0:SE0] takes:
[    5.810175]  (&syncp->seq#2){+.?...}, at: [<c0768a28>] bcm_sysport_tx_reclaim+0x30/0x54
[    5.818327] {SOFTIRQ-ON-W} state was registered at:
[    5.823278]   bcm_sysport_get_stats64+0x17c/0x258
[    5.828053]   dev_get_stats+0x38/0xac
[    5.831776]   rtnl_fill_stats+0x30/0x118
[    5.835761]   rtnl_fill_ifinfo+0x538/0xe24
[    5.839921]   rtmsg_ifinfo_build_skb+0x6c/0xd8
[    5.844430]   rtmsg_ifinfo_event.part.5+0x14/0x44
[    5.849201]   rtmsg_ifinfo+0x20/0x28
[    5.852837]   register_netdevice+0x628/0x6b8
[    5.857171]   register_netdev+0x14/0x24
[    5.861051]   bcm_sysport_probe+0x30c/0x438
[    5.865280]   platform_drv_probe+0x50/0xb0
[    5.869418]   driver_probe_device+0x2e8/0x450
[    5.873817]   __driver_attach+0x104/0x120
[    5.877871]   bus_for_each_dev+0x7c/0xc0
[    5.881834]   bus_add_driver+0x1b0/0x270
[    5.885797]   driver_register+0x78/0xf4
[    5.889675]   do_one_initcall+0x54/0x190
[    5.893646]   kernel_init_freeable+0x144/0x1d0
[    5.898135]   kernel_init+0x8/0x110
[    5.901665]   ret_from_fork+0x14/0x2c
[    5.905363] irq event stamp: 24263
[    5.908804] hardirqs last  enabled at (24262): [<c08eecf0>] net_rx_action+0xc4/0x4e4
[    5.916624] hardirqs last disabled at (24263): [<c0a7da00>] _raw_spin_lock_irqsave+0x1c/0x98
[    5.925143] softirqs last  enabled at (24258): [<c022a7fc>] irq_enter+0x84/0x98
[    5.932524] softirqs last disabled at (24259): [<c022a918>] irq_exit+0x108/0x16c
[    5.939985]
[    5.939985] other info that might help us debug this:
[    5.946576]  Possible unsafe locking scenario:
[    5.946576]
[    5.952556]        CPU0
[    5.955031]        ----
[    5.957506]   lock(&syncp->seq#2);
[    5.960955]   <Interrupt>
[    5.963604]     lock(&syncp->seq#2);
[    5.967227]
[    5.967227]  *** DEADLOCK ***
[    5.967227]
[    5.973222] 1 lock held by swapper/0/0:
[    5.977092]  #0:  (&(&ring->lock)->rlock){..-...}, at: [<c0768a18>] bcm_sysport_tx_reclaim+0x20/0x54

So just remove the u64_stats_update_begin()/end() pair in ndo_get_stats64()
since it does not appear to be useful for anything. No inconsistency was
observed with either ifconfig or ethtool, global TX counts equal the sum of
per-queue TX counts on a 32-bit architecture.

Fixes: 10377ba7673d ("net: systemport: Support 64bit statistics")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index a6572b51435a..c3c53f6cd9e6 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1735,11 +1735,8 @@ static void bcm_sysport_get_stats64(struct net_device *dev,
 		stats->tx_packets += tx_packets;
 	}
 
-	/* lockless update tx_bytes and tx_packets */
-	u64_stats_update_begin(&priv->syncp);
 	stats64->tx_bytes = stats->tx_bytes;
 	stats64->tx_packets = stats->tx_packets;
-	u64_stats_update_end(&priv->syncp);
 
 	do {
 		start = u64_stats_fetch_begin_irq(&priv->syncp);
-- 
cgit 


From 2aa70f864955bf02362e3fb3008e4208d7a17a98 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 13 Sep 2017 19:42:05 +0200
Subject: net: smsc911x: Quieten netif during suspend

If the network interface is kept running during suspend, the net core
may call net_device_ops.ndo_start_xmit() while the Ethernet device is
still suspended, which may lead to a system crash.

E.g. on sh73a0/kzm9g and r8a73a4/ape6evm, the external Ethernet chip is
driven by a PM controlled clock.  If the Ethernet registers are accessed
while the clock is not running, the system will crash with an imprecise
external abort.

As this is a race condition with a small time window, it is not so easy
to trigger at will.  Using pm_test may increase your chances:

    # echo 0 > /sys/module/printk/parameters/console_suspend
    # echo platform > /sys/power/pm_test
    # echo mem > /sys/power/state

To fix this, make sure the network interface is quietened during
suspend.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/smsc/smsc911x.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index 0b6a39b003a4..012fb66eed8d 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -2595,6 +2595,11 @@ static int smsc911x_suspend(struct device *dev)
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct smsc911x_data *pdata = netdev_priv(ndev);
 
+	if (netif_running(ndev)) {
+		netif_stop_queue(ndev);
+		netif_device_detach(ndev);
+	}
+
 	/* enable wake on LAN, energy detection and the external PME
 	 * signal. */
 	smsc911x_reg_write(pdata, PMT_CTRL,
@@ -2628,7 +2633,15 @@ static int smsc911x_resume(struct device *dev)
 	while (!(smsc911x_reg_read(pdata, PMT_CTRL) & PMT_CTRL_READY_) && --to)
 		udelay(1000);
 
-	return (to == 0) ? -EIO : 0;
+	if (to == 0)
+		return -EIO;
+
+	if (netif_running(ndev)) {
+		netif_device_attach(ndev);
+		netif_start_queue(ndev);
+	}
+
+	return 0;
 }
 
 static const struct dev_pm_ops smsc911x_pm_ops = {
-- 
cgit 


From cbea8f02069533ea2ad4e5b3bfbcdb0894c20354 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 13 Sep 2017 17:11:37 -0700
Subject: net: ipv4: fix l3slave check for index returned in IP_PKTINFO

rt_iif is only set to the actual egress device for the output path. The
recent change to consider the l3slave flag when returning IP_PKTINFO
works for local traffic (the correct device index is returned), but it
broke the more typical use case of packets received from a remote host
always returning the VRF index rather than the original ingress device.
Update the fixup to consider l3slave and rt_iif actually getting set.

Fixes: 1dfa76390bf05 ("net: ipv4: add check for l3slave for index returned in IP_PKTINFO")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index e558e4f9597b..a599aa83fdad 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1207,7 +1207,6 @@ e_inval:
 void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
 {
 	struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
-	bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
 	bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
 		       ipv6_sk_rxinfo(sk);
 
@@ -1221,8 +1220,13 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
 		 * (e.g., process binds socket to eth0 for Tx which is
 		 * redirected to loopback in the rtable/dst).
 		 */
-		if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX || l3slave)
+		struct rtable *rt = skb_rtable(skb);
+		bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
+
+		if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
 			pktinfo->ipi_ifindex = inet_iif(skb);
+		else if (l3slave && rt && rt->rt_iif)
+			pktinfo->ipi_ifindex = rt->rt_iif;
 
 		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
 	} else {
-- 
cgit 


From 8c72c65b426b47b3c166a8fef0d8927fe5e8a28d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@googl.com>
Date: Wed, 13 Sep 2017 20:30:39 -0700
Subject: tcp: update skb->skb_mstamp more carefully

liujian reported a problem in TCP_USER_TIMEOUT processing with a patch
in tcp_probe_timer() :
      https://www.spinics.net/lists/netdev/msg454496.html

After investigations, the root cause of the problem is that we update
skb->skb_mstamp of skbs in write queue, even if the attempt to send a
clone or copy of it failed. One reason being a routing problem.

This patch prevents this, solving liujian issue.

It also removes a potential RTT miscalculation, since
__tcp_retransmit_skb() is not OR-ing TCP_SKB_CB(skb)->sacked with
TCPCB_EVER_RETRANS if a failure happens, but skb->skb_mstamp has
been changed.

A future ACK would then lead to a very small RTT sample and min_rtt
would then be lowered to this too small value.

Tested:

# cat user_timeout.pkt
--local_ip=192.168.102.64

    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0

   +0 `ifconfig tun0 192.168.102.64/16; ip ro add 192.0.2.1 dev tun0`

   +0 < S 0:0(0) win 0 <mss 1460>
   +0 > S. 0:0(0) ack 1 <mss 1460>

  +.1 < . 1:1(0) ack 1 win 65530
   +0 accept(3, ..., ...) = 4

   +0 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0
   +0 write(4, ..., 24) = 24
   +0 > P. 1:25(24) ack 1 win 29200
   +.1 < . 1:1(0) ack 25 win 65530

//change the ipaddress
   +1 `ifconfig tun0 192.168.0.10/16`

   +1 write(4, ..., 24) = 24
   +1 write(4, ..., 24) = 24
   +1 write(4, ..., 24) = 24
   +1 write(4, ..., 24) = 24

   +0 `ifconfig tun0 192.168.102.64/16`
   +0 < . 1:2(1) ack 25 win 65530
   +0 `ifconfig tun0 192.168.0.10/16`

   +3 write(4, ..., 24) = -1

# ./packetdrill user_timeout.pkt

Signed-off-by: Eric Dumazet <edumazet@googl.com>
Reported-by: liujian <liujian56@huawei.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5b6690d05abb..a85a8c2948e5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -991,6 +991,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	struct tcp_skb_cb *tcb;
 	struct tcp_out_options opts;
 	unsigned int tcp_options_size, tcp_header_size;
+	struct sk_buff *oskb = NULL;
 	struct tcp_md5sig_key *md5;
 	struct tcphdr *th;
 	int err;
@@ -998,12 +999,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
 	tp = tcp_sk(sk);
 
-	skb->skb_mstamp = tp->tcp_mstamp;
 	if (clone_it) {
 		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
 			- tp->snd_una;
 		tcp_rate_skb_sent(sk, skb);
 
+		oskb = skb;
 		if (unlikely(skb_cloned(skb)))
 			skb = pskb_copy(skb, gfp_mask);
 		else
@@ -1011,6 +1012,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
+	skb->skb_mstamp = tp->tcp_mstamp;
 
 	inet = inet_sk(sk);
 	tcb = TCP_SKB_CB(skb);
@@ -1122,12 +1124,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
 
-	if (likely(err <= 0))
-		return err;
-
-	tcp_enter_cwr(sk);
+	if (unlikely(err > 0)) {
+		tcp_enter_cwr(sk);
+		err = net_xmit_eval(err);
+	}
+	if (!err && oskb)
+		oskb->skb_mstamp = tp->tcp_mstamp;
 
-	return net_xmit_eval(err);
+	return err;
 }
 
 /* This routine just queues the buffer for sending.
@@ -2869,10 +2873,11 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		     skb_headroom(skb) >= 0xFFFF)) {
 		struct sk_buff *nskb;
 
-		skb->skb_mstamp = tp->tcp_mstamp;
 		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
 		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
 			     -ENOBUFS;
+		if (!err)
+			skb->skb_mstamp = tp->tcp_mstamp;
 	} else {
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
-- 
cgit 


From 5023a6db73196695f4cc2db1a0eb37957ca27772 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Thu, 14 Sep 2017 09:31:07 -0700
Subject: netvsc: increase default receive buffer size

The default receive buffer size was reduced by recent change
to a value which was appropriate for 10G and Windows Server 2016.
But the value is too small for full performance with 40G on Azure.
Increase the default back to maximum supported by host.

Fixes: 8b5327975ae1 ("netvsc: allow controlling send/recv buffer size")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index c538a4f15f3b..d4902ee5f260 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -49,7 +49,7 @@
 #define NETVSC_MIN_TX_SECTIONS	10
 #define NETVSC_DEFAULT_TX	192	/* ~1M */
 #define NETVSC_MIN_RX_SECTIONS	10	/* ~64K */
-#define NETVSC_DEFAULT_RX	2048	/* ~4M */
+#define NETVSC_DEFAULT_RX	10485   /* Max ~16M */
 
 #define LINKCHANGE_INT (2 * HZ)
 #define VF_TAKEOVER_INT (HZ / 10)
-- 
cgit 


From d25adbeb0cdb860fb39e09cdd025e9cfc954c5ab Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Sep 2017 11:02:21 +0800
Subject: sctp: fix an use-after-free issue in sctp_sock_dump

Commit 86fdb3448cc1 ("sctp: ensure ep is not destroyed before doing the
dump") tried to fix an use-after-free issue by checking !sctp_sk(sk)->ep
with holding sock and sock lock.

But Paolo noticed that endpoint could be destroyed in sctp_rcv without
sock lock protection. It means the use-after-free issue still could be
triggered when sctp_rcv put and destroy ep after sctp_sock_dump checks
!ep, although it's pretty hard to reproduce.

I could reproduce it by mdelay in sctp_rcv while msleep in sctp_close
and sctp_sock_dump long time.

This patch is to add another param cb_done to sctp_for_each_transport
and dump ep->assocs with holding tsp after jumping out of transport's
traversal in it to avoid this issue.

It can also improve sctp diag dump to make it run faster, as no need
to save sk into cb->args[5] and keep calling sctp_for_each_transport
any more.

This patch is also to use int * instead of int for the pos argument
in sctp_for_each_transport, which could make postion increment only
in sctp_for_each_transport and no need to keep changing cb->args[2]
in sctp_sock_filter and sctp_sock_dump any more.

Fixes: 86fdb3448cc1 ("sctp: ensure ep is not destroyed before doing the dump")
Reported-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h |  3 ++-
 net/sctp/sctp_diag.c    | 32 +++++++++-----------------------
 net/sctp/socket.c       | 40 +++++++++++++++++++++++++---------------
 3 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 06b4f515e157..d7d8cba01469 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -127,7 +127,8 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
 				  const union sctp_addr *laddr,
 				  const union sctp_addr *paddr, void *p);
 int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
-			    struct net *net, int pos, void *p);
+			    int (*cb_done)(struct sctp_transport *, void *),
+			    struct net *net, int *pos, void *p);
 int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p);
 int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
 		       struct sctp_info *info);
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index e99518e79b52..7008a992749b 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -279,9 +279,11 @@ out:
 	return err;
 }
 
-static int sctp_sock_dump(struct sock *sk, void *p)
+static int sctp_sock_dump(struct sctp_transport *tsp, void *p)
 {
+	struct sctp_endpoint *ep = tsp->asoc->ep;
 	struct sctp_comm_param *commp = p;
+	struct sock *sk = ep->base.sk;
 	struct sk_buff *skb = commp->skb;
 	struct netlink_callback *cb = commp->cb;
 	const struct inet_diag_req_v2 *r = commp->r;
@@ -289,9 +291,7 @@ static int sctp_sock_dump(struct sock *sk, void *p)
 	int err = 0;
 
 	lock_sock(sk);
-	if (!sctp_sk(sk)->ep)
-		goto release;
-	list_for_each_entry(assoc, &sctp_sk(sk)->ep->asocs, asocs) {
+	list_for_each_entry(assoc, &ep->asocs, asocs) {
 		if (cb->args[4] < cb->args[1])
 			goto next;
 
@@ -327,40 +327,30 @@ next:
 		cb->args[4]++;
 	}
 	cb->args[1] = 0;
-	cb->args[2]++;
 	cb->args[3] = 0;
 	cb->args[4] = 0;
 release:
 	release_sock(sk);
-	sock_put(sk);
 	return err;
 }
 
-static int sctp_get_sock(struct sctp_transport *tsp, void *p)
+static int sctp_sock_filter(struct sctp_transport *tsp, void *p)
 {
 	struct sctp_endpoint *ep = tsp->asoc->ep;
 	struct sctp_comm_param *commp = p;
 	struct sock *sk = ep->base.sk;
-	struct netlink_callback *cb = commp->cb;
 	const struct inet_diag_req_v2 *r = commp->r;
 	struct sctp_association *assoc =
 		list_entry(ep->asocs.next, struct sctp_association, asocs);
 
 	/* find the ep only once through the transports by this condition */
 	if (tsp->asoc != assoc)
-		goto out;
+		return 0;
 
 	if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
-		goto out;
-
-	sock_hold(sk);
-	cb->args[5] = (long)sk;
+		return 0;
 
 	return 1;
-
-out:
-	cb->args[2]++;
-	return 0;
 }
 
 static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
@@ -503,12 +493,8 @@ skip:
 	if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE)))
 		goto done;
 
-next:
-	cb->args[5] = 0;
-	sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp);
-
-	if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp))
-		goto next;
+	sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump,
+				net, (int *)&cb->args[2], &commp);
 
 done:
 	cb->args[1] = cb->args[4];
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 1b00a1e09b93..d4730ada7f32 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4658,29 +4658,39 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
 EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
 
 int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
-			    struct net *net, int pos, void *p) {
+			    int (*cb_done)(struct sctp_transport *, void *),
+			    struct net *net, int *pos, void *p) {
 	struct rhashtable_iter hti;
-	void *obj;
-	int err;
-
-	err = sctp_transport_walk_start(&hti);
-	if (err)
-		return err;
+	struct sctp_transport *tsp;
+	int ret;
 
-	obj = sctp_transport_get_idx(net, &hti, pos + 1);
-	for (; !IS_ERR_OR_NULL(obj); obj = sctp_transport_get_next(net, &hti)) {
-		struct sctp_transport *transport = obj;
+again:
+	ret = sctp_transport_walk_start(&hti);
+	if (ret)
+		return ret;
 
-		if (!sctp_transport_hold(transport))
+	tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
+	for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
+		if (!sctp_transport_hold(tsp))
 			continue;
-		err = cb(transport, p);
-		sctp_transport_put(transport);
-		if (err)
+		ret = cb(tsp, p);
+		if (ret)
 			break;
+		(*pos)++;
+		sctp_transport_put(tsp);
 	}
 	sctp_transport_walk_stop(&hti);
 
-	return err;
+	if (ret) {
+		if (cb_done && !cb_done(tsp, p)) {
+			(*pos)++;
+			sctp_transport_put(tsp);
+			goto again;
+		}
+		sctp_transport_put(tsp);
+	}
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(sctp_for_each_transport);
 
-- 
cgit 


From 8c7c19a55e41ae69d1cd18ab56e6e9b66a679a7c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Sep 2017 11:02:48 +0800
Subject: sctp: do not mark sk dumped when inet_sctp_diag_fill returns err

sctp_diag would not actually dump out sk/asoc if inet_sctp_diag_fill
returns err, in which case it shouldn't mark sk dumped by setting
cb->args[3] as 1 in sctp_sock_dump().

Otherwise, it could cause some asocs to have no parent's sk dumped
in 'ss --sctp'.

So this patch is to not set cb->args[3] when inet_sctp_diag_fill()
returns err in sctp_sock_dump().

Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sctp_diag.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index 7008a992749b..22ed01a76b19 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -309,7 +309,6 @@ static int sctp_sock_dump(struct sctp_transport *tsp, void *p)
 					cb->nlh->nlmsg_seq,
 					NLM_F_MULTI, cb->nlh,
 					commp->net_admin) < 0) {
-			cb->args[3] = 1;
 			err = 1;
 			goto release;
 		}
-- 
cgit 


From e67b8a685c7c984e834e3181ef4619cd7025a136 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 15 Sep 2017 14:37:38 +0100
Subject: bpf/verifier: reject BPF_ALU64|BPF_END

Neither ___bpf_prog_run nor the JITs accept it.
Also adds a new test case.

Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)")
Signed-off-by: Edward Cree <ecree@solarflare.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c                       |  3 ++-
 tools/testing/selftests/bpf/test_verifier.c | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 477b6932c3c1..799b2451ef2d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2292,7 +2292,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			}
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
-			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
+			    BPF_CLASS(insn->code) == BPF_ALU64) {
 				verbose("BPF_END uses reserved fields\n");
 				return -EINVAL;
 			}
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 8eb09950258b..26f3250bdcd2 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -6629,6 +6629,22 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 		.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 	},
+	{
+		"invalid 64-bit BPF_END",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			{
+				.code  = BPF_ALU64 | BPF_END | BPF_TO_LE,
+				.dst_reg = BPF_REG_0,
+				.src_reg = 0,
+				.off   = 0,
+				.imm   = 32,
+			},
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "BPF_END uses reserved fields",
+		.result = REJECT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit 


From fc22579917eb7e13433448a342f1cb1592920940 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 15 Sep 2017 16:47:42 -0700
Subject: tcp: fix data delivery rate

Now skb->mstamp_skb is updated later, we also need to call
tcp_rate_skb_sent() after the update is done.

Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a85a8c2948e5..1c839c99114c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1002,8 +1002,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (clone_it) {
 		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
 			- tp->snd_una;
-		tcp_rate_skb_sent(sk, skb);
-
 		oskb = skb;
 		if (unlikely(skb_cloned(skb)))
 			skb = pskb_copy(skb, gfp_mask);
@@ -1128,9 +1126,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		tcp_enter_cwr(sk);
 		err = net_xmit_eval(err);
 	}
-	if (!err && oskb)
+	if (!err && oskb) {
 		oskb->skb_mstamp = tp->tcp_mstamp;
-
+		tcp_rate_skb_sent(sk, oskb);
+	}
 	return err;
 }
 
-- 
cgit 


From 2130c0281608a109653272902e4d00b45bf00571 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 16 Sep 2017 16:28:02 +0200
Subject: Documentation: link in networking docs

Fix link in filter.txt.

Acked-by: Pavel Machek <pavel@ucw.cz>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/filter.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index e5e33bac2068..789b74dbe1d9 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -45,7 +45,7 @@ in many more places. There's xt_bpf for netfilter, cls_bpf in the kernel
 qdisc layer, SECCOMP-BPF (SECure COMPuting [1]), and lots of other places
 such as team driver, PTP code, etc where BPF is being used.
 
- [1] Documentation/prctl/seccomp_filter.txt
+ [1] Documentation/userspace-api/seccomp_filter.rst
 
 Original BPF paper:
 
-- 
cgit 


From 8e29f97979c300406c21994986bdfcdb67fe4ff7 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 15 Sep 2017 15:31:07 +0200
Subject: mlxsw: spectrum_router: Only handle IPv4 and IPv6 events

The driver doesn't support events from address families other than IPv4
and IPv6, so ignore them. Otherwise, we risk queueing a work item before
it's initialized.

This can happen in case a VRF is configured when MROUTE_MULTIPLE_TABLES
is enabled, as the VRF driver will try to add an l3mdev rule for the
IPMR family.

Fixes: 65e65ec137f4 ("mlxsw: spectrum_router: Don't ignore IPv6 notifications")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: Andreas Rammhold <andreas@rammhold.de>
Reported-by: Florian Klink <flokli@flokli.de>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index f0fb898533fb..2cfb3f5d092d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -4868,7 +4868,8 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	struct fib_notifier_info *info = ptr;
 	struct mlxsw_sp_router *router;
 
-	if (!net_eq(info->net, &init_net))
+	if (!net_eq(info->net, &init_net) ||
+	    (info->family != AF_INET && info->family != AF_INET6))
 		return NOTIFY_DONE;
 
 	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
-- 
cgit