summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2019-07-01 19:34:46 -0700
committerDavid S. Miller <davem@davemloft.net>2019-07-01 19:34:46 -0700
commit2a8d8e0feca29f27570732807c6353151309e97c (patch)
tree30d85bf915bacf13f14ea2d3c40bbc68815d799d /drivers
parent8909783cb5b719e05c17e62529efb97c564b2e26 (diff)
parent509e56b37cc32c9b5fc2be585c25d1e60d6a1d73 (diff)
Merge branch 'blackhole-device-to-invalidate-dst'
Mahesh Bandewar says: ==================== blackhole device to invalidate dst When we invalidate dst or mark it "dead", we assign 'lo' to dst->dev. First of all this assignment is racy and more over, it has MTU implications. The standard dev MTU is 1500 while the Loopback MTU is 64k. TCP code when dereferencing the dst don't check if the dst is valid or not. TCP when dereferencing a dead-dst while negotiating a new connection, may use dst device which is 'lo' instead of using the correct device. Consider the following scenario: A SYN arrives on an interface and tcp-layer while processing SYNACK finds a dst and associates it with SYNACK skb. Now before skb gets passed to L3 for processing, if that dst gets "dead" (because of the virtual device getting disappeared & then reappeared), the 'lo' gets assigned to that dst (lo MTU = 64k). Let's assume the SYN has ADV_MSS set as 9k while the output device through which this SYNACK is going to go out has standard MTU of 1500. The MTU check during the route check passes since MIN(9K, 64K) is 9k and TCP successfully negotiates 9k MSS. The subsequent data packet; bigger in size gets passed to the device and it won't be marked as GSO since the assumed MTU of the device is 9k. This either crashes the NIC and we have seen fixes that went into drivers to handle this scenario. 8914a595110a ('bnx2x: disable GSO where gso_size is too big for hardware') and 2b16f048729b ('net: create skb_gso_validate_mac_len()') and with those fixes TCP eventually recovers but not before few dropped segments. Well, I'm not a TCP expert and though we have experienced these corner cases in our environment, I could not reproduce this case reliably in my test setup to try this fix myself. However, Michael Chan <michael.chan@broadcom.com> had a setup where these fixes helped him mitigate the issue and not cause the crash. The idea here is to not alter the data-path with additional locks or smb()/rmb() barriers to avoid racy assignments but to create a new device that has really low MTU that has .ndo_start_xmit essentially a kfree_skb(). Make use of this device instead of 'lo' when marking the dst dead. First patch implements the blackhole device and second patch uses it in IPv4 and IPv6 stack while the third patch is the self test that ensures the sanity of this device. v1->v2 fixed the self-test patch to handle the conflict v2 -> v3 fixed Kconfig text/string. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/net/loopback.c76
1 files changed, 67 insertions, 9 deletions
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 87d361666cdd..3b39def5471e 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -55,6 +55,13 @@
#include <net/net_namespace.h>
#include <linux/u64_stats_sync.h>
+/* blackhole_netdev - a device used for dsts that are marked expired!
+ * This is global device (instead of per-net-ns) since it's not needed
+ * to be per-ns and gets initialized at boot time.
+ */
+struct net_device *blackhole_netdev;
+EXPORT_SYMBOL(blackhole_netdev);
+
/* The higher levels take care of making this non-reentrant (it's
* called with bh's disabled).
*/
@@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = {
.ndo_set_mac_address = eth_mac_addr,
};
-/* The loopback device is special. There is only one instance
- * per network namespace.
- */
-static void loopback_setup(struct net_device *dev)
+static void gen_lo_setup(struct net_device *dev,
+ unsigned int mtu,
+ const struct ethtool_ops *eth_ops,
+ const struct header_ops *hdr_ops,
+ const struct net_device_ops *dev_ops,
+ void (*dev_destructor)(struct net_device *dev))
{
- dev->mtu = 64 * 1024;
+ dev->mtu = mtu;
dev->hard_header_len = ETH_HLEN; /* 14 */
dev->min_header_len = ETH_HLEN; /* 14 */
dev->addr_len = ETH_ALEN; /* 6 */
@@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev)
| NETIF_F_NETNS_LOCAL
| NETIF_F_VLAN_CHALLENGED
| NETIF_F_LOOPBACK;
- dev->ethtool_ops = &loopback_ethtool_ops;
- dev->header_ops = &eth_header_ops;
- dev->netdev_ops = &loopback_ops;
+ dev->ethtool_ops = eth_ops;
+ dev->header_ops = hdr_ops;
+ dev->netdev_ops = dev_ops;
dev->needs_free_netdev = true;
- dev->priv_destructor = loopback_dev_free;
+ dev->priv_destructor = dev_destructor;
+}
+
+/* The loopback device is special. There is only one instance
+ * per network namespace.
+ */
+static void loopback_setup(struct net_device *dev)
+{
+ gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops,
+ &loopback_ops, loopback_dev_free);
}
/* Setup and register the loopback device. */
@@ -213,3 +231,43 @@ out:
struct pernet_operations __net_initdata loopback_net_ops = {
.init = loopback_net_init,
};
+
+/* blackhole netdevice */
+static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ kfree_skb(skb);
+ net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
+ return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops blackhole_netdev_ops = {
+ .ndo_start_xmit = blackhole_netdev_xmit,
+};
+
+/* This is a dst-dummy device used specifically for invalidated
+ * DSTs and unlike loopback, this is not per-ns.
+ */
+static void blackhole_netdev_setup(struct net_device *dev)
+{
+ gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL);
+}
+
+/* Setup and register the blackhole_netdev. */
+static int __init blackhole_netdev_init(void)
+{
+ blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
+ blackhole_netdev_setup);
+ if (!blackhole_netdev)
+ return -ENOMEM;
+
+ dev_init_scheduler(blackhole_netdev);
+ dev_activate(blackhole_netdev);
+
+ blackhole_netdev->flags |= IFF_UP | IFF_RUNNING;
+ dev_net_set(blackhole_netdev, &init_net);
+
+ return 0;
+}
+
+device_initcall(blackhole_netdev_init);