summaryrefslogtreecommitdiff
path: root/include/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-09-10 23:12:54 -0700
committerDavid S. Miller <davem@davemloft.net>2016-09-10 23:12:54 -0700
commit8fee3156034a844c0da62b338ef7651cfb0fc8b1 (patch)
tree3ff3d4b5dc5197f6089c610a2fa5380516a1d8df /include/net
parentcf9932a9414e241571008edd7412ab22f02b5704 (diff)
parentc71ad3d45a5e928e617ca436f3ce88bb773fb766 (diff)
Merge branch 'vrf-tx-hook'
David Ahern says: ==================== net: Convert vrf to tx hook The motivation for this series is that ICMP Unreachable - Fragmentation Needed packets are not handled properly for VRFs. Specifically, the FIB lookup in __ip_rt_update_pmtu fails so no nexthop exception is created with the reduced MTU. As a result connections stall if packets larger than the smallest MTU in the path are generated. While investigating that problem I also noticed that the MSS for all connections in a VRF is based on the VRF device's MTU and not the route the packets ultimately go through. VRF currently uses a dst to direct packets to the device. The first FIB lookup returns this dst and then the lookup in the VRF driver gets the actual output route. A side effect of this design is that the VRF dst is cached on sockets and then used for calculations like the MSS. This series fixes this problem by removing the hook in the FIB lookups that returns the dst pointing to the VRF device to the VRF and always doing the actual FIB lookup. This allows the real dst to be used throughout the stack (for example the MSS). Packets are diverted to the VRF device on Tx using an l3mdev hook in the output path similar to to what is done for Rx. The end result is a simpler implementation for VRF with fewer intrusions into the network stack and symmetrical packet handling for Rx and Tx paths. Comparison of netperf performance for a build without l3mdev (best case performance), the old vrf driver and the VRF driver from this series. Data are collected using VMs with virtio + vhost. The netperf client runs in the VM and netserver runs in the host. 1-byte RR tests are done as these packets exaggerate the performance hit due to the extra lookups done for l3mdev and VRF. Command: netperf -cC -H ${ip} -l 60 -t {TCP,UDP}_RR [-J red] TCP_RR UDP_RR IPv4 IPv6 IPv4 IPv6 no l3mdev 29,996 30,601 31,638 24,336 vrf old 27,417 27,626 29,159 24,801 vrf new 28,036 28,372 30,110 24,857 l3mdev, no vrf 29,534 30,465 30,670 24,346 * Transactions per second as reported by netperf * netperf modified to take a bind-to-device argument -- the -J red option 1. 'no l3mdev' == NET_L3_MASTER_DEV is unset so code is compiled out 2. 'vrf old' == data for existing implementation 3. 'vrf new' == data with this series 4. 'l3mdev, no vrf' == NET_L3_MASTER_DEV is enabled but traffic is not going through a VRF About the series - patch 1 adds the flow update (changing oif or iif to L3 master device and setting the flag to skip the oif check) to ipv4 and ipv6 paths just before hitting the rules. This catches all code paths in a single spot. - patch 2 adds the Tx hook to push the packet to the l3mdev if relevant - patch 3 adds some checks so the vrf device can act as a vrf-local loopback. These changes were not needed before since the vrf dst was returned from the lookup. - patches 4 and 5 flip the ipv4 and ipv6 stacks to the tx hook leaving the route lookup to be the real one. The dst flip happens at the beginning of the L3 output path so the VRFs can have device based features such as netfilter, tc and tcpdump. - patches 6-11 remove no longer needed l3mdev code v2 - properly handle IPv6 link scope addresses - keep the device xmit path and associated dst which is switched in by the l3_out hook. packets still need to go through the xmit path in case the user puts a qdisc on the vrf device and to allow tc rules. version 1 short circuited the tx handling and only covered netfilter and tcpdump. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/net')
-rw-r--r--include/net/flow.h3
-rw-r--r--include/net/l3mdev.h131
-rw-r--r--include/net/route.h10
3 files changed, 57 insertions, 87 deletions
diff --git a/include/net/flow.h b/include/net/flow.h
index d47ef4bb5423..035aa7716967 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -34,8 +34,7 @@ struct flowi_common {
__u8 flowic_flags;
#define FLOWI_FLAG_ANYSRC 0x01
#define FLOWI_FLAG_KNOWN_NH 0x02
-#define FLOWI_FLAG_L3MDEV_SRC 0x04
-#define FLOWI_FLAG_SKIP_NH_OIF 0x08
+#define FLOWI_FLAG_SKIP_NH_OIF 0x04
__u32 flowic_secid;
struct flowi_tunnel flowic_tun_key;
};
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index e90095091aa0..3832099289c5 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -11,6 +11,7 @@
#ifndef _NET_L3MDEV_H_
#define _NET_L3MDEV_H_
+#include <net/dst.h>
#include <net/fib_rules.h>
/**
@@ -18,30 +19,24 @@
*
* @l3mdev_fib_table: Get FIB table id to use for lookups
*
- * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device
+ * @l3mdev_l3_rcv: Hook in L3 receive path
*
- * @l3mdev_get_saddr: Get source address for a flow
+ * @l3mdev_l3_out: Hook in L3 output path
*
- * @l3mdev_get_rt6_dst: Get cached IPv6 rt6_info (dst_entry) for device
+ * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations
*/
struct l3mdev_ops {
u32 (*l3mdev_fib_table)(const struct net_device *dev);
struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
struct sk_buff *skb, u16 proto);
-
- /* IPv4 ops */
- struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev,
- const struct flowi4 *fl4);
- int (*l3mdev_get_saddr)(struct net_device *dev,
- struct flowi4 *fl4);
+ struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev,
+ struct sock *sk, struct sk_buff *skb,
+ u16 proto);
/* IPv6 ops */
- struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev,
+ struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev,
struct flowi6 *fl6);
- int (*l3mdev_get_saddr6)(struct net_device *dev,
- const struct sock *sk,
- struct flowi6 *fl6);
};
#ifdef CONFIG_NET_L3_MASTER_DEV
@@ -49,6 +44,8 @@ struct l3mdev_ops {
int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
struct fib_lookup_arg *arg);
+void l3mdev_update_flow(struct net *net, struct flowi *fl);
+
int l3mdev_master_ifindex_rcu(const struct net_device *dev);
static inline int l3mdev_master_ifindex(struct net_device *dev)
{
@@ -80,7 +77,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
}
static inline
-const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
+struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
{
/* netdev_master_upper_dev_get_rcu calls
* list_first_or_null_rcu to walk the upper dev list.
@@ -89,7 +86,7 @@ const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
* typecast to remove the const
*/
struct net_device *dev = (struct net_device *)_dev;
- const struct net_device *master;
+ struct net_device *master;
if (!dev)
return NULL;
@@ -104,26 +101,6 @@ const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
return master;
}
-/* get index of an interface to use for FIB lookups. For devices
- * enslaved to an L3 master device FIB lookups are based on the
- * master index
- */
-static inline int l3mdev_fib_oif_rcu(struct net_device *dev)
-{
- return l3mdev_master_ifindex_rcu(dev) ? : dev->ifindex;
-}
-
-static inline int l3mdev_fib_oif(struct net_device *dev)
-{
- int oif;
-
- rcu_read_lock();
- oif = l3mdev_fib_oif_rcu(dev);
- rcu_read_unlock();
-
- return oif;
-}
-
u32 l3mdev_fib_table_rcu(const struct net_device *dev);
u32 l3mdev_fib_table_by_index(struct net *net, int ifindex);
static inline u32 l3mdev_fib_table(const struct net_device *dev)
@@ -137,15 +114,6 @@ static inline u32 l3mdev_fib_table(const struct net_device *dev)
return tb_id;
}
-static inline struct rtable *l3mdev_get_rtable(const struct net_device *dev,
- const struct flowi4 *fl4)
-{
- if (netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_get_rtable)
- return dev->l3mdev_ops->l3mdev_get_rtable(dev, fl4);
-
- return NULL;
-}
-
static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
{
struct net_device *dev;
@@ -165,11 +133,7 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
return rc;
}
-int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);
-
-struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6);
-int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
- struct flowi6 *fl6);
+struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6);
static inline
struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
@@ -199,6 +163,34 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
return l3mdev_l3_rcv(skb, AF_INET6);
}
+static inline
+struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto)
+{
+ struct net_device *dev = skb_dst(skb)->dev;
+
+ if (netif_is_l3_slave(dev)) {
+ struct net_device *master;
+
+ master = netdev_master_upper_dev_get_rcu(dev);
+ if (master && master->l3mdev_ops->l3mdev_l3_out)
+ skb = master->l3mdev_ops->l3mdev_l3_out(master, sk,
+ skb, proto);
+ }
+
+ return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb)
+{
+ return l3mdev_l3_out(sk, skb, AF_INET);
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb)
+{
+ return l3mdev_l3_out(sk, skb, AF_INET6);
+}
#else
static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
@@ -216,20 +208,11 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
}
static inline
-const struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
+struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
{
return NULL;
}
-static inline int l3mdev_fib_oif_rcu(struct net_device *dev)
-{
- return dev ? dev->ifindex : 0;
-}
-static inline int l3mdev_fib_oif(struct net_device *dev)
-{
- return dev ? dev->ifindex : 0;
-}
-
static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev)
{
return 0;
@@ -243,43 +226,37 @@ static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
return 0;
}
-static inline struct rtable *l3mdev_get_rtable(const struct net_device *dev,
- const struct flowi4 *fl4)
-{
- return NULL;
-}
-
static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
{
return false;
}
-static inline int l3mdev_get_saddr(struct net *net, int ifindex,
- struct flowi4 *fl4)
+static inline
+struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6)
{
- return 0;
+ return NULL;
}
static inline
-struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6)
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
{
- return NULL;
+ return skb;
}
-static inline int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
- struct flowi6 *fl6)
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
{
- return 0;
+ return skb;
}
static inline
-struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb)
{
return skb;
}
static inline
-struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb)
{
return skb;
}
@@ -290,6 +267,10 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
{
return 1;
}
+static inline
+void l3mdev_update_flow(struct net *net, struct flowi *fl)
+{
+}
#endif
#endif /* _NET_L3MDEV_H_ */
diff --git a/include/net/route.h b/include/net/route.h
index ad777d79af94..0429d47cad25 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -29,7 +29,6 @@
#include <net/flow.h>
#include <net/inet_sock.h>
#include <net/ip_fib.h>
-#include <net/l3mdev.h>
#include <linux/in_route.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
@@ -285,15 +284,6 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
ip_route_connect_init(fl4, dst, src, tos, oif, protocol,
sport, dport, sk);
- if (!src && oif) {
- int rc;
-
- rc = l3mdev_get_saddr(net, oif, fl4);
- if (rc < 0)
- return ERR_PTR(rc);
-
- src = fl4->saddr;
- }
if (!dst || !src) {
rt = __ip_route_output_key(net, fl4);
if (IS_ERR(rt))