summaryrefslogtreecommitdiff
path: root/net/switchdev/switchdev.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2015-07-20 18:32:45 -0700
committerDavid S. Miller <davem@davemloft.net>2015-07-20 18:32:45 -0700
commitbd265242c10afa6c99c46b13ad087238d750fdb4 (patch)
tree701d726a9490d0bd1ca9be3ac704a75fb9e37488 /net/switchdev/switchdev.c
parent8254973fa3459b512b6c0cd5b0e4641e4d7c048c (diff)
parenta48037e7c6c25436912f78f48cdbb75a710b7aa9 (diff)
Merge branch 'offload_fwd_mark'
Scott Feldman says: ==================== switchdev: avoid duplicate packet forwarding v3: - Per Nicolas Dichtel review: remove errant empty union. v2: - Per davem review: in sk_buff, union fwd_mark with secmark to save space since features appear to be mutually exclusive. - Per Simon Horman review: - fix grammar in switchdev.txt wrt fwd_mark - remove some unrelated changes that snuck in v1: This patchset was previously submitted as RFC. No changes from the last version (v2) sent under RFC. Including RFC version history here for reference. RFC v2: - s/fwd_mark/offload_fwd_mark - use consume_skb rather than kfree_skb when dropping pkt on egress. - Use Jiri's suggestion to use ifindex of one of the ports in a group as the mark for all the ports in the group. This can be done with no additional storage (no hashtable from v1). To pull it off, we need some simple recursive routines to walk the netdev tree ensuring all leaves in the tree (ports) in the same group (e.g. bridge) belonging to the same switch device will have the same offload fwd mark. Maybe someone sees a better design for the recusive routines? They're not too bad, and should cover the stacked driver cases. RFC v1: With switchdev support for offloading L2/L3 forwarding data path to a switch device, we have a general problem where both the device and the kernel may forward the packet, resulting in duplicate packets on the wire. Anytime a packet is forwarded by the device and a copy is sent to the CPU, there is potential for duplicate forwarding, as the kernel may also do a forwarding lookup and send the packet on the wire. The specific problem this patch series is interested in solving is avoiding duplicate packets on bridged ports. There was a previous RFC from Roopa (http://marc.info/?l=linux-netdev&m=142687073314252&w=2) to address this problem, but didn't solve the problem of mixed ports in the bridge from different devices; there was no way to exclude some ports from forwarding and include others. This RFC solves that problem by tagging the ingressing packet with a unique mark, and then comparing the packet mark with the egress port mark, and skip forwarding when there is a match. For the mixed ports bridge case, only those ports with matching marks are skipped. The switchdev port driver must do two things: 1) Generate a fwd_mark for each switch port, using some unique key of the switch device (and optionally port). This is done when the port netdev is registered or if the port's group membership changes (joins/leaves a bridge, for example). 2) On packet ingress from port, mark the skb with the ingress port's fwd_mark. If the device supports it, it's useful to only mark skbs which were already forwarded by the device. If the device does not support such indication, all skbs can be marked, even if they're local dst. Two new 32-bit fields are added to struct sk_buff and struct netdevice to hold the fwd_mark. I've wrapped these with CONFIG_NET_SWITCHDEV for now. I tried using skb->mark for this purpose, but ebtables can overwrite the skb->mark before the bridge gets it, so that will not work. In general, this fwd_mark can be used for any case where a packet is forwarded by the device and a copy is sent to the CPU, to avoid the kernel re-forwarding the packet. sFlow is another use-case that comes to mind, but I haven't explored the details. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/switchdev/switchdev.c')
-rw-r--r--net/switchdev/switchdev.c111
1 files changed, 105 insertions, 6 deletions
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 9f2add3cba26..33bafa2e703e 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -910,13 +910,9 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
if (switchdev_port_attr_get(dev, &attr))
return NULL;
- if (nhsel > 0) {
- if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len)
+ if (nhsel > 0 &&
+ !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
return NULL;
- if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id,
- attr.u.ppid.id_len))
- return NULL;
- }
prev_attr = attr;
}
@@ -1043,3 +1039,106 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi)
fi->fib_net->ipv4.fib_offload_disabled = true;
}
EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
+
+static bool switchdev_port_same_parent_id(struct net_device *a,
+ struct net_device *b)
+{
+ struct switchdev_attr a_attr = {
+ .id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+ struct switchdev_attr b_attr = {
+ .id = SWITCHDEV_ATTR_PORT_PARENT_ID,
+ .flags = SWITCHDEV_F_NO_RECURSE,
+ };
+
+ if (switchdev_port_attr_get(a, &a_attr) ||
+ switchdev_port_attr_get(b, &b_attr))
+ return false;
+
+ return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid);
+}
+
+static u32 switchdev_port_fwd_mark_get(struct net_device *dev,
+ struct net_device *group_dev)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+ if (lower_dev == dev)
+ continue;
+ if (switchdev_port_same_parent_id(dev, lower_dev))
+ return lower_dev->offload_fwd_mark;
+ return switchdev_port_fwd_mark_get(dev, lower_dev);
+ }
+
+ return dev->ifindex;
+}
+
+static void switchdev_port_fwd_mark_reset(struct net_device *group_dev,
+ u32 old_mark, u32 *reset_mark)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(group_dev, lower_dev, iter) {
+ if (lower_dev->offload_fwd_mark == old_mark) {
+ if (!*reset_mark)
+ *reset_mark = lower_dev->ifindex;
+ lower_dev->offload_fwd_mark = *reset_mark;
+ }
+ switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark);
+ }
+}
+
+/**
+ * switchdev_port_fwd_mark_set - Set port offload forwarding mark
+ *
+ * @dev: port device
+ * @group_dev: containing device
+ * @joining: true if dev is joining group; false if leaving group
+ *
+ * An ungrouped port's offload mark is just its ifindex. A grouped
+ * port's (member of a bridge, for example) offload mark is the ifindex
+ * of one of the ports in the group with the same parent (switch) ID.
+ * Ports on the same device in the same group will have the same mark.
+ *
+ * Example:
+ *
+ * br0 ifindex=9
+ * sw1p1 ifindex=2 mark=2
+ * sw1p2 ifindex=3 mark=2
+ * sw2p1 ifindex=4 mark=5
+ * sw2p2 ifindex=5 mark=5
+ *
+ * If sw2p2 leaves the bridge, we'll have:
+ *
+ * br0 ifindex=9
+ * sw1p1 ifindex=2 mark=2
+ * sw1p2 ifindex=3 mark=2
+ * sw2p1 ifindex=4 mark=4
+ * sw2p2 ifindex=5 mark=5
+ */
+void switchdev_port_fwd_mark_set(struct net_device *dev,
+ struct net_device *group_dev,
+ bool joining)
+{
+ u32 mark = dev->ifindex;
+ u32 reset_mark = 0;
+
+ if (group_dev && joining) {
+ mark = switchdev_port_fwd_mark_get(dev, group_dev);
+ } else if (group_dev && !joining) {
+ if (dev->offload_fwd_mark == mark)
+ /* Ohoh, this port was the mark reference port,
+ * but it's leaving the group, so reset the
+ * mark for the remaining ports in the group.
+ */
+ switchdev_port_fwd_mark_reset(group_dev, mark,
+ &reset_mark);
+ }
+
+ dev->offload_fwd_mark = mark;
+}
+EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set);