Diffstat (limited to 'net')
209 files changed, 5362 insertions, 2150 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index d1902828a18a..e5d23e75572a 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -638,12 +638,7 @@ void vlan_dev_free_egress_priority(const struct net_device *dev) static void vlan_dev_uninit(struct net_device *dev) { - struct vlan_dev_priv *vlan = vlan_dev_priv(dev); - vlan_dev_free_egress_priority(dev); - - /* Get rid of the vlan's reference to real_dev */ - dev_put_track(vlan->real_dev, &vlan->dev_tracker); } static netdev_features_t vlan_dev_fix_features(struct net_device *dev, @@ -856,6 +851,9 @@ static void vlan_dev_free(struct net_device *dev) free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; + + /* Get rid of the vlan's reference to real_dev */ + dev_put_track(vlan->real_dev, &vlan->dev_tracker); } void vlan_setup(struct net_device *dev) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index eb9fb55280ef..01f8067994d6 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -281,9 +281,9 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) ref = priv->rings[i].intf->ref[j]; gnttab_end_foreign_access(ref, 0, 0); } - free_pages((unsigned long)priv->rings[i].data.in, - priv->rings[i].intf->ring_order - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(priv->rings[i].data.in, + 1UL << (priv->rings[i].intf->ring_order + + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); free_page((unsigned long)priv->rings[i].intf); @@ -322,8 +322,8 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, if (ret < 0) goto out; ring->ref = ret; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - order - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + bytes = alloc_pages_exact(1UL << (order + XEN_PAGE_SHIFT), + GFP_KERNEL | __GFP_ZERO); if (!bytes) { ret = -ENOMEM; goto out; @@ -354,9 +354,7 @@ out: if (bytes) { for (i--; i >= 0; i--) gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); - free_pages((unsigned long)bytes, - ring->intf->ring_order - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(bytes, 1UL << (order + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(ring->ref, 0, 0); free_page((unsigned long)ring->intf); diff --git a/net/Kconfig b/net/Kconfig index 8a1f9d0287de..6b78f695caa6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -434,6 +434,19 @@ config NET_DEVLINK config PAGE_POOL bool +config PAGE_POOL_STATS + default n + bool "Page pool stats" + depends on PAGE_POOL + help + Enable page pool statistics to track page allocation and recycling + in page pools. This option incurs additional CPU cost in allocation + and recycle paths and additional memory cost to store the statistics. + These statistics are only available if this option is enabled and if + the driver using the page pool supports exporting this data. + + If unsure, say N. 
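As a rough illustration of what the new PAGE_POOL_STATS option makes possible (this sketch is not part of the patch above; the page_pool_get_stats() helper, the struct page_pool_stats field names and the drv_fill_pp_stats() function are assumptions based on the page pool statistics work, not code from this diff), a driver could snapshot the counters when filling an ethtool stats buffer:

#ifdef CONFIG_PAGE_POOL_STATS
#include <net/page_pool.h>

/* Hypothetical driver helper: copy a few page pool counters into an
 * ethtool stats array. page_pool_get_stats() is only available when
 * CONFIG_PAGE_POOL_STATS is enabled and returns false when no
 * statistics could be filled in.
 */
static void drv_fill_pp_stats(struct page_pool *pool, u64 *data)
{
	struct page_pool_stats stats = { };

	if (!page_pool_get_stats(pool, &stats))
		return;

	data[0] = stats.alloc_stats.fast;	/* pages served from the pool cache */
	data[1] = stats.alloc_stats.slow;	/* slow-path page allocations */
	data[2] = stats.recycle_stats.ring;	/* pages recycled via the ptr ring */
}
#endif /* CONFIG_PAGE_POOL_STATS */

This matches the help text above: the counters cost a little CPU and memory on the allocation and recycle paths, and they only become visible if the driver using the pool exports them.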
+ config FAILOVER tristate "Generic failover module" help diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d53cbb4e2503..6bd097180772 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -87,6 +87,13 @@ again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { sk = s->sk; + if (!sk) { + spin_unlock_bh(&ax25_list_lock); + s->ax25_dev = NULL; + ax25_disconnect(s, ENETUNREACH); + spin_lock_bh(&ax25_list_lock); + goto again; + } sock_hold(sk); spin_unlock_bh(&ax25_list_lock); lock_sock(sk); diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f94f538fa382..7f6a7c96ac92 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -13,13 +13,13 @@ #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> +#include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 71999e13f729..b6db999abf75 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -10,13 +10,13 @@ #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/byteorder/generic.h> +#include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/minmax.h> #include <linux/netdevice.h> diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 1d750f3cb2e4..033639df96d8 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -9,12 +9,12 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> +#include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 2ed9496fc41f..7f8a14d99cdb 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -10,6 +10,7 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/compiler.h> +#include <linux/container_of.h> #include <linux/crc16.h> #include <linux/errno.h> #include <linux/etherdevice.h> @@ -443,7 +444,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); - netif_rx_any_context(skb); + netif_rx(skb); out: batadv_hardif_put(primary_if); } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 2f008e329007..fefb51a5f606 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -11,6 +11,7 @@ #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/byteorder/generic.h> +#include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> @@ -20,7 +21,6 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/netlink.h> diff --git 
a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index b7466136e292..d26124bc27e1 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -9,6 +9,7 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> +#include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 8a2b78f9c4b2..83fb51b6e299 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -9,11 +9,11 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> +#include <linux/container_of.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> @@ -149,25 +149,28 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) struct net *net = dev_net(net_dev); struct net_device *parent_dev; struct net *parent_net; + int iflink; bool ret; /* check if this is a batman-adv mesh interface */ if (batadv_softif_is_valid(net_dev)) return true; - /* no more parents..stop recursion */ - if (dev_get_iflink(net_dev) == 0 || - dev_get_iflink(net_dev) == net_dev->ifindex) + iflink = dev_get_iflink(net_dev); + if (iflink == 0) return false; parent_net = batadv_getlink_net(net_dev, net); + /* iflink to itself, most likely physical device */ + if (net == parent_net && iflink == net_dev->ifindex) + return false; + /* recurse over the parent device */ - parent_dev = __dev_get_by_index((struct net *)parent_net, - dev_get_iflink(net_dev)); - /* if we got a NULL parent_dev there is something broken.. */ + parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); if (!parent_dev) { - pr_err("Cannot find parent device\n"); + pr_warn("Cannot find parent device. 
Skipping batadv-on-batadv check for %s\n", + net_dev->name); return false; } @@ -214,14 +217,15 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) struct net_device *real_netdev = NULL; struct net *real_net; struct net *net; - int ifindex; + int iflink; ASSERT_RTNL(); if (!netdev) return NULL; - if (netdev->ifindex == dev_get_iflink(netdev)) { + iflink = dev_get_iflink(netdev); + if (iflink == 0) { dev_hold(netdev); return netdev; } @@ -231,9 +235,16 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) goto out; net = dev_net(hard_iface->soft_iface); - ifindex = dev_get_iflink(netdev); real_net = batadv_getlink_net(netdev, net); - real_netdev = dev_get_by_index(real_net, ifindex); + + /* iflink to itself, most likely physical device */ + if (net == real_net && netdev->ifindex == iflink) { + real_netdev = netdev; + dev_hold(real_netdev); + goto out; + } + + real_netdev = dev_get_by_index(real_net, iflink); out: batadv_hardif_put(hard_iface); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 5207cd8d6ad8..e8a449915566 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -9,6 +9,7 @@ #include <linux/atomic.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> +#include <linux/container_of.h> #include <linux/crc32c.h> #include <linux/device.h> #include <linux/errno.h> @@ -132,7 +133,6 @@ static void __exit batadv_exit(void) rtnl_link_unregister(&batadv_link_ops); unregister_netdevice_notifier(&batadv_hard_if_notifier); - flush_workqueue(batadv_event_workqueue); destroy_workqueue(batadv_event_workqueue); batadv_event_workqueue = NULL; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 494d1ebecac2..f3be82999f1f 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2022.0" +#define BATADV_SOURCE_VERSION "2022.1" #endif /* B.A.T.M.A.N. 
parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 9f311fddfaf9..b238455913df 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -11,6 +11,7 @@ #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> +#include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 974d726fabb9..5f4aeeb60dc4 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -11,6 +11,7 @@ #include <linux/bitops.h> #include <linux/byteorder/generic.h> #include <linux/compiler.h> +#include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> @@ -19,7 +20,6 @@ #include <linux/init.h> #include <linux/jhash.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index aadc653ca1d8..34903df4fe93 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -8,11 +8,11 @@ #include "main.h" #include <linux/atomic.h> +#include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 477d85a3b558..0379b126865d 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -10,13 +10,13 @@ #include <linux/atomic.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> +#include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_ether.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 2dbbe6c19609..0f5c0679b55a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -11,6 +11,7 @@ #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> +#include <linux/container_of.h> #include <linux/cpumask.h> #include <linux/errno.h> #include <linux/etherdevice.h> @@ -19,7 +20,6 @@ #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 93730d30af54..7f3dd3c393e0 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -12,13 +12,13 @@ #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> +#include <linux/container_of.h> #include <linux/err.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/kthread.h> #include <linux/limits.h> diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 4b7ad6684bc4..8478034d3abf 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -13,6 +13,7 @@ #include <linux/byteorder/generic.h> #include <linux/cache.h> #include 
<linux/compiler.h> +#include <linux/container_of.h> #include <linux/crc32c.h> #include <linux/errno.h> #include <linux/etherdevice.h> @@ -21,7 +22,6 @@ #include <linux/init.h> #include <linux/jhash.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 0cb58eb04093..7ec2e2343884 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -7,10 +7,10 @@ #include "main.h" #include <linux/byteorder/generic.h> +#include <linux/container_of.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 133d7ea063fb..215af9b3b589 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -240,7 +240,7 @@ static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev) if (!skb_cp) return NET_RX_DROP; - return netif_rx_ni(skb_cp); + return netif_rx(skb_cp); } static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, @@ -641,7 +641,6 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, return NULL; peer->chan = chan; - memset(&peer->peer_addr, 0, sizeof(struct in6_addr)); baswap((void *)peer->lladdr, &chan->dst); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 40baa6b7321a..5a6a49885ab6 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -400,7 +400,7 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) dev->stats.rx_packets++; nskb->ip_summed = CHECKSUM_NONE; nskb->protocol = eth_type_trans(nskb, dev); - netif_rx_ni(nskb); + netif_rx(nskb); return 0; badframe: diff --git a/net/bluetooth/eir.h b/net/bluetooth/eir.h index 05e2e917fc25..43f1945bffc5 100644 --- a/net/bluetooth/eir.h +++ b/net/bluetooth/eir.h @@ -15,6 +15,11 @@ u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr); u8 eir_append_local_name(struct hci_dev *hdev, u8 *eir, u8 ad_len); u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len); +static inline u16 eir_precalc_len(u8 data_len) +{ + return sizeof(u8) * 2 + data_len; +} + static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, u8 data_len) { @@ -36,6 +41,21 @@ static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) return eir_len; } +static inline u16 eir_skb_put_data(struct sk_buff *skb, u8 type, u8 *data, u8 data_len) +{ + u8 *eir; + u16 eir_len; + + eir_len = eir_precalc_len(data_len); + eir = skb_put(skb, eir_len); + WARN_ON(sizeof(type) + data_len > U8_MAX); + eir[0] = sizeof(type) + data_len; + eir[1] = type; + memcpy(&eir[2], data, data_len); + + return eir_len; +} + static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type, size_t *data_len) { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5bde0ec41177..b4782a6c1025 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2739,6 +2739,7 @@ void hci_release_dev(struct hci_dev *hdev) hci_dev_unlock(hdev); ida_simple_remove(&hci_index_ida, hdev->id); + kfree_skb(hdev->sent_cmd); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 63b925921c87..4888c1f8a9b7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5716,8 +5716,6 @@ static void hci_le_ext_adv_term_evt(struct hci_dev 
*hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - adv = hci_find_adv_instance(hdev, ev->handle); - /* The Bluetooth Core 5.3 specification clearly states that this event * shall not be sent when the Host disables the advertising set. So in * case of HCI_ERROR_CANCELLED_BY_HOST, just ignore the event. @@ -5730,9 +5728,13 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, return; } + hci_dev_lock(hdev); + + adv = hci_find_adv_instance(hdev, ev->handle); + if (ev->status) { if (!adv) - return; + goto unlock; /* Remove advertising as it has been terminated */ hci_remove_adv_instance(hdev, ev->handle); @@ -5740,12 +5742,12 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) { if (adv->enabled) - return; + goto unlock; } /* We are no longer advertising, clear HCI_LE_ADV */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - return; + goto unlock; } if (adv) @@ -5760,16 +5762,19 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM || bacmp(&conn->resp_addr, BDADDR_ANY)) - return; + goto unlock; if (!ev->handle) { bacpy(&conn->resp_addr, &hdev->random_addr); - return; + goto unlock; } if (adv) bacpy(&conn->resp_addr, &adv->random_addr); } + +unlock: + hci_dev_unlock(hdev); } static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, void *data, diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 6e71aa6b6fea..af7ea8a3317d 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -276,40 +276,37 @@ EXPORT_SYMBOL(__hci_cmd_sync_status); static void hci_cmd_sync_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work); - struct hci_cmd_sync_work_entry *entry; - hci_cmd_sync_work_func_t func; - hci_cmd_sync_work_destroy_t destroy; - void *data; bt_dev_dbg(hdev, ""); - mutex_lock(&hdev->cmd_sync_work_lock); - entry = list_first_entry(&hdev->cmd_sync_work_list, - struct hci_cmd_sync_work_entry, list); - if (entry) { - list_del(&entry->list); - func = entry->func; - data = entry->data; - destroy = entry->destroy; - kfree(entry); - } else { - func = NULL; - data = NULL; - destroy = NULL; - } - mutex_unlock(&hdev->cmd_sync_work_lock); + /* Dequeue all entries and run them */ + while (1) { + struct hci_cmd_sync_work_entry *entry; - if (func) { - int err; + mutex_lock(&hdev->cmd_sync_work_lock); + entry = list_first_entry_or_null(&hdev->cmd_sync_work_list, + struct hci_cmd_sync_work_entry, + list); + if (entry) + list_del(&entry->list); + mutex_unlock(&hdev->cmd_sync_work_lock); - hci_req_sync_lock(hdev); + if (!entry) + break; - err = func(hdev, data); + bt_dev_dbg(hdev, "entry %p", entry); - if (destroy) - destroy(hdev, data, err); + if (entry->func) { + int err; + + hci_req_sync_lock(hdev); + err = entry->func(hdev, entry->data); + if (entry->destroy) + entry->destroy(hdev, entry->data, err); + hci_req_sync_unlock(hdev); + } - hci_req_sync_unlock(hdev); + kfree(entry); } } @@ -1844,6 +1841,7 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; + u8 filter_policy; int err; /* Pause advertising if resolving list can be used as controllers are @@ -1930,6 +1928,8 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) err = -EINVAL; done: + filter_policy = err ? 0x00 : 0x01; + /* Enable address resolution when LL Privacy is enabled. 
*/ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01); if (err) @@ -1940,7 +1940,7 @@ done: hci_resume_advertising_sync(hdev); /* Select filter policy to use accept list */ - return err ? 0x00 : 0x01; + return filter_policy; } /* Returns true if an le connection is in the scanning state */ @@ -3265,10 +3265,10 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ - /* If the controller supports LL Privacy feature, enable - * the corresponding event. + /* If the controller supports LL Privacy feature or LE Extended Adv, + * enable the corresponding event. */ - if (hdev->le_features[0] & HCI_LE_LL_PRIVACY) + if (use_enhanced_conn_complete(hdev)) events[1] |= 0x02; /* LE Enhanced Connection Complete */ /* If the controller supports Extended Scanner Filter @@ -4109,9 +4109,9 @@ int hci_dev_close_sync(struct hci_dev *hdev) hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); hci_conn_hash_flush(hdev); - hci_dev_unlock(hdev); - + /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */ smp_unregister(hdev); + hci_dev_unlock(hdev); hci_sock_dev_event(hdev, HCI_DEV_DOWN); @@ -4425,7 +4425,7 @@ static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason) return err; } - return err; + return 0; } /* This function perform power off HCI command sequence as follows: @@ -5188,7 +5188,7 @@ static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN, plen, data, HCI_EV_LE_ENHANCED_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + conn->conn_timeout, NULL); } int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) @@ -5273,9 +5273,18 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261: + * + * If this event is unmasked and the HCI_LE_Connection_Complete event + * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is + * sent when a new connection has been created. + */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN, - sizeof(cp), &cp, HCI_EV_LE_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + sizeof(cp), &cp, + use_enhanced_conn_complete(hdev) ? + HCI_EV_LE_ENHANCED_CONN_COMPLETE : + HCI_EV_LE_CONN_COMPLETE, + conn->conn_timeout, NULL); done: /* Re-enable advertising after the connection attempt is finished. */ diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index e817ff0607a0..8df99c07f272 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1436,6 +1436,7 @@ static void l2cap_ecred_connect(struct l2cap_chan *chan) l2cap_ecred_init(chan, 0); + memset(&data, 0, sizeof(data)); data.pdu.req.psm = chan->psm; data.pdu.req.mtu = cpu_to_le16(chan->imtu); data.pdu.req.mps = cpu_to_le16(chan->mps); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 5dd684e0b259..8101a6a31841 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1220,7 +1220,13 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode *cp; + + /* Make sure cmd still outstanding. 
*/ + if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + return; + + cp = cmd->param; bt_dev_dbg(hdev, "err %d", err); @@ -1244,7 +1250,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) mgmt_status(err)); } - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); } static int set_powered_sync(struct hci_dev *hdev, void *data) @@ -1283,7 +1289,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_POWERED, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1292,6 +1298,9 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd, mgmt_set_powered_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1385,6 +1394,10 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1404,7 +1417,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_dev_unlock(hdev); } @@ -1513,7 +1526,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1540,6 +1553,9 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_discoverable_sync, cmd, mgmt_set_discoverable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1552,6 +1568,10 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1564,7 +1584,9 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + if (cmd) + mgmt_pending_remove(cmd); + hci_dev_unlock(hdev); } @@ -1636,7 +1658,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1656,6 +1678,9 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_connectable_sync, cmd, mgmt_set_connectable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1776,6 +1801,10 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) u8 enable = cp->val; bool changed; + /* Make sure cmd still outstanding. 
*/ + if (cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + return; + if (err) { u8 mgmt_err = mgmt_status(err); @@ -2269,7 +2298,9 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_cp_remove_uuid *cp = data; struct mgmt_pending_cmd *cmd; struct bt_uuid *match, *tmp; - u8 bt_uuid_any[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + static const u8 bt_uuid_any[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; int err, found; bt_dev_dbg(hdev, "sock %p", sk); @@ -3323,6 +3354,9 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); + if (cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + return; + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); @@ -3495,6 +3529,9 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) struct sk_buff *skb = cmd->skb; u8 status = mgmt_status(err); + if (cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -3761,13 +3798,6 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (pending_find(MGMT_OP_SET_WIDEBAND_SPEECH, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_WIDEBAND_SPEECH, - MGMT_STATUS_BUSY); - goto unlock; - } - if (hdev_is_powered(hdev) && !!cp->val != hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) { @@ -4515,9 +4545,9 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, } } -done: hci_dev_unlock(hdev); +done: if (status == MGMT_STATUS_SUCCESS) device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type, supported_flags, current_flags); @@ -5038,12 +5068,6 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - MGMT_STATUS_BUSY); - goto unlock; - } - cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0); if (!cmd) err = -ENOMEM; @@ -5263,11 +5287,16 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_discovery_set_state(hdev, err ? 
DISCOVERY_STOPPED: DISCOVERY_FINDING); @@ -5329,7 +5358,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, else hdev->discovery.limited = false; - cmd = mgmt_pending_new(sk, op, hdev, data, len); + cmd = mgmt_pending_add(sk, op, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -5338,7 +5367,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5432,7 +5461,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_START_SERVICE_DISCOVERY, + cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; @@ -5465,7 +5494,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5497,11 +5526,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); if (!err) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -5537,7 +5569,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - cmd = mgmt_pending_new(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; goto unlock; @@ -5546,7 +5578,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, stop_discovery_sync, cmd, stop_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto unlock; } @@ -7476,6 +7508,9 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; + if (cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -7971,11 +8006,7 @@ static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags) static bool adv_busy(struct hci_dev *hdev) { - return (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_PARAMS, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_DATA, hdev)); + return pending_find(MGMT_OP_SET_LE, hdev); } static void add_adv_complete(struct hci_dev *hdev, struct sock *sk, u8 instance, @@ -8048,7 +8079,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, u32 flags; u8 status; u16 timeout, duration; - unsigned int prev_instance_cnt = hdev->adv_instance_cnt; + unsigned int prev_instance_cnt; u8 schedule_instance = 0; struct adv_info *next_instance; int err; @@ -8099,6 +8130,8 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } + prev_instance_cnt = hdev->adv_instance_cnt; + err = hci_add_adv_instance(hdev, cp->instance, flags, cp->adv_data_len, cp->data, cp->scan_rsp_len, @@ -8565,9 +8598,7 @@ static int remove_advertising(struct sock *sk, struct 
hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev)) { + if (pending_find(MGMT_OP_SET_LE, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_BUSY); goto unlock; @@ -8603,7 +8634,6 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_get_adv_size_info *cp = data; struct mgmt_rp_get_adv_size_info rp; u32 flags, supported_flags; - int err; bt_dev_dbg(hdev, "sock %p", sk); @@ -8630,10 +8660,8 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true); rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false); - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, - MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); - - return err; + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); } static const struct hci_mgmt_handler mgmt_handlers[] = { @@ -9061,12 +9089,14 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, u16 eir_len = 0; u32 flags = 0; + /* allocate buff for LE or BR/EDR adv */ if (conn->le_adv_data_len > 0) skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED, - conn->le_adv_data_len); + sizeof(*ev) + conn->le_adv_data_len); else skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED, - 2 + name_len + 5); + sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0) + + eir_precalc_len(sizeof(conn->dev_class))); ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, &conn->dst); @@ -9085,18 +9115,12 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, skb_put_data(skb, conn->le_adv_data, conn->le_adv_data_len); eir_len = conn->le_adv_data_len; } else { - if (name_len > 0) { - eir_len = eir_append_data(ev->eir, 0, EIR_NAME_COMPLETE, - name, name_len); - skb_put(skb, eir_len); - } + if (name) + eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len); - if (memcmp(conn->dev_class, "\0\0\0", 3) != 0) { - eir_len = eir_append_data(ev->eir, eir_len, - EIR_CLASS_OF_DEV, - conn->dev_class, 3); - skb_put(skb, 5); - } + if (memcmp(conn->dev_class, "\0\0\0", sizeof(conn->dev_class))) + eir_len += eir_skb_put_data(skb, EIR_CLASS_OF_DEV, + conn->dev_class, sizeof(conn->dev_class)); } ev->eir_len = cpu_to_le16(eir_len); @@ -9785,28 +9809,21 @@ void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, { struct sk_buff *skb; struct mgmt_ev_device_found *ev; - u16 eir_len; - u32 flags; + u16 eir_len = 0; + u32 flags = 0; - if (name_len) - skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, 2 + name_len); - else - skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, 0); + skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, + sizeof(*ev) + (name ? 
eir_precalc_len(name_len) : 0)); ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, bdaddr); ev->addr.type = link_to_bdaddr(link_type, addr_type); ev->rssi = rssi; - if (name) { - eir_len = eir_append_data(ev->eir, 0, EIR_NAME_COMPLETE, name, - name_len); - flags = 0; - skb_put(skb, eir_len); - } else { - eir_len = 0; + if (name) + eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len); + else flags = MGMT_DEV_FOUND_NAME_REQUEST_FAILED; - } ev->eir_len = cpu_to_le16(eir_len); ev->flags = cpu_to_le32(flags); diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index edee60bbc7b4..37eef2ce55ae 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -77,11 +77,12 @@ int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag, { struct hci_dev *hdev; struct mgmt_hdr *hdr; - int len = skb->len; + int len; if (!skb) return -EINVAL; + len = skb->len; hdev = bt_cb(skb)->mgmt.hdev; /* Time stamp */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index f08034500813..eb129e48f90b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -150,6 +150,11 @@ static int bpf_test_finish(const union bpf_attr *kattr, if (data_out) { int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size; + if (len < 0) { + err = -ENOSPC; + goto out; + } + if (copy_to_user(data_out, data, len)) goto out; diff --git a/net/bridge/br.c b/net/bridge/br.c index 1fac72cc617f..b1dea3febeea 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -342,23 +342,26 @@ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) clear_bit(opt, &br->options); } -static void __net_exit br_net_exit(struct net *net) +static void __net_exit br_net_exit_batch(struct list_head *net_list) { struct net_device *dev; + struct net *net; LIST_HEAD(list); rtnl_lock(); - for_each_netdev(net, dev) - if (netif_is_bridge_master(dev)) - br_dev_delete(dev, &list); + + list_for_each_entry(net, net_list, exit_list) + for_each_netdev(net, dev) + if (netif_is_bridge_master(dev)) + br_dev_delete(dev, &list); unregister_netdevice_many(&list); - rtnl_unlock(); + rtnl_unlock(); } static struct pernet_operations br_net_ops = { - .exit = br_net_exit, + .exit_batch = br_net_exit_batch, }; static const struct stp_proto br_stp_proto = { diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 3db1def4437b..e5e48c6e35d7 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -84,7 +84,7 @@ static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p, skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_HOST; - netif_rx_ni(skb); + netif_rx(skb); } } @@ -364,7 +364,7 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, reply->ip_summed = CHECKSUM_UNNECESSARY; reply->pkt_type = PACKET_HOST; - netif_rx_ni(reply); + netif_rx(reply); } } diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index ec646656dbf1..02bb620d3b8d 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb->tstamp = 0; + skb_clear_tstamp(skb); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb, NULL, skb->dev, br_dev_queue_push_xmit); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index b50382f957c1..e0c13fcc50ed 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -81,6 +81,7 @@ int br_handle_frame_finish(struct net 
*net, struct sock *sk, struct sk_buff *skb if (!p || p->state == BR_STATE_DISABLED) goto drop; + br = p->br; brmctx = &p->br->multicast_ctx; pmctx = &p->multicast_ctx; state = p->state; @@ -88,10 +89,18 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb &state, &vlan)) goto out; + if (p->flags & BR_PORT_LOCKED) { + struct net_bridge_fdb_entry *fdb_src = + br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); + + if (!fdb_src || READ_ONCE(fdb_src->dst) != p || + test_bit(BR_FDB_LOCAL, &fdb_src->flags)) + goto drop; + } + nbp_switchdev_frame_mark(p, skb); /* insert into forwarding database after filtering to avoid spoofing */ - br = p->br; if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index de2409889489..db4f2641d1cd 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -82,6 +82,9 @@ static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg); static void __br_multicast_stop(struct net_bridge_mcast *brmctx); +static int br_mc_disabled_update(struct net_device *dev, bool value, + struct netlink_ext_ack *extack); + static struct net_bridge_port_group * br_sg_port_find(struct net_bridge *br, struct net_bridge_port_group_sg_key *sg_p) @@ -1156,6 +1159,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, return mp; if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { + br_mc_disabled_update(br->dev, false, NULL); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); return ERR_PTR(-E2BIG); } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 2ff83d84230d..7d4432ca9a20 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -184,6 +184,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -269,7 +270,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || - nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) + nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || + nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -827,6 +829,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, + [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, }; @@ -893,6 +896,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); + br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); changed_mask = old_flags ^ p->flags; diff --git a/net/bridge/br_private.h 
b/net/bridge/br_private.h index 2661dda1a92b..48bc61ebc211 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1985,7 +1985,7 @@ void br_switchdev_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg, int type); int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, - struct netlink_ext_ack *extack); + bool changed, struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); void br_switchdev_init(struct net_bridge *br); @@ -2052,8 +2052,8 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p, return 0; } -static inline int br_switchdev_port_vlan_add(struct net_device *dev, - u16 vid, u16 flags, +static inline int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, + u16 flags, bool changed, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index f8fbaaa7c501..6f6a70121a5e 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -72,7 +72,7 @@ bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, /* Flags that can be offloaded to hardware */ #define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | \ - BR_MCAST_FLOOD | BR_BCAST_FLOOD) + BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_PORT_LOCKED) int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, @@ -160,13 +160,14 @@ br_switchdev_fdb_notify(struct net_bridge *br, } int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, - struct netlink_ext_ack *extack) + bool changed, struct netlink_ext_ack *extack) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = flags, .vid = vid, + .changed = changed, }; return switchdev_port_obj_add(dev, &v.obj, extack); @@ -351,51 +352,19 @@ br_switchdev_vlan_replay_one(struct notifier_block *nb, return notifier_to_errno(err); } -static int br_switchdev_vlan_replay(struct net_device *br_dev, - struct net_device *dev, - const void *ctx, bool adding, - struct notifier_block *nb, - struct netlink_ext_ack *extack) +static int br_switchdev_vlan_replay_group(struct notifier_block *nb, + struct net_device *dev, + struct net_bridge_vlan_group *vg, + const void *ctx, unsigned long action, + struct netlink_ext_ack *extack) { - struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; - struct net_bridge_port *p; - struct net_bridge *br; - unsigned long action; int err = 0; u16 pvid; - ASSERT_RTNL(); - - if (!nb) - return 0; - - if (!netif_is_bridge_master(br_dev)) - return -EINVAL; - - if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) - return -EINVAL; - - if (netif_is_bridge_master(dev)) { - br = netdev_priv(dev); - vg = br_vlan_group(br); - p = NULL; - } else { - p = br_port_get_rtnl(dev); - if (WARN_ON(!p)) - return -EINVAL; - vg = nbp_vlan_group(p); - br = p->br; - } - if (!vg) return 0; - if (adding) - action = SWITCHDEV_PORT_OBJ_ADD; - else - action = SWITCHDEV_PORT_OBJ_DEL; - pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { @@ -415,7 +384,48 @@ static int br_switchdev_vlan_replay(struct net_device *br_dev, return err; } - return err; + return 0; +} + +static int br_switchdev_vlan_replay(struct net_device *br_dev, + const void *ctx, bool adding, + struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + struct net_bridge *br = netdev_priv(br_dev); + struct net_bridge_port *p; + unsigned long action; + int err; + + ASSERT_RTNL(); + + if (!nb) + return 0; + + if 
(!netif_is_bridge_master(br_dev)) + return -EINVAL; + + if (adding) + action = SWITCHDEV_PORT_OBJ_ADD; + else + action = SWITCHDEV_PORT_OBJ_DEL; + + err = br_switchdev_vlan_replay_group(nb, br_dev, br_vlan_group(br), + ctx, action, extack); + if (err) + return err; + + list_for_each_entry(p, &br->port_list, list) { + struct net_device *dev = p->dev; + + err = br_switchdev_vlan_replay_group(nb, dev, + nbp_vlan_group(p), + ctx, action, extack); + if (err) + return err; + } + + return 0; } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING @@ -681,8 +691,7 @@ static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, struct net_device *dev = p->dev; int err; - err = br_switchdev_vlan_replay(br_dev, dev, ctx, true, blocking_nb, - extack); + err = br_switchdev_vlan_replay(br_dev, ctx, true, blocking_nb, extack); if (err && err != -EOPNOTSUPP) return err; @@ -706,11 +715,11 @@ static void nbp_switchdev_unsync_objs(struct net_bridge_port *p, struct net_device *br_dev = p->br->dev; struct net_device *dev = p->dev; - br_switchdev_vlan_replay(br_dev, dev, ctx, false, blocking_nb, NULL); + br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb); br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL); - br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb); + br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL); } /* Let the bridge know that this port is offloaded, so that it can assign a diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 1402d5ca242d..7557e90b60e1 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -34,53 +34,70 @@ static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } -static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, +static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, const struct net_bridge_vlan *v) { if (vg->pvid == v->vid) - return false; + return; smp_wmb(); br_vlan_set_pvid_state(vg, v->state); vg->pvid = v->vid; - - return true; } -static bool __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) +static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) { if (vg->pvid != vid) - return false; + return; smp_wmb(); vg->pvid = 0; - - return true; } -/* return true if anything changed, false otherwise */ -static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) +/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v. + * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and + * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v. 
+ */ +static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags, + bool commit) { struct net_bridge_vlan_group *vg; - u16 old_flags = v->flags; - bool ret; + bool change; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); + /* check if anything would be changed on commit */ + change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) || + ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED); + + if (!commit) + goto out; + if (flags & BRIDGE_VLAN_INFO_PVID) - ret = __vlan_add_pvid(vg, v); + __vlan_add_pvid(vg, v); else - ret = __vlan_delete_pvid(vg, v->vid); + __vlan_delete_pvid(vg, v->vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; else v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; - return ret || !!(old_flags ^ v->flags); +out: + return change; +} + +static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags) +{ + return __vlan_flags_update(v, flags, false); +} + +static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags) +{ + __vlan_flags_update(v, flags, true); } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, @@ -92,7 +109,7 @@ static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ - err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); + err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err == -EOPNOTSUPP) return vlan_vid_add(dev, br->vlan_proto, v->vid); v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; @@ -284,9 +301,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, } br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); } else { - err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); - if (err && err != -EOPNOTSUPP) - goto out; + if (br_vlan_should_use(v)) { + err = br_switchdev_port_vlan_add(dev, v->vid, flags, + false, extack); + if (err && err != -EOPNOTSUPP) + goto out; + } br_multicast_ctx_init(br, v, &v->br_mcast_ctx); v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; } @@ -310,7 +330,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, goto out_fdb_insert; __vlan_add_list(v); - __vlan_add_flags(v, flags); + __vlan_flags_commit(v, flags); br_multicast_toggle_one_vlan(v, true); if (p) @@ -404,6 +424,7 @@ static void __vlan_flush(const struct net_bridge *br, { struct net_bridge_vlan *vlan, *tmp; u16 v_start = 0, v_end = 0; + int err; __vlan_delete_pvid(vg, vg->pvid); list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { @@ -417,7 +438,13 @@ static void __vlan_flush(const struct net_bridge *br, } v_end = vlan->vid; - __vlan_del(vlan); + err = __vlan_del(vlan); + if (err) { + br_err(br, + "port %u(%s) failed to delete vlan %d: %pe\n", + (unsigned int) p->port_no, p->dev->name, + vlan->vid, ERR_PTR(err)); + } } /* notify about the last/whole vlan range */ @@ -670,18 +697,29 @@ static int br_vlan_add_existing(struct net_bridge *br, u16 flags, bool *changed, struct netlink_ext_ack *extack) { + bool would_change = __vlan_flags_would_change(vlan, flags); + bool becomes_brentry = false; int err; - err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, extack); - if (err && err != -EOPNOTSUPP) - return err; - if (!br_vlan_is_brentry(vlan)) { /* Trying to change flags of non-existent bridge vlan */ - if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) { - err = -EINVAL; - goto err_flags; - } + if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) + return -EINVAL; + + becomes_brentry = true; + } + + /* 
Master VLANs that aren't brentries weren't notified before, + * time to notify them now. + */ + if (becomes_brentry || would_change) { + err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, + would_change, extack); + if (err && err != -EOPNOTSUPP) + return err; + } + + if (becomes_brentry) { /* It was only kept for port vlans, now make it real */ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); if (err) { @@ -696,13 +734,13 @@ static int br_vlan_add_existing(struct net_bridge *br, br_multicast_toggle_one_vlan(vlan, true); } - if (__vlan_add_flags(vlan, flags)) + __vlan_flags_commit(vlan, flags); + if (would_change) *changed = true; return 0; err_fdb_insert: -err_flags: br_switchdev_port_vlan_del(br->dev, vlan->vid); return err; } @@ -1247,11 +1285,18 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, *changed = false; vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { - /* Pass the flags to the hardware bridge */ - ret = br_switchdev_port_vlan_add(port->dev, vid, flags, extack); - if (ret && ret != -EOPNOTSUPP) - return ret; - *changed = __vlan_add_flags(vlan, flags); + bool would_change = __vlan_flags_would_change(vlan, flags); + + if (would_change) { + /* Pass the flags to the hardware bridge */ + ret = br_switchdev_port_vlan_add(port->dev, vid, flags, + true, extack); + if (ret && ret != -EOPNOTSUPP) + return ret; + } + + __vlan_flags_commit(vlan, flags); + *changed = would_change; return 0; } diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index fdbed3158555..ebfb2a5c59e4 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -32,6 +32,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + bool mono_delivery_time = skb->mono_delivery_time; unsigned int hlen, ll_rs, mtu; ktime_t tstamp = skb->tstamp; struct ip_frag_state state; @@ -81,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, if (iter.frag) ip_fraglist_prepare(skb, &iter); - skb->tstamp = tstamp; + skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -112,7 +113,7 @@ slow_path: goto blackhole; } - skb2->tstamp = tstamp; + skb_set_delivery_time(skb2, tstamp, mono_delivery_time); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 414dc5671c45..4d63ef13a1fd 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -99,7 +99,7 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) else skb->ip_summed = CHECKSUM_NONE; - netif_rx_any_context(skb); + netif_rx(skb); /* Update statistics. 
*/ priv->netdev->stats.rx_packets++; diff --git a/net/can/af_can.c b/net/can/af_can.c index cce2af10eb3e..1fb49d51b25d 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -284,7 +284,7 @@ int can_send(struct sk_buff *skb, int loop) } if (newskb) - netif_rx_ni(newskb); + netif_rx(newskb); /* update statistics */ pkg_stats->tx_frames++; diff --git a/net/can/gw.c b/net/can/gw.c index 24221352e059..1ea4cc527db3 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -577,6 +577,13 @@ static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } +static void cgw_job_free_rcu(struct rcu_head *rcu_head) +{ + struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); + + kmem_cache_free(cgw_cache, gwj); +} + static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { @@ -596,8 +603,7 @@ static int cgw_notifier(struct notifier_block *nb, if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); - synchronize_rcu(); - kmem_cache_free(cgw_cache, gwj); + call_rcu(&gwj->rcu, cgw_job_free_rcu); } } } @@ -1155,8 +1161,7 @@ static void cgw_remove_all_jobs(struct net *net) hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); - synchronize_rcu(); - kmem_cache_free(cgw_cache, gwj); + call_rcu(&gwj->rcu, cgw_job_free_rcu); } } @@ -1224,8 +1229,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); - synchronize_rcu(); - kmem_cache_free(cgw_cache, gwj); + call_rcu(&gwj->rcu, cgw_job_free_rcu); err = 0; break; } diff --git a/net/can/isotp.c b/net/can/isotp.c index d2a430b6a13b..d4c0b4704987 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -14,7 +14,6 @@ * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent * - as we have static buffers the check whether the PDU fits into the buffer * is done at FF reception time (no support for sending 'wait frames') - * - take care of the tx-queue-len as traffic shaping is still on the TODO list * * Copyright (c) 2020 Volkswagen Group Electronic Research * All rights reserved. @@ -87,9 +86,9 @@ MODULE_ALIAS("can-proto-6"); /* ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can * take full 32 bit values (4 Gbyte). We would need some good concept to handle * this between user space and kernel space. For now increase the static buffer - * to something about 8 kbyte to be able to test this new functionality. + * to something about 64 kbyte to be able to test this new functionality. 
*/ -#define MAX_MSG_LENGTH 8200 +#define MAX_MSG_LENGTH 66000 /* N_PCI type values in bits 7-4 of N_PCI bytes */ #define N_PCI_SF 0x00 /* single frame */ @@ -141,8 +140,10 @@ struct isotp_sock { struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; + u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; + u32 cfecho; /* consecutive frame echo tag */ struct tpcon rx, tx; struct list_head notifier; wait_queue_head_t wait; @@ -360,7 +361,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) so->tx_gap = ktime_set(0, 0); /* add transmission time for CAN frame N_As */ - so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime); + so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, @@ -712,6 +713,63 @@ static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, cf->data[0] = so->opt.ext_address; } +static void isotp_send_cframe(struct isotp_sock *so) +{ + struct sock *sk = &so->sk; + struct sk_buff *skb; + struct net_device *dev; + struct canfd_frame *cf; + int can_send_ret; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; + + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) + return; + + skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); + if (!skb) { + dev_put(dev); + return; + } + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + cf = (struct canfd_frame *)skb->data; + skb_put_zero(skb, so->ll.mtu); + + /* create consecutive frame */ + isotp_fill_dataframe(cf, so, ae, 0); + + /* place consecutive frame N_PCI in appropriate index */ + cf->data[ae] = N_PCI_CF | so->tx.sn++; + so->tx.sn %= 16; + so->tx.bs++; + + cf->flags = so->ll.tx_flags; + + skb->dev = dev; + can_skb_set_owner(skb, sk); + + /* cfecho should have been zero'ed by init/isotp_rcv_echo() */ + if (so->cfecho) + pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho); + + /* set consecutive frame echo tag */ + so->cfecho = *(u32 *)cf->data; + + /* send frame with local echo enabled */ + can_send_ret = can_send(skb, 1); + if (can_send_ret) { + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(can_send_ret)); + if (can_send_ret == -ENOBUFS) + pr_notice_once("can-isotp: tx queue is full\n"); + } + dev_put(dev); +} + static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, int ae) { @@ -748,19 +806,74 @@ static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, so->tx.state = ISOTP_WAIT_FIRST_FC; } +static void isotp_rcv_echo(struct sk_buff *skb, void *data) +{ + struct sock *sk = (struct sock *)data; + struct isotp_sock *so = isotp_sk(sk); + struct canfd_frame *cf = (struct canfd_frame *)skb->data; + + /* only handle my own local echo skb's */ + if (skb->sk != sk || so->cfecho != *(u32 *)cf->data) + return; + + /* cancel local echo timeout */ + hrtimer_cancel(&so->txtimer); + + /* local echo skb with consecutive frame has been consumed */ + so->cfecho = 0; + + if (so->tx.idx >= so->tx.len) { + /* we are done */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + return; + } + + if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { + /* stop and wait for FC with timeout */ + so->tx.state = ISOTP_WAIT_FC; + hrtimer_start(&so->txtimer, ktime_set(1, 0), + HRTIMER_MODE_REL_SOFT); + return; + } + + /* no gap between data frames 
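From user space the larger MAX_MSG_LENGTH and the echo-paced transmission above are transparent: a PDU approaching 64 kbyte can simply be written to an ISO-TP socket and the consecutive frames are clocked out by the local echo skbs. Illustrative example only; the interface name and CAN IDs are placeholders:

#include <net/if.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/can.h>
#include <linux/can/isotp.h>

int demo_send_pdu(const void *buf, size_t len)	/* len may now approach 64k */
{
	struct sockaddr_can addr = { .can_family = AF_CAN };
	ssize_t ret;
	int s = socket(PF_CAN, SOCK_DGRAM, CAN_ISOTP);

	if (s < 0)
		return -1;

	addr.can_ifindex = if_nametoindex("can0");
	addr.can_addr.tp.tx_id = 0x711;
	addr.can_addr.tp.rx_id = 0x719;
	if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(s);
		return -1;
	}

	ret = write(s, buf, len);
	close(s);
	return ret == (ssize_t)len ? 0 : -1;
}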
needed => use burst mode */ + if (!so->tx_gap) { + isotp_send_cframe(so); + return; + } + + /* start timer to send next consecutive frame with correct delay */ + hrtimer_start(&so->txtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); +} + static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; - struct sk_buff *skb; - struct net_device *dev; - struct canfd_frame *cf; enum hrtimer_restart restart = HRTIMER_NORESTART; - int can_send_ret; - int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; switch (so->tx.state) { + case ISOTP_SENDING: + + /* cfecho should be consumed by isotp_rcv_echo() here */ + if (!so->cfecho) { + /* start timeout for unlikely lost echo skb */ + hrtimer_set_expires(&so->txtimer, + ktime_add(ktime_get(), + ktime_set(2, 0))); + restart = HRTIMER_RESTART; + + /* push out the next consecutive frame */ + isotp_send_cframe(so); + break; + } + + /* cfecho has not been cleared in isotp_rcv_echo() */ + pr_notice_once("can-isotp: cfecho %08X timeout\n", so->cfecho); + fallthrough; + case ISOTP_WAIT_FC: case ISOTP_WAIT_FIRST_FC: @@ -776,78 +889,6 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) wake_up_interruptible(&so->wait); break; - case ISOTP_SENDING: - - /* push out the next segmented pdu */ - dev = dev_get_by_index(sock_net(sk), so->ifindex); - if (!dev) - break; - -isotp_tx_burst: - skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), - GFP_ATOMIC); - if (!skb) { - dev_put(dev); - break; - } - - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; - - cf = (struct canfd_frame *)skb->data; - skb_put_zero(skb, so->ll.mtu); - - /* create consecutive frame */ - isotp_fill_dataframe(cf, so, ae, 0); - - /* place consecutive frame N_PCI in appropriate index */ - cf->data[ae] = N_PCI_CF | so->tx.sn++; - so->tx.sn %= 16; - so->tx.bs++; - - cf->flags = so->ll.tx_flags; - - skb->dev = dev; - can_skb_set_owner(skb, sk); - - can_send_ret = can_send(skb, 1); - if (can_send_ret) { - pr_notice_once("can-isotp: %s: can_send_ret %pe\n", - __func__, ERR_PTR(can_send_ret)); - if (can_send_ret == -ENOBUFS) - pr_notice_once("can-isotp: tx queue is full, increasing txqueuelen may prevent this error\n"); - } - if (so->tx.idx >= so->tx.len) { - /* we are done */ - so->tx.state = ISOTP_IDLE; - dev_put(dev); - wake_up_interruptible(&so->wait); - break; - } - - if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { - /* stop and wait for FC */ - so->tx.state = ISOTP_WAIT_FC; - dev_put(dev); - hrtimer_set_expires(&so->txtimer, - ktime_add(ktime_get(), - ktime_set(1, 0))); - restart = HRTIMER_RESTART; - break; - } - - /* no gap between data frames needed => use burst mode */ - if (!so->tx_gap) - goto isotp_tx_burst; - - /* start timer to send next data frame with correct delay */ - dev_put(dev); - hrtimer_set_expires(&so->txtimer, - ktime_add(ktime_get(), so->tx_gap)); - restart = HRTIMER_RESTART; - break; - default: WARN_ON_ONCE(1); } @@ -1075,6 +1116,9 @@ static int isotp_release(struct socket *sock) can_rx_unregister(net, dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); + can_rx_unregister(net, dev, so->txid, + SINGLE_MASK(so->txid), + isotp_rcv_echo, sk); dev_put(dev); synchronize_rcu(); } @@ -1161,11 +1205,20 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) ifindex = dev->ifindex; - if (do_rx_reg) + if (do_rx_reg) { can_rx_register(net, dev, addr->can_addr.tp.rx_id, 
SINGLE_MASK(addr->can_addr.tp.rx_id), isotp_rcv, sk, "isotp", sk); + /* no consecutive frame echo skb in flight */ + so->cfecho = 0; + + /* register for echo skb's */ + can_rx_register(net, dev, addr->can_addr.tp.tx_id, + SINGLE_MASK(addr->can_addr.tp.tx_id), + isotp_rcv_echo, sk, "isotpe", sk); + } + dev_put(dev); if (so->bound && do_rx_reg) { @@ -1176,6 +1229,9 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) can_rx_unregister(net, dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); + can_rx_unregister(net, dev, so->txid, + SINGLE_MASK(so->txid), + isotp_rcv_echo, sk); dev_put(dev); } } @@ -1238,6 +1294,14 @@ static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, /* no separate rx_ext_address is given => use ext_address */ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; + + /* check for frame_txtime changes (0 => no changes) */ + if (so->opt.frame_txtime) { + if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) + so->frame_txtime = 0; + else + so->frame_txtime = so->opt.frame_txtime; + } break; case CAN_ISOTP_RECV_FC: @@ -1381,10 +1445,14 @@ static void isotp_notify(struct isotp_sock *so, unsigned long msg, case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ - if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) + if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { can_rx_unregister(dev_net(dev), dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); + can_rx_unregister(dev_net(dev), dev, so->txid, + SINGLE_MASK(so->txid), + isotp_rcv_echo, sk); + } so->ifindex = 0; so->bound = 0; @@ -1439,6 +1507,7 @@ static int isotp_init(struct sock *sk) so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; + so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index a271688780a2..307ee1174a6e 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2006,7 +2006,7 @@ struct j1939_session *j1939_tp_send(struct j1939_priv *priv, /* set the end-packet for broadcast */ session->pkt.last = session->pkt.total; - skcb->tskey = session->sk->sk_tskey++; + skcb->tskey = atomic_inc_return(&session->sk->sk_tskey) - 1; session->tskey = skcb->tskey; return session; diff --git a/net/core/dev.c b/net/core/dev.c index 2c3b8744e00c..ba69ddf85af6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -216,18 +216,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock(struct softnet_data *sd) +static inline void rps_lock_irqsave(struct softnet_data *sd, + unsigned long *flags) { -#ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(*flags); } -static inline void rps_unlock(struct softnet_data *sd) +static inline void rps_lock_irq_disable(struct softnet_data *sd) { -#ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); 
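The rps_lock_*()/rps_unlock_*() helpers introduced around this point use IS_ENABLED() rather than #ifdef, so both branches stay visible to the compiler (and are always type-checked) while the dead one is removed by constant folding; on PREEMPT_RT without RPS no IRQ state is touched at all. Generic shape of the idiom, with placeholder names (CONFIG_FOO, foo_*_path):

	/* Both arms are parsed and type-checked; the unused one is dropped
	 * at compile time, unlike code hidden behind #ifdef CONFIG_FOO.
	 */
	if (IS_ENABLED(CONFIG_FOO))
		foo_enabled_path();
	else
		foo_disabled_path();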
+} + +static inline void rps_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +static inline void rps_unlock_irq_enable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, @@ -1602,7 +1622,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) - N(PRE_CHANGEADDR) + N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) + N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1919,6 +1940,32 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +/** + * call_netdevice_notifiers_info_robust - call per-netns notifier blocks + * for and rollback on error + * @val_up: value passed unmodified to notifier function + * @val_down: value passed unmodified to the notifier function when + * recovering from an error on @val_up + * @info: notifier information data + * + * Call all per-netns network notifier blocks, but not notifier blocks on + * the global notifier chain. Parameters and return value are as for + * raw_notifier_call_chain_robust(). + */ + +static int +call_netdevice_notifiers_info_robust(unsigned long val_up, + unsigned long val_down, + struct netdev_notifier_info *info) +{ + struct net *net = dev_net(info->dev); + + ASSERT_RTNL(); + + return raw_notifier_call_chain_robust(&net->netdev_chain, + val_up, val_down, info); +} + static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) @@ -2000,7 +2047,8 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif -static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; @@ -2061,14 +2109,15 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; + skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) - __net_timestamp(skb); + skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ - __net_timestamp(SKB); \ + (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) @@ -3710,7 +3759,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, no_lock_out: if (unlikely(to_free)) - kfree_skb_list(to_free); + kfree_skb_list_reason(to_free, + SKB_DROP_REASON_QDISC_DROP); return rc; } @@ -3765,7 +3815,7 @@ no_lock_out: } spin_unlock(root_lock); if (unlikely(to_free)) - kfree_skb_list(to_free); + kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; @@ -3811,7 +3861,7 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) skb->ip_summed = CHECKSUM_UNNECESSARY; 
WARN_ON(!skb_dst(skb)); skb_dst_force(skb); - netif_rx_ni(skb); + netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); @@ -3840,7 +3890,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); *ret = NET_XMIT_DROP; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: @@ -4456,11 +4506,11 @@ static void rps_trigger_softirq(void *data) * If yes, queue it to our IPI list and return 1 * If no, return 0 */ -static int rps_ipi_queued(struct softnet_data *sd) +static int napi_schedule_rps(struct softnet_data *sd) { -#ifdef CONFIG_RPS struct softnet_data *mysd = this_cpu_ptr(&softnet_data); +#ifdef CONFIG_RPS if (sd != mysd) { sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; @@ -4469,6 +4519,7 @@ static int rps_ipi_queued(struct softnet_data *sd) return 1; } #endif /* CONFIG_RPS */ + __napi_schedule_irqoff(&mysd->backlog); return 0; } @@ -4519,15 +4570,15 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { + enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; + reason = SKB_DROP_REASON_NOT_SPECIFIED; sd = &per_cpu(softnet_data, cpu); - local_irq_save(flags); - - rps_lock(sd); + rps_lock_irqsave(sd, &flags); if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); @@ -4536,29 +4587,25 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); - rps_unlock(sd); - local_irq_restore(flags); + rps_unlock_irq_restore(sd, &flags); return NET_RX_SUCCESS; } /* Schedule NAPI for backlog device * We can use non atomic operation since we own the queue lock */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { - if (!rps_ipi_queued(sd)) - ____napi_schedule(sd, &sd->backlog); - } + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + napi_schedule_rps(sd); goto enqueue; } + reason = SKB_DROP_REASON_CPU_BACKLOG; drop: sd->dropped++; - rps_unlock(sd); - - local_irq_restore(flags); + rps_unlock_irq_restore(sd, &flags); atomic_long_inc(&skb->dev->rx_dropped); - kfree_skb(skb); + kfree_skb_reason(skb, reason); return NET_RX_DROP; } @@ -4778,7 +4825,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); @@ -4796,7 +4843,6 @@ static int netif_rx_internal(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -4806,78 +4852,72 @@ static int netif_rx_internal(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** + * __netif_rx - Slightly optimized version of netif_rx + * @skb: buffer to post + * + * This behaves as netif_rx except that it does not disable bottom halves. + * As a result this function may only be invoked from the interrupt context + * (either hard or soft interrupt). 
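The split introduced below is: __netif_rx() may only be called from hard or soft interrupt context, while plain netif_rx() works from any context and disables bottom halves itself when needed. A hypothetical driver sketch (demo_* names and demo_build_rx_skb() are made up):

static irqreturn_t demo_isr(int irq, void *dev_id)
{
	struct sk_buff *skb = demo_build_rx_skb(dev_id);	/* hypothetical */

	if (skb)
		__netif_rx(skb);	/* hard IRQ context, no BH toggling needed */
	return IRQ_HANDLED;
}

static void demo_inject_from_process_context(struct sk_buff *skb)
{
	netif_rx(skb);	/* netif_rx() now handles local_bh_disable() itself */
}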
+ */ +int __netif_rx(struct sk_buff *skb) +{ + int ret; + + lockdep_assert_once(hardirq_count() | softirq_count()); + + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + return ret; +} +EXPORT_SYMBOL(__netif_rx); + +/** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. + * the upper (protocol) levels to process via the backlog NAPI device. It + * always succeeds. The buffer may be dropped during processing for + * congestion control or by the protocol layers. + * The network buffer is passed via the backlog NAPI device. Modern NIC + * driver should use NAPI and GRO. + * This function can used from interrupt and from process context. The + * caller from process context must not disable interrupts before invoking + * this function. * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ - int netif_rx(struct sk_buff *skb) { + bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; + if (need_bh_off) + local_bh_disable(); trace_netif_rx_entry(skb); - ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); - + if (need_bh_off) + local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); -int netif_rx_ni(struct sk_buff *skb) -{ - int err; - - trace_netif_rx_ni_entry(skb); - - preempt_disable(); - err = netif_rx_internal(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); - trace_netif_rx_ni_exit(err); - - return err; -} -EXPORT_SYMBOL(netif_rx_ni); - -int netif_rx_any_context(struct sk_buff *skb) -{ - /* - * If invoked from contexts which do not invoke bottom half - * processing either at return from interrupt or when softrqs are - * reenabled, use netif_rx_ni() which invokes bottomhalf processing - * directly. - */ - if (in_interrupt()) - return netif_rx(skb); - else - return netif_rx_ni(skb); -} -EXPORT_SYMBOL(netif_rx_any_context); - static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5001,7 +5041,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, break; case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: @@ -5318,11 +5358,13 @@ check_vlan_id: *ppt_prev = pt_prev; } else { drop: - if (!deliver_exact) + if (!deliver_exact) { atomic_long_inc(&skb->dev->rx_dropped); - else + kfree_skb_reason(skb, SKB_DROP_REASON_PTYPE_ABSENT); + } else { atomic_long_inc(&skb->dev->rx_nohandler); - kfree_skb(skb); + kfree_skb(skb); + } /* Jamal, now you will not able to escape explaining * me how you were going to use this. 
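Several drop sites in these hunks switch from kfree_skb() to kfree_skb_reason() so the kfree_skb tracepoint (and tools such as dropwatch or perf) can attribute the drop; a bare kfree_skb() effectively reports SKB_DROP_REASON_NOT_SPECIFIED. Illustrative use with an existing reason code; the surrounding check is invented for the example:

	if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr)))) {
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_SMALL);
		return NET_RX_DROP;
	}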
:-) */ @@ -5650,8 +5692,7 @@ static void flush_backlog(struct work_struct *work) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); @@ -5659,8 +5700,7 @@ static void flush_backlog(struct work_struct *work) input_queue_head_incr(sd); } } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { @@ -5678,16 +5718,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); return do_flush; #endif @@ -5802,8 +5840,7 @@ static int process_backlog(struct napi_struct *napi, int quota) } - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @@ -5819,8 +5856,7 @@ static int process_backlog(struct napi_struct *napi, int quota) skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); } return work; @@ -7727,6 +7763,242 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +static int netdev_offload_xstats_enable_l3(struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + int err; + int rc; + + dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), + GFP_KERNEL); + if (!dev->offload_xstats_l3) + return -ENOMEM; + + rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + err = notifier_to_errno(rc); + if (err) + goto free_stats; + + return 0; + +free_stats: + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; + return err; +} + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return netdev_offload_xstats_enable_l3(dev, extack); + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_enable); + +static void netdev_offload_xstats_disable_l3(struct net_device *dev) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + + call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; +} + +int netdev_offload_xstats_disable(struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + if (!netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + netdev_offload_xstats_disable_l3(dev); + return 0; + } + + WARN_ON(1); + return 
-EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_disable); + +static void netdev_offload_xstats_disable_all(struct net_device *dev) +{ + netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); +} + +static struct rtnl_hw_stats64 * +netdev_offload_xstats_get_ptr(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return dev->offload_xstats_l3; + } + + WARN_ON(1); + return NULL; +} + +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + return netdev_offload_xstats_get_ptr(dev, type); +} +EXPORT_SYMBOL(netdev_offload_xstats_enabled); + +struct netdev_notifier_offload_xstats_ru { + bool used; +}; + +struct netdev_notifier_offload_xstats_rd { + struct rtnl_hw_stats64 stats; + bool used; +}; + +static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, + const struct rtnl_hw_stats64 *src) +{ + dest->rx_packets += src->rx_packets; + dest->tx_packets += src->tx_packets; + dest->rx_bytes += src->rx_bytes; + dest->tx_bytes += src->tx_bytes; + dest->rx_errors += src->rx_errors; + dest->tx_errors += src->tx_errors; + dest->rx_dropped += src->rx_dropped; + dest->tx_dropped += src->tx_dropped; + dest->multicast += src->multicast; +} + +static int netdev_offload_xstats_get_used(struct net_device *dev, + enum netdev_offload_xstats_type type, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_ru report_used = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_used = &report_used, + }; + int rc; + + WARN_ON(!netdev_offload_xstats_enabled(dev, type)); + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, + &info.info); + *p_used = report_used.used; + return notifier_to_errno(rc); +} + +static int netdev_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_rd report_delta = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_delta = &report_delta, + }; + struct rtnl_hw_stats64 *stats; + int rc; + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return -EINVAL; + + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, + &info.info); + + /* Cache whatever we got, even if there was an error, otherwise the + * successful stats retrievals would get lost. 
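The netdev_offload_xstats_*() core added here only aggregates what drivers feed back through the two new report notifiers. A driver-side sketch of a handler; the demo_* names are placeholders, demo_read_and_clear_hw_counters() is hypothetical, and a real driver would first check that info->info.dev is one of its own netdevs:

static int demo_netdevice_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct netdev_notifier_offload_xstats_info *info = ptr;
	struct rtnl_hw_stats64 stats = {};

	switch (event) {
	case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
		if (info->type == NETDEV_OFFLOAD_XSTATS_TYPE_L3)
			netdev_offload_xstats_report_used(info->report_used);
		break;
	case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
		if (info->type != NETDEV_OFFLOAD_XSTATS_TYPE_L3)
			break;
		demo_read_and_clear_hw_counters(&stats);	/* hypothetical */
		netdev_offload_xstats_report_delta(info->report_delta, &stats);
		break;
	}
	return NOTIFY_OK;
}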
+ */ + netdev_hw_stats64_add(stats, &report_delta.stats); + + if (p_stats) + *p_stats = *stats; + *p_used = report_delta.used; + + return notifier_to_errno(rc); +} + +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, bool *p_used, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (p_stats) + return netdev_offload_xstats_get_stats(dev, type, p_stats, + p_used, extack); + else + return netdev_offload_xstats_get_used(dev, type, p_used, + extack); +} +EXPORT_SYMBOL(netdev_offload_xstats_get); + +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, + const struct rtnl_hw_stats64 *stats) +{ + report_delta->used = true; + netdev_hw_stats64_add(&report_delta->stats, stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_report_delta); + +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) +{ + report_used->used = true; +} +EXPORT_SYMBOL(netdev_offload_xstats_report_used); + +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *p_stats) +{ + struct rtnl_hw_stats64 *stats; + + ASSERT_RTNL(); + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return; + + netdev_hw_stats64_add(stats, p_stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_push_delta); + /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device @@ -9815,8 +10087,8 @@ int netdev_unregister_timeout_secs __read_mostly = 10; #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** - * netdev_wait_allrefs - wait until all references are gone. - * @dev: target net_device + * netdev_wait_allrefs_any - wait until all references are gone. + * @list: list of net_devices to wait on * * This is called when unregistering network devices. * @@ -9826,37 +10098,42 @@ int netdev_unregister_timeout_secs __read_mostly = 10; * We can get stuck here if buggy protocols don't correctly * call dev_put. */ -static void netdev_wait_allrefs(struct net_device *dev) +static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; - int wait = 0, refcnt; - - linkwatch_forget_dev(dev); + struct net_device *dev; + int wait = 0; rebroadcast_time = warning_time = jiffies; - refcnt = netdev_refcnt_read(dev); - while (refcnt != 1) { + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + list_for_each_entry(dev, list, todo_list) + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); - if (test_bit(__LINK_STATE_LINKWATCH_PENDING, - &dev->state)) { - /* We must not have linkwatch events - * pending on unregister. If this - * happens, we simply run the queue - * unscheduled, resulting in a noop - * for this device. - */ - linkwatch_run_queue(); - } + list_for_each_entry(dev, list, todo_list) + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. 
+ */ + linkwatch_run_queue(); + break; + } __rtnl_unlock(); @@ -9871,14 +10148,18 @@ static void netdev_wait_allrefs(struct net_device *dev) wait = min(wait << 1, WAIT_REFS_MAX_MSECS); } - refcnt = netdev_refcnt_read(dev); + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; - if (refcnt != 1 && - time_after(jiffies, warning_time + + if (time_after(jiffies, warning_time + netdev_unregister_timeout_secs * HZ)) { - pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", - dev->name, refcnt); - ref_tracker_dir_print(&dev->refcnt_tracker, 10); + list_for_each_entry(dev, list, todo_list) { + pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", + dev->name, netdev_refcnt_read(dev)); + ref_tracker_dir_print(&dev->refcnt_tracker, 10); + } + warning_time = jiffies; } } @@ -9910,6 +10191,7 @@ static void netdev_wait_allrefs(struct net_device *dev) */ void netdev_run_todo(void) { + struct net_device *dev, *tmp; struct list_head list; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; @@ -9930,26 +10212,24 @@ void netdev_run_todo(void) __rtnl_unlock(); - /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); - while (!list_empty(&list)) { - struct net_device *dev - = list_first_entry(&list, struct net_device, todo_list); - list_del(&dev->todo_list); - + list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { - pr_err("network todo '%s' but state %d\n", - dev->name, dev->reg_state); - dump_stack(); + netdev_WARN(dev, "run_todo but not unregistering\n"); + list_del(&dev->todo_list); continue; } dev->reg_state = NETREG_UNREGISTERED; + linkwatch_forget_dev(dev); + } - netdev_wait_allrefs(dev); + while (!list_empty(&list)) { + dev = netdev_wait_allrefs_any(&list); + list_del(&dev->todo_list); /* paranoia */ BUG_ON(netdev_refcnt_read(dev) != 1); @@ -10408,6 +10688,8 @@ void unregister_netdevice_many(struct list_head *head) dev_xdp_uninstall(dev); + netdev_offload_xstats_disable_all(dev); + /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ @@ -10673,11 +10955,11 @@ static int dev_cpu_dead(unsigned int oldcpu) /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { - netif_rx_ni(skb); + netif_rx(skb); input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { - netif_rx_ni(skb); + netif_rx(skb); input_queue_head_incr(oldsd); } @@ -10880,36 +11162,6 @@ static void __net_exit default_device_exit_net(struct net *net) } } -static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) -{ - /* Return (with the rtnl_lock held) when there are no network - * devices unregistering in any network namespace in net_list. 
- */ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - bool unregistering; - struct net *net; - - ASSERT_RTNL(); - add_wait_queue(&netdev_unregistering_wq, &wait); - for (;;) { - unregistering = false; - - list_for_each_entry(net, net_list, exit_list) { - if (atomic_read(&net->dev_unreg_count) > 0) { - unregistering = true; - break; - } - } - if (!unregistering) - break; - __rtnl_unlock(); - - wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); - rtnl_lock(); - } - remove_wait_queue(&netdev_unregistering_wq, &wait); -} - static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network @@ -10926,18 +11178,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) default_device_exit_net(net); cond_resched(); } - /* To prevent network device cleanup code from dereferencing - * loopback devices or network devices that have been freed - * wait here for all pending unregistrations to complete, - * before unregistring the loopback device and allowing the - * network namespace be freed. - * - * The netdev todo list containing all network devices - * unregistrations that happen in default_device_exit_batch - * will run in the rtnl_unlock() at the end of - * default_device_exit_batch. - */ - rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 4641126b8a20..b89e3e95bffc 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -64,7 +64,6 @@ static const char * const drop_reasons[] = { /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. - * It also guards the global 'hw_stats_list' list. 
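The drop_monitor changes that follow stop walking a global, mutex-protected hw_stats_list from the napi_poll tracepoint and instead hang the per-device state off dev->dm_private behind RCU. A condensed view of that lifecycle (field names as in the hunks below, error handling and surrounding code omitted):

	/* NETDEV_REGISTER (RTNL held): publish */
	stat = kzalloc(sizeof(*stat), GFP_KERNEL);
	rcu_assign_pointer(dev->dm_private, stat);

	/* napi_poll tracepoint (softirq): lock-free read */
	rcu_read_lock();
	stat = rcu_dereference(dev->dm_private);
	if (stat)
		/* update stat->last_rx / stat->last_drop_val */;
	rcu_read_unlock();

	/* NETDEV_UNREGISTER (RTNL held): unpublish, free after a grace period */
	stat = rtnl_dereference(dev->dm_private);
	rcu_assign_pointer(dev->dm_private, NULL);
	kfree_rcu(stat, rcu);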
*/ static DEFINE_MUTEX(net_dm_mutex); @@ -100,11 +99,9 @@ struct per_cpu_dm_data { }; struct dm_hw_stat_delta { - struct net_device *dev; unsigned long last_rx; - struct list_head list; - struct rcu_head rcu; unsigned long last_drop_val; + struct rcu_head rcu; }; static struct genl_family net_drop_monitor_family; @@ -115,7 +112,6 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; -static LIST_HEAD(hw_stats_list); static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; @@ -287,29 +283,27 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { - struct dm_hw_stat_delta *new_stat; - + struct net_device *dev = napi->dev; + struct dm_hw_stat_delta *stat; /* * Don't check napi structures with no associated device */ - if (!napi->dev) + if (!dev) return; rcu_read_lock(); - list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + stat = rcu_dereference(dev->dm_private); + if (stat) { /* * only add a note to our monitor buffer if: - * 1) this is the dev we received on - * 2) its after the last_rx delta - * 3) our rx_dropped count has gone up + * 1) its after the last_rx delta + * 2) our rx_dropped count has gone up */ - if ((new_stat->dev == napi->dev) && - (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && - (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && + (dev->stats.rx_dropped != stat->last_drop_val)) { trace_drop_common(NULL, NULL); - new_stat->last_drop_val = napi->dev->stats.rx_dropped; - new_stat->last_rx = jiffies; - break; + stat->last_drop_val = dev->stats.rx_dropped; + stat->last_rx = jiffies; } } rcu_read_unlock(); @@ -1194,7 +1188,6 @@ err_module_put: static void net_dm_trace_off_set(void) { - struct dm_hw_stat_delta *new_stat, *temp; const struct net_dm_alert_ops *ops; int cpu; @@ -1218,13 +1211,6 @@ static void net_dm_trace_off_set(void) consume_skb(skb); } - list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { - if (new_stat->dev == NULL) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - } - } - module_put(THIS_MODULE); } @@ -1585,38 +1571,28 @@ static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dm_hw_stat_delta *new_stat = NULL; - struct dm_hw_stat_delta *tmp; + struct dm_hw_stat_delta *stat; switch (event) { case NETDEV_REGISTER: - new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) + break; + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) + break; - if (!new_stat) - goto out; + stat->last_rx = jiffies; + rcu_assign_pointer(dev->dm_private, stat); - new_stat->dev = dev; - new_stat->last_rx = jiffies; - mutex_lock(&net_dm_mutex); - list_add_rcu(&new_stat->list, &hw_stats_list); - mutex_unlock(&net_dm_mutex); break; case NETDEV_UNREGISTER: - mutex_lock(&net_dm_mutex); - list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { - if (new_stat->dev == dev) { - new_stat->dev = NULL; - if (trace_state == TRACE_OFF) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - break; - } - } + stat = rtnl_dereference(dev->dm_private); + if (stat) { + rcu_assign_pointer(dev->dm_private, NULL); + 
kfree_rcu(stat, rcu); } - mutex_unlock(&net_dm_mutex); break; } -out: return NOTIFY_DONE; } diff --git a/net/core/filter.c b/net/core/filter.c index 818244068c2d..88767f7da150 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2107,7 +2107,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); @@ -2176,7 +2176,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); @@ -2274,7 +2274,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); @@ -2710,6 +2710,9 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, if (unlikely(flags)) return -EINVAL; + if (unlikely(len == 0)) + return 0; + /* First find the starting scatterlist element */ i = msg->sg.start; do { @@ -7385,6 +7388,43 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb, + u64, dtime, u32, dtime_type) +{ + /* skb_clear_delivery_time() is done for inet protocol */ + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EOPNOTSUPP; + + switch (dtime_type) { + case BPF_SKB_DELIVERY_TIME_MONO: + if (!dtime) + return -EINVAL; + skb->tstamp = dtime; + skb->mono_delivery_time = 1; + break; + case BPF_SKB_DELIVERY_TIME_NONE: + if (dtime) + return -EINVAL; + skb->tstamp = 0; + skb->mono_delivery_time = 0; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = { + .func = bpf_skb_set_delivery_time, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -7746,6 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; + case BPF_FUNC_skb_set_delivery_time: + return &bpf_skb_set_delivery_time_proto; #endif default: return bpf_sk_base_func_proto(func_id); @@ -8085,7 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; - case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1: + case offsetof(struct __sk_buff, delivery_time_type): + return false; + case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: @@ -8440,6 +8484,15 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; + case offsetof(struct __sk_buff, delivery_time_type): + /* The convert_ctx_access() on reading and writing + * __sk_buff->tstamp depends on whether the bpf prog + * has used __sk_buff->delivery_time_type or not. + * Thus, we need to set prog->delivery_time_access + * earlier during is_valid_access() here. 
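On the BPF side, the new __sk_buff->delivery_time_type field and the bpf_skb_set_delivery_time() helper target tc programs on the egress/forwarding path. A minimal illustrative program, assuming it is built against headers from this tree; the section name and the 1 ms offset are arbitrary, and the helper rejects non-IP packets as shown above:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int demo_set_edt(struct __sk_buff *skb)
{
	/* if the tstamp is not already a mono delivery time, stamp an
	 * earliest-departure time of "now + 1ms"
	 */
	if (skb->delivery_time_type != BPF_SKB_DELIVERY_TIME_MONO)
		bpf_skb_set_delivery_time(skb, bpf_ktime_get_ns() + 1000000,
					  BPF_SKB_DELIVERY_TIME_MONO);
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";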
+ */ + ((struct bpf_prog *)prog)->delivery_time_access = 1; + return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); @@ -8835,6 +8888,45 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* value_reg = BPF_SKB_DELIVERY_TIME_MONO */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO); + *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5); + + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + *insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2); + /* value_reg = BPF_SKB_DELIVERY_TIME_NONE */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE); + *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1); + +#ifdef CONFIG_NET_CLS_ACT + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* At ingress, value_reg = 0 */ + *insn++ = BPF_MOV32_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); +#endif + + /* value_reg = BPF_SKB_DELIVERYT_TIME_UNSPEC */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC); + + /* 15 insns with CONFIG_NET_CLS_ACT */ + return insn; +} + static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, struct bpf_insn *insn) { @@ -8856,6 +8948,71 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, return insn; } +static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + +#ifdef CONFIG_NET_CLS_ACT + if (!prog->delivery_time_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5); + /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp, + * so check the skb->mono_delivery_time. + */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */ + *insn++ = BPF_MOV64_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); + } +#endif + + *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + +static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->src_reg; + __u8 skb_reg = si->dst_reg; + +#ifdef CONFIG_NET_CLS_ACT + if (!prog->delivery_time_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3); + /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp. + * Clear the skb->mono_delivery_time. 
+ */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + ~SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + } +#endif + + /* skb->tstamp = tstamp */ + *insn++ = BPF_STX_MEM(BPF_DW, skb_reg, value_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -9164,17 +9321,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); + insn = bpf_convert_tstamp_write(prog, si, insn); else - *insn++ = BPF_LDX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); + insn = bpf_convert_tstamp_read(prog, si, insn); + break; + + case offsetof(struct __sk_buff, delivery_time_type): + insn = bpf_convert_dtime_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 15833e1d6ea1..03b6e649c428 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -22,6 +22,7 @@ #include <linux/ppp_defs.h> #include <linux/stddef.h> #include <linux/if_ether.h> +#include <linux/if_hsr.h> #include <linux/mpls.h> #include <linux/tcp.h> #include <linux/ptp_classify.h> @@ -1282,6 +1283,23 @@ proto_again: break; } + case htons(ETH_P_PRP): + case htons(ETH_P_HSR): { + struct hsr_tag *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + proto = hdr->encap_proto; + nhoff += HSR_HLEN; + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; diff --git a/net/core/gro.c b/net/core/gro.c index ee5e7e889d8b..78110edf5d4b 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -93,6 +93,31 @@ void dev_remove_offload(struct packet_offload *po) EXPORT_SYMBOL(dev_remove_offload); /** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + +/** * skb_mac_gso_segment - mac layer segmentation handler. 
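skb_eth_gso_segment(), exported in the gro.c hunk below, factors out the "find the packet_offload for this ethertype and segment" loop so that callers outside gro.c (tunnel- or MPLS-style offloads) can reuse it. A hypothetical caller; the demo_* names and demo_pull_encap_header() are placeholders:

static struct sk_buff *demo_encap_gso_segment(struct sk_buff *skb,
					      netdev_features_t features)
{
	__be16 inner_proto = demo_pull_encap_header(skb);	/* hypothetical */

	if (!inner_proto)
		return ERR_PTR(-EINVAL);

	return skb_eth_gso_segment(skb, features, inner_proto);
}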
* @skb: buffer to segment * @features: features for the output path (see dev->features) diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 6eb2e5ec2c50..8462f926ab45 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -89,8 +89,23 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) } EXPORT_SYMBOL(gro_cells_init); +struct percpu_free_defer { + struct rcu_head rcu; + void __percpu *ptr; +}; + +static void percpu_free_defer_callback(struct rcu_head *head) +{ + struct percpu_free_defer *defer; + + defer = container_of(head, struct percpu_free_defer, rcu); + free_percpu(defer->ptr); + kfree(defer); +} + void gro_cells_destroy(struct gro_cells *gcells) { + struct percpu_free_defer *defer; int i; if (!gcells->cells) @@ -102,12 +117,23 @@ void gro_cells_destroy(struct gro_cells *gcells) __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } - /* This barrier is needed because netpoll could access dev->napi_list - * under rcu protection. + /* We need to observe an rcu grace period before freeing ->cells, + * because netpoll could access dev->napi_list under rcu protection. + * Try hard using call_rcu() instead of synchronize_rcu(), + * because we might be called from cleanup_net(), and we + * definitely do not want to block this critical task. */ - synchronize_net(); - - free_percpu(gcells->cells); + defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); + if (likely(defer)) { + defer->ptr = gcells->cells; + call_rcu(&defer->rcu, percpu_free_defer_callback); + } else { + /* We do not hold RTNL at this point, synchronize_net() + * would not be able to expedite this sync. + */ + synchronize_rcu_expedited(); + free_percpu(gcells->cells); + } gcells->cells = NULL; } EXPORT_SYMBOL(gro_cells_destroy); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ec0bf737b076..f64ebd050f6c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1171,7 +1171,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, neigh->updated = jiffies; write_unlock_bh(&neigh->lock); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { @@ -1193,7 +1193,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; - kfree_skb(buff); + kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); @@ -1215,7 +1215,7 @@ out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 53ea262ecafd..fbddf966206b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -213,7 +213,7 @@ static ssize_t speed_show(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev)) { + if (netif_running(netdev) && netif_device_present(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e25d359d84d9..1943c0f0307d 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -26,6 +26,45 @@ #define BIAS_MAX LONG_MAX +#ifdef CONFIG_PAGE_POOL_STATS +/* alloc_stat_inc is intended to be used in softirq context */ +#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) 
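With CONFIG_PAGE_POOL_STATS enabled, a driver can fold the per-pool counters maintained below into its ethtool statistics via page_pool_get_stats(). A hedged driver-side sketch; struct demo_priv and the particular counter selection are illustrative only:

#ifdef CONFIG_PAGE_POOL_STATS
static u64 *demo_fill_page_pool_stats(struct demo_priv *priv, u64 *data)
{
	struct page_pool_stats stats = {};

	if (!page_pool_get_stats(priv->page_pool, &stats))
		return data;

	*data++ = stats.alloc_stats.fast;
	*data++ = stats.alloc_stats.slow;
	*data++ = stats.alloc_stats.empty;
	*data++ = stats.recycle_stats.cached;
	*data++ = stats.recycle_stats.ring_full;
	return data;
}
#endif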
+/* recycle_stat_inc is safe to use when preemption is possible. */ +#define recycle_stat_inc(pool, __stat) \ + do { \ + struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ + this_cpu_inc(s->__stat); \ + } while (0) + +bool page_pool_get_stats(struct page_pool *pool, + struct page_pool_stats *stats) +{ + int cpu = 0; + + if (!stats) + return false; + + memcpy(&stats->alloc_stats, &pool->alloc_stats, sizeof(pool->alloc_stats)); + + for_each_possible_cpu(cpu) { + const struct page_pool_recycle_stats *pcpu = + per_cpu_ptr(pool->recycle_stats, cpu); + + stats->recycle_stats.cached += pcpu->cached; + stats->recycle_stats.cache_full += pcpu->cache_full; + stats->recycle_stats.ring += pcpu->ring; + stats->recycle_stats.ring_full += pcpu->ring_full; + stats->recycle_stats.released_refcnt += pcpu->released_refcnt; + } + + return true; +} +EXPORT_SYMBOL(page_pool_get_stats); +#else +#define alloc_stat_inc(pool, __stat) +#define recycle_stat_inc(pool, __stat) +#endif + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -73,6 +112,12 @@ static int page_pool_init(struct page_pool *pool, pool->p.flags & PP_FLAG_PAGE_FRAG) return -EINVAL; +#ifdef CONFIG_PAGE_POOL_STATS + pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); + if (!pool->recycle_stats) + return -ENOMEM; +#endif + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -117,8 +162,10 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ - if (__ptr_ring_empty(r)) + if (__ptr_ring_empty(r)) { + alloc_stat_inc(pool, empty); return NULL; + } /* Softirq guarantee CPU and thus NUMA node is stable. This, * assumes CPU refilling driver RX-ring will also run RX-NAPI. @@ -145,14 +192,17 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) * This limit stress on page buddy alloactor. */ page_pool_return_page(pool, page); + alloc_stat_inc(pool, waive); page = NULL; break; } } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ - if (likely(pool->alloc.count > 0)) + if (likely(pool->alloc.count > 0)) { page = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, refill); + } return page; } @@ -166,6 +216,7 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) if (likely(pool->alloc.count)) { /* Fast-path */ page = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, fast); } else { page = page_pool_refill_alloc_cache(pool); } @@ -239,6 +290,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; } + alloc_stat_inc(pool, slow_high_order); page_pool_set_pp_info(pool, page); /* Track how many pages are held 'in-flight' */ @@ -293,10 +345,12 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, } /* Return last page */ - if (likely(pool->alloc.count > 0)) + if (likely(pool->alloc.count > 0)) { page = pool->alloc.cache[--pool->alloc.count]; - else + alloc_stat_inc(pool, slow); + } else { page = NULL; + } /* When page just alloc'ed is should/must have refcnt 1. */ return page; @@ -394,7 +448,12 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) else ret = ptr_ring_produce_bh(&pool->ring, page); - return (ret == 0) ? 
true : false; + if (!ret) { + recycle_stat_inc(pool, ring); + return true; + } + + return false; } /* Only allow direct recycling in special circumstances, into the @@ -405,11 +464,14 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) static bool page_pool_recycle_in_cache(struct page *page, struct page_pool *pool) { - if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { + recycle_stat_inc(pool, cache_full); return false; + } /* Caller MUST have verified/know (page_ref_count(page) == 1) */ pool->alloc.cache[pool->alloc.count++] = page; + recycle_stat_inc(pool, cached); return true; } @@ -459,6 +521,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * doing refcnt based recycle tricks, meaning another process * will be invoking put_page. */ + recycle_stat_inc(pool, released_refcnt); /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); @@ -472,6 +535,7 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { /* Cache full, fallback to free pages */ + recycle_stat_inc(pool, ring_full); page_pool_return_page(pool, page); } } @@ -620,6 +684,9 @@ static void page_pool_free(struct page_pool *pool) if (pool->p.flags & PP_FLAG_DMA_MAP) put_device(pool->p.dev); +#ifdef CONFIG_PAGE_POOL_STATS + free_percpu(pool->recycle_stats); +#endif kfree(pool); } diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index dd4cf01d1e0a..598041b0499e 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -137,6 +137,18 @@ struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type) } EXPORT_SYMBOL_GPL(ptp_parse_header); +bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type) +{ + struct ptp_header *hdr; + + hdr = ptp_parse_header(skb, type); + if (!hdr) + return false; + + return ptp_get_msgtype(hdr, type) == PTP_MSGTYPE_SYNC; +} +EXPORT_SYMBOL_GPL(ptp_msg_is_sync); + void __init ptp_classifier_init(void) { static struct sock_filter ptp_filter[] __initdata = { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a6fad3df42a8..159c9c61e6af 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1699,6 +1699,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; + struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1716,6 +1717,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; + qdisc = rtnl_dereference(dev->qdisc); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, @@ -1735,8 +1737,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (dev->qdisc && - nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || + (qdisc && + nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + @@ -3650,13 +3652,24 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, bool *changed, struct netlink_ext_ack *extack) { char *alt_ifname; + size_t size; int err; 
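ptp_msg_is_sync(), added in ptp_classifier.c below, gives drivers whose hardware can only timestamp PTP SYNC messages a common way to filter in the transmit path. A hypothetical use; the demo_ prefix is a placeholder:

#include <linux/ptp_classify.h>

static bool demo_skb_wants_hwtstamp(struct sk_buff *skb)
{
	unsigned int type = ptp_classify_raw(skb);

	if (type == PTP_CLASS_NONE)
		return false;

	/* only SYNC messages get a hardware timestamp on this device */
	return ptp_msg_is_sync(skb, type);
}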
err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); if (err) return err; - alt_ifname = nla_strdup(attr, GFP_KERNEL); + if (cmd == RTM_NEWLINKPROP) { + size = rtnl_prop_list_size(dev); + size += nla_total_size(ALTIFNAMSIZ); + if (size >= U16_MAX) { + NL_SET_ERR_MSG(extack, + "effective property list too long"); + return -EINVAL; + } + } + + alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; @@ -5046,82 +5059,256 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) (!idxattr || idxattr == attrid); } -#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) -static int rtnl_get_offload_stats_attr_size(int attr_id) +static bool +rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id) { - switch (attr_id) { - case IFLA_OFFLOAD_XSTATS_CPU_HIT: - return sizeof(struct rtnl_link_stats64); - } + return dev->netdev_ops && + dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats && + dev->netdev_ops->ndo_has_offload_stats(dev, attr_id); +} - return 0; +static unsigned int +rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id) +{ + return rtnl_offload_xstats_have_ndo(dev, attr_id) ? + sizeof(struct rtnl_link_stats64) : 0; } -static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, - int *prividx) +static int +rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, + struct sk_buff *skb) { + unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id); struct nlattr *attr = NULL; - int attr_id, size; void *attr_data; int err; - if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && - dev->netdev_ops->ndo_get_offload_stats)) + if (!size) return -ENODATA; - for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; - attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { - if (attr_id < *prividx) - continue; + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + return -EMSGSIZE; - size = rtnl_get_offload_stats_attr_size(attr_id); - if (!size) - continue; + attr_data = nla_data(attr); + memset(attr_data, 0, size); - if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) - continue; + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); + if (err) + return err; + + return 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_stats(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + bool enabled = netdev_offload_xstats_enabled(dev, type); + + return enabled ? 
sizeof(struct rtnl_hw_stats64) : 0; +} + +struct rtnl_offload_xstats_request_used { + bool request; + bool used; +}; + +static int +rtnl_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_offload_xstats_request_used *ru, + struct rtnl_hw_stats64 *stats, + struct netlink_ext_ack *extack) +{ + bool request; + bool used; + int err; - attr = nla_reserve_64bit(skb, attr_id, size, + request = netdev_offload_xstats_enabled(dev, type); + if (!request) { + used = false; + goto out; + } + + err = netdev_offload_xstats_get(dev, type, stats, &used, extack); + if (err) + return err; + +out: + if (ru) { + ru->request = request; + ru->used = used; + } + return 0; +} + +static int +rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id, + struct rtnl_offload_xstats_request_used *ru) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, attr_id); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int +rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + struct rtnl_offload_xstats_request_used ru_l3; + struct nlattr *nest; + int err; + + err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack); + if (err) + return err; + + nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO); + if (!nest) + return -EMSGSIZE; + + if (rtnl_offload_xstats_fill_hw_s_info_one(skb, + IFLA_OFFLOAD_XSTATS_L3_STATS, + &ru_l3)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, + int *prividx, u32 off_filter_mask, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO; + int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS; + int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; + bool have_data = false; + int err; + + if (*prividx <= attr_id_cpu_hit && + (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) { + err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb); + if (!err) { + have_data = true; + } else if (err != -ENODATA) { + *prividx = attr_id_cpu_hit; + return err; + } + } + + if (*prividx <= attr_id_hw_s_info && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) { + *prividx = attr_id_hw_s_info; + + err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack); + if (err) + return err; + + have_data = true; + *prividx = 0; + } + + if (*prividx <= attr_id_l3_stats && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) { + unsigned int size_l3; + struct nlattr *attr; + + *prividx = attr_id_l3_stats; + + size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3); + attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) - goto nla_put_failure; + return -EMSGSIZE; - attr_data = nla_data(attr); - memset(attr_data, 0, size); - err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, - attr_data); + err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL, + 
nla_data(attr), extack); if (err) - goto get_offload_stats_failure; + return err; + + have_data = true; + *prividx = 0; } - if (!attr) + if (!have_data) return -ENODATA; *prividx = 0; return 0; +} -nla_put_failure: - err = -EMSGSIZE; -get_offload_stats_failure: - *prividx = attr_id; - return err; +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + bool enabled = netdev_offload_xstats_enabled(dev, type); + + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ + nla_total_size(sizeof(u8)) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ + (enabled ? nla_total_size(sizeof(u8)) : 0) + + 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_L3_STATS */ + rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) + + 0; } -static int rtnl_get_offload_stats_size(const struct net_device *dev) +static int rtnl_offload_xstats_get_size(const struct net_device *dev, + u32 off_filter_mask) { + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; int nla_size = 0; - int attr_id; int size; - if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && - dev->netdev_ops->ndo_get_offload_stats)) - return 0; + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) { + size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); + nla_size += nla_total_size_64bit(size); + } - for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; - attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { - if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) - continue; - size = rtnl_get_offload_stats_attr_size(attr_id); + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO)) + nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev); + + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) { + size = rtnl_offload_xstats_get_size_stats(dev, t_l3); nla_size += nla_total_size_64bit(size); } @@ -5131,11 +5318,21 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev) return nla_size; } +struct rtnl_stats_dump_filters { + /* mask[0] filters outer attributes. Then individual nests have their + * filtering mask at the index of the nested attribute. 
+ */ + u32 mask[IFLA_STATS_MAX + 1]; +}; + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, unsigned int filter_mask, - int *idxattr, int *prividx) + unsigned int flags, + const struct rtnl_stats_dump_filters *filters, + int *idxattr, int *prividx, + struct netlink_ext_ack *extack) { + unsigned int filter_mask = filters->mask[0]; struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; @@ -5161,8 +5358,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, sizeof(struct rtnl_link_stats64), IFLA_STATS_UNSPEC); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } sp = nla_data(attr); dev_get_stats(dev, sp); @@ -5175,8 +5374,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); @@ -5198,8 +5399,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS_SLAVE); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); @@ -5211,13 +5414,19 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } - err = rtnl_get_offload_stats(skb, dev, prividx); + err = rtnl_offload_xstats_fill(skb, dev, prividx, + off_filter_mask, extack); if (err == -ENODATA) nla_nest_cancel(skb, attr); else @@ -5233,19 +5442,21 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = IFLA_STATS_AF_SPEC; attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; - int err; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) { rcu_read_unlock(); + err = -EMSGSIZE; goto nla_put_failure; } err = af_ops->fill_stats_af(skb, dev); @@ -5278,13 +5489,14 @@ nla_put_failure: else nlmsg_end(skb, nlh); - return -EMSGSIZE; + return err; } static size_t if_nlmsg_stats_size(const struct net_device *dev, - u32 filter_mask) + const struct rtnl_stats_dump_filters *filters) { size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); + unsigned int filter_mask = filters->mask[0]; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); @@ -5320,8 +5532,12 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } - if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) - size += rtnl_get_offload_stats_size(dev); + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; + size += 
rtnl_offload_xstats_get_size(dev, off_filter_mask); + } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { struct rtnl_af_ops *af_ops; @@ -5345,6 +5561,79 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, return size; } +#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1) + +static const struct nla_policy +rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = { + [IFLA_STATS_LINK_OFFLOAD_XSTATS] = + NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID), +}; + +static const struct nla_policy +rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_GET_FILTERS] = + NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), +}; + +static const struct nla_policy +ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), +}; + +static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_MAX + 1]; + int err; + int at; + + err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters, + rtnl_stats_get_policy_filters, extack); + if (err < 0) + return err; + + for (at = 1; at <= IFLA_STATS_MAX; at++) { + if (tb[at]) { + if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) { + NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask"); + return -EINVAL; + } + filters->mask[at] = nla_get_u32(tb[at]); + } + } + + return 0; +} + +static int rtnl_stats_get_parse(const struct nlmsghdr *nlh, + u32 filter_mask, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + int err; + int i; + + filters->mask[0] = filter_mask; + for (i = 1; i < ARRAY_SIZE(filters->mask); i++) + filters->mask[i] = -1U; + + err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb, + IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack); + if (err < 0) + return err; + + if (tb[IFLA_STATS_GET_FILTERS]) { + err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS], + filters, extack); + if (err) + return err; + } + + return 0; +} + static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, bool is_dump, struct netlink_ext_ack *extack) { @@ -5367,10 +5656,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); return -EINVAL; } - if (nlmsg_attrlen(nlh, sizeof(*ifsm))) { - NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); - return -EINVAL; - } if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); return -EINVAL; @@ -5382,12 +5667,12 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; struct sk_buff *nskb; - u32 filter_mask; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), @@ -5404,17 +5689,22 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, if (!dev) return -ENODEV; - filter_mask = ifsm->filter_mask; - if (!filter_mask) + if (!ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); return -EINVAL; + } + + err = 
rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack); + if (err) + return err; - nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL); + nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL); if (!nskb) return -ENOBUFS; err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - 0, filter_mask, &idxattr, &prividx); + 0, &filters, &idxattr, &prividx, extack); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); @@ -5430,12 +5720,12 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; int h, s_h, err, s_idx, s_idxattr, s_prividx; + struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; struct hlist_head *head; struct net_device *dev; - u32 filter_mask = 0; int idx = 0; s_h = cb->args[0]; @@ -5450,12 +5740,16 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) return err; ifsm = nlmsg_data(cb->nlh); - filter_mask = ifsm->filter_mask; - if (!filter_mask) { + if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; } + err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters, + extack); + if (err) + return err; + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; @@ -5465,8 +5759,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, - flags, filter_mask, - &s_idxattr, &s_prividx); + flags, &filters, + &s_idxattr, &s_prividx, + extack); /* If we ran out of room on the first message, * we're in trouble */ @@ -5490,6 +5785,107 @@ out: return skb->len; } +void rtnl_offload_xstats_notify(struct net_device *dev) +{ + struct rtnl_stats_dump_filters response_filters = {}; + struct net *net = dev_net(dev); + int idxattr = 0, prividx = 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + ASSERT_RTNL(); + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + + skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), + GFP_KERNEL); + if (!skb) + goto errout; + + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, + &response_filters, &idxattr, &prividx, NULL); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_STATS, err); +} +EXPORT_SYMBOL(rtnl_offload_xstats_notify); + +static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + struct rtnl_stats_dump_filters response_filters = {}; + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct net_device *dev = NULL; + struct if_stats_msg *ifsm; + bool notify = false; + int err; + + err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), + false, extack); + if (err) + return err; + + ifsm = nlmsg_data(nlh); + if (ifsm->family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC"); + return -EINVAL; + } + + if (ifsm->ifindex > 0) + dev = __dev_get_by_index(net, ifsm->ifindex); + 
else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set"); + return -EINVAL; + } + + err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX, + ifla_stats_set_policy, extack); + if (err < 0) + return err; + + if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { + u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); + + if (req) + err = netdev_offload_xstats_enable(dev, t_l3, extack); + else + err = netdev_offload_xstats_disable(dev, t_l3); + + if (!err) + notify = true; + else if (err != -EALREADY) + return err; + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + } + + if (notify) + rtnl_offload_xstats_notify(dev); + + return 0; +} + /* Process one rtnetlink message. */ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -5715,4 +6111,5 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, 0); + rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9d0388bed0c1..10bde7c6db44 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -201,7 +201,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; + skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; @@ -777,16 +777,17 @@ void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) } EXPORT_SYMBOL(kfree_skb_reason); -void kfree_skb_list(struct sk_buff *segs) +void kfree_skb_list_reason(struct sk_buff *segs, + enum skb_drop_reason reason) { while (segs) { struct sk_buff *next = segs->next; - kfree_skb(segs); + kfree_skb_reason(segs, reason); segs = next; } } -EXPORT_SYMBOL(kfree_skb_list); +EXPORT_SYMBOL(kfree_skb_list_reason); /* Dump skb information and contents. * @@ -1736,11 +1737,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->head = data; skb->head_frag = 0; skb->data += off; + + skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; off = nhead; -#else - skb->end = skb->head + size; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); @@ -1788,6 +1788,38 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } EXPORT_SYMBOL(skb_realloc_headroom); +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + unsigned int saved_end_offset, saved_truesize; + struct skb_shared_info *shinfo; + int res; + + saved_end_offset = skb_end_offset(skb); + saved_truesize = skb->truesize; + + res = pskb_expand_head(skb, 0, 0, pri); + if (res) + return res; + + skb->truesize = saved_truesize; + + if (likely(skb_end_offset(skb) == saved_end_offset)) + return 0; + + shinfo = skb_shinfo(skb); + + /* We are about to change back skb->end, + * we need to move skb_shinfo() to its new location. 
+ */ + memmove(skb->head + saved_end_offset, + shinfo, + offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); + + skb_set_end_offset(skb, saved_end_offset); + + return 0; +} + /** * skb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate @@ -2276,7 +2308,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Free pulled out fragments. */ while ((list = skb_shinfo(skb)->frag_list) != insp) { skb_shinfo(skb)->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. */ if (clone) { @@ -3876,6 +3908,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, list_skb = list_skb->next; err = 0; + delta_truesize += nskb->truesize; if (skb_shared(nskb)) { tmp = skb_clone(nskb, GFP_ATOMIC); if (tmp) { @@ -3900,7 +3933,6 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, tail = nskb; delta_len += nskb->len; - delta_truesize += nskb->truesize; skb_push(nskb, -skb_network_offset(nskb) + offset); @@ -4730,7 +4762,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) - serr->ee.ee_data -= sk->sk_tskey; + serr->ee.ee_data -= atomic_read(&sk->sk_tskey); } err = sock_queue_err_skb(sk, skb); @@ -4820,7 +4852,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (hwtstamps) *skb_hwtstamps(skb) = *hwtstamps; else - skb->tstamp = ktime_get_real(); + __net_timestamp(skb); __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } @@ -5350,7 +5382,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) ipvs_reset(skb); skb->mark = 0; - skb->tstamp = 0; + skb_clear_tstamp(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); @@ -6044,11 +6076,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb->head = data; skb->data = data; skb->head_frag = 0; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_set_tail_pointer(skb, skb_headlen(skb)); skb_headers_offset_update(skb, 0); skb->cloned = 0; @@ -6105,7 +6133,7 @@ static int pskb_carve_frag_list(struct sk_buff *skb, /* Free pulled out fragments. */ while ((list = shinfo->frag_list) != insp) { shinfo->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. 
*/ if (clone) { @@ -6186,11 +6214,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb->head = data; skb->head_frag = 0; skb->data = data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_reset_tail_pointer(skb); skb_headers_offset_update(skb, 0); skb->cloned = 0; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8eb671c827f9..929a2b096b04 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1153,7 +1153,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; - int len = skb->len; + int len = orig_len; /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ skb = skb_clone(skb, GFP_ATOMIC); diff --git a/net/core/sock.c b/net/core/sock.c index 09d31a7dc68f..1180a0cb0110 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -879,9 +879,9 @@ int sock_set_timestamping(struct sock *sk, int optname, if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - sk->sk_tskey = tcp_sk(sk)->snd_una; + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { - sk->sk_tskey = 0; + atomic_set(&sk->sk_tskey, 0); } } @@ -1377,9 +1377,9 @@ set_sndbuf: if (!(sk_is_tcp(sk) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; } else if (sk->sk_family != PF_RDS) { - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; } if (!ret) { if (val < 0 || val > 1) @@ -3718,6 +3718,10 @@ int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; + if (prot->memory_allocated && !prot->sysctl_mem) { + pr_err("%s: missing sysctl_mem\n", prot->name); + return -EINVAL; + } if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index dbeb8ecbcd98..7123fe7feeac 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -103,8 +103,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - synchronize_rcu(); - vfree(orig_sock_table); + kvfree_rcu(orig_sock_table); } } } @@ -142,8 +141,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - synchronize_rcu(); - kfree(cur); + kfree_rcu(cur); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); diff --git a/net/core/utils.c b/net/core/utils.c index 1f31a39236d5..938495bc1d34 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -476,9 +476,9 @@ void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); + csum_replace_by_diff(sum, diff); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_add(diff, ~skb->csum); + skb->csum = ~csum_sub(diff, skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); } diff --git a/net/core/xdp.c b/net/core/xdp.c index 361df312ee7f..7577adf19ef4 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -359,7 +359,8 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); - 
trace_mem_connect(xdp_alloc, xdp_rxq); + if (trace_mem_connect_enabled() && xdp_alloc) + trace_mem_connect(xdp_alloc, xdp_rxq); return 0; } diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index b441ab330fd3..dc4fb699b56c 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -2073,8 +2073,52 @@ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); +static void dcbnl_flush_dev(struct net_device *dev) +{ + struct dcb_app_type *itr, *tmp; + + spin_lock_bh(&dcb_lock); + + list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { + if (itr->ifindex == dev->ifindex) { + list_del(&itr->list); + kfree(itr); + } + } + + spin_unlock_bh(&dcb_lock); +} + +static int dcbnl_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_UNREGISTER: + if (!dev->dcbnl_ops) + return NOTIFY_DONE; + + dcbnl_flush_dev(dev); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block dcbnl_nb __read_mostly = { + .notifier_call = dcbnl_netdevice_event, +}; + static int __init dcbnl_init(void) { + int err; + + err = register_netdevice_notifier(&dcbnl_nb); + if (err) + return err; + rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index eadc89583168..b05639bdfc8f 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -52,6 +52,7 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/if_packet.h> +#include <linux/jiffies.h> #include <net/neighbour.h> #include <net/dst.h> #include <net/flow.h> @@ -351,7 +352,7 @@ void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, * Slow start: If we have been idle for more than * one RTT, then reset window to min size. 
*/ - if ((jiffies - scp->stamp) > t) + if (time_is_before_jiffies(scp->stamp + t)) scp->snd_window = NSP_MIN_WINDOW; if (oth) diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index d9d0d227092c..89c6c86e746f 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -349,6 +349,7 @@ void dsa_flush_workqueue(void) { flush_workqueue(dsa_owq); } +EXPORT_SYMBOL_GPL(dsa_flush_workqueue); int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx) @@ -466,6 +467,106 @@ struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) } EXPORT_SYMBOL_GPL(dsa_port_from_netdev); +int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err = 0; + + mutex_lock(&dp->addr_lists_lock); + + list_for_each_entry(a, &dp->fdbs, list) { + err = cb(ds, port, a->addr, a->vid, a->db); + if (err) + break; + } + + mutex_unlock(&dp->addr_lists_lock); + + return err; +} +EXPORT_SYMBOL_GPL(dsa_port_walk_fdbs); + +int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err = 0; + + mutex_lock(&dp->addr_lists_lock); + + list_for_each_entry(a, &dp->mdbs, list) { + err = cb(ds, port, a->addr, a->vid, a->db); + if (err) + break; + } + + mutex_unlock(&dp->addr_lists_lock); + + return err; +} +EXPORT_SYMBOL_GPL(dsa_port_walk_mdbs); + +bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b) +{ + if (a->type != b->type) + return false; + + switch (a->type) { + case DSA_DB_PORT: + return a->dp == b->dp; + case DSA_DB_LAG: + return a->lag.dev == b->lag.dev; + case DSA_DB_BRIDGE: + return a->bridge.num == b->bridge.num; + default: + WARN_ON(1); + return false; + } +} + +bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + + lockdep_assert_held(&dp->addr_lists_lock); + + list_for_each_entry(a, &dp->fdbs, list) { + if (!ether_addr_equal(a->addr, addr) || a->vid != vid) + continue; + + if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(dsa_fdb_present_in_other_db); + +bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + + lockdep_assert_held(&dp->addr_lists_lock); + + list_for_each_entry(a, &dp->mdbs, list) { + if (!ether_addr_equal(a->addr, mdb->addr) || a->vid != mdb->vid) + continue; + + if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index e498c927c3d0..bef1aaa7ec1c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -72,27 +72,24 @@ int dsa_broadcast(unsigned long e, void *v) } /** - * dsa_lag_map() - Map LAG netdev to a linear LAG ID + * dsa_lag_map() - Map LAG structure to a linear LAG array * @dst: Tree in which to record the mapping. - * @lag: Netdev that is to be mapped to an ID. + * @lag: LAG structure that is to be mapped to the tree's array. * - * dsa_lag_id/dsa_lag_dev can then be used to translate between the + * dsa_lag_id/dsa_lag_by_id can then be used to translate between the * two spaces. 
The size of the mapping space is determined by the * driver by setting ds->num_lag_ids. It is perfectly legal to leave * it unset if it is not needed, in which case these functions become * no-ops. */ -void dsa_lag_map(struct dsa_switch_tree *dst, struct net_device *lag) +void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag) { unsigned int id; - if (dsa_lag_id(dst, lag) >= 0) - /* Already mapped */ - return; - - for (id = 0; id < dst->lags_len; id++) { - if (!dsa_lag_dev(dst, id)) { - dst->lags[id] = lag; + for (id = 1; id <= dst->lags_len; id++) { + if (!dsa_lag_by_id(dst, id)) { + dst->lags[id - 1] = lag; + lag->id = id; return; } } @@ -108,28 +105,36 @@ void dsa_lag_map(struct dsa_switch_tree *dst, struct net_device *lag) /** * dsa_lag_unmap() - Remove a LAG ID mapping * @dst: Tree in which the mapping is recorded. - * @lag: Netdev that was mapped. + * @lag: LAG structure that was mapped. * * As there may be multiple users of the mapping, it is only removed * if there are no other references to it. */ -void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag) +void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag) { - struct dsa_port *dp; unsigned int id; - dsa_lag_foreach_port(dp, dst, lag) - /* There are remaining users of this mapping */ - return; - dsa_lags_foreach_id(id, dst) { - if (dsa_lag_dev(dst, id) == lag) { - dst->lags[id] = NULL; + if (dsa_lag_by_id(dst, id) == lag) { + dst->lags[id - 1] = NULL; + lag->id = 0; break; } } } +struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, + const struct net_device *lag_dev) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_lag_dev_get(dp) == lag_dev) + return dp->lag; + + return NULL; +} + struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst, const struct net_device *br) { @@ -452,10 +457,6 @@ static int dsa_port_setup(struct dsa_port *dp) if (dp->setup) return 0; - mutex_init(&dp->addr_lists_lock); - INIT_LIST_HEAD(&dp->fdbs); - INIT_LIST_HEAD(&dp->mdbs); - if (ds->ops->port_setup) { err = ds->ops->port_setup(ds, dp->index); if (err) @@ -561,7 +562,6 @@ static void dsa_port_teardown(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; struct dsa_switch *ds = dp->ds; - struct dsa_mac_addr *a, *tmp; struct net_device *slave; if (!dp->setup) @@ -593,16 +593,6 @@ static void dsa_port_teardown(struct dsa_port *dp) break; } - list_for_each_entry_safe(a, tmp, &dp->fdbs, list) { - list_del(&a->list); - kfree(a); - } - - list_for_each_entry_safe(a, tmp, &dp->mdbs, list) { - list_del(&a->list); - kfree(a); - } - dp->setup = false; } @@ -1059,7 +1049,7 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { struct dsa_port *dp; - int err; + int err = 0; rtnl_lock(); @@ -1071,7 +1061,7 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst) err = dsa_master_setup(master, dp); if (err) - return err; + break; /* Replay master state event */ dsa_tree_master_admin_state_change(dst, master, admin_up); @@ -1082,7 +1072,7 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst) rtnl_unlock(); - return 0; + return err; } static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) @@ -1281,7 +1271,7 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, info.tag_ops = tag_ops; err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); if (err) - return err; + goto out_unwind_tagger; err = dsa_tree_bind_tag_proto(dst, tag_ops); if (err) 
@@ -1361,6 +1351,11 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) dp->ds = ds; dp->index = index; + mutex_init(&dp->addr_lists_lock); + mutex_init(&dp->vlans_lock); + INIT_LIST_HEAD(&dp->fdbs); + INIT_LIST_HEAD(&dp->mdbs); + INIT_LIST_HEAD(&dp->vlans); INIT_LIST_HEAD(&dp->list); list_add_tail(&dp->list, &dst->ports); @@ -1699,6 +1694,9 @@ static void dsa_switch_release_ports(struct dsa_switch *ds) struct dsa_port *dp, *next; dsa_switch_for_each_port_safe(dp, next, ds) { + WARN_ON(!list_empty(&dp->fdbs)); + WARN_ON(!list_empty(&dp->mdbs)); + WARN_ON(!list_empty(&dp->vlans)); list_del(&dp->list); kfree(dp); } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 2bbfa9efe9f8..f20bdd8ea0a8 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -25,6 +25,8 @@ enum { DSA_NOTIFIER_FDB_DEL, DSA_NOTIFIER_HOST_FDB_ADD, DSA_NOTIFIER_HOST_FDB_DEL, + DSA_NOTIFIER_LAG_FDB_ADD, + DSA_NOTIFIER_LAG_FDB_DEL, DSA_NOTIFIER_LAG_CHANGE, DSA_NOTIFIER_LAG_JOIN, DSA_NOTIFIER_LAG_LEAVE, @@ -34,6 +36,8 @@ enum { DSA_NOTIFIER_HOST_MDB_DEL, DSA_NOTIFIER_VLAN_ADD, DSA_NOTIFIER_VLAN_DEL, + DSA_NOTIFIER_HOST_VLAN_ADD, + DSA_NOTIFIER_HOST_VLAN_DEL, DSA_NOTIFIER_MTU, DSA_NOTIFIER_TAG_PROTO, DSA_NOTIFIER_TAG_PROTO_CONNECT, @@ -55,6 +59,7 @@ struct dsa_notifier_bridge_info { int sw_index; int port; bool tx_fwd_offload; + struct netlink_ext_ack *extack; }; /* DSA_NOTIFIER_FDB_* */ @@ -63,6 +68,15 @@ struct dsa_notifier_fdb_info { int port; const unsigned char *addr; u16 vid; + struct dsa_db db; +}; + +/* DSA_NOTIFIER_LAG_FDB_* */ +struct dsa_notifier_lag_fdb_info { + struct dsa_lag *lag; + const unsigned char *addr; + u16 vid; + struct dsa_db db; }; /* DSA_NOTIFIER_MDB_* */ @@ -70,11 +84,12 @@ struct dsa_notifier_mdb_info { const struct switchdev_obj_port_mdb *mdb; int sw_index; int port; + struct dsa_db db; }; /* DSA_NOTIFIER_LAG_* */ struct dsa_notifier_lag_info { - struct net_device *lag; + struct dsa_lag lag; int sw_index; int port; @@ -117,9 +132,8 @@ struct dsa_notifier_master_state_info { }; struct dsa_switchdev_event_work { - struct dsa_switch *ds; - int port; struct net_device *dev; + struct net_device *orig_dev; struct work_struct work; unsigned long event; /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and @@ -130,6 +144,21 @@ struct dsa_switchdev_event_work { bool host_addr; }; +enum dsa_standalone_event { + DSA_UC_ADD, + DSA_UC_DEL, + DSA_MC_ADD, + DSA_MC_DEL, +}; + +struct dsa_standalone_event_work { + struct work_struct work; + struct net_device *dev; + enum dsa_standalone_event event; + unsigned char addr[ETH_ALEN]; + u16 vid; +}; + struct dsa_slave_priv { /* Copy of CPU port xmit for faster access in slave transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, @@ -153,8 +182,9 @@ const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol); void dsa_tag_driver_put(const struct dsa_device_ops *ops); const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); +bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b); + bool dsa_schedule_work(struct work_struct *work); -void dsa_flush_workqueue(void); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) @@ -210,19 +240,31 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); -int dsa_port_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, - u16 vid); -int 
dsa_port_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, - u16 vid); +int dsa_port_standalone_host_fdb_add(struct dsa_port *dp, + const unsigned char *addr, u16 vid); +int dsa_port_standalone_host_fdb_del(struct dsa_port *dp, + const unsigned char *addr, u16 vid); +int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_bridge_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid); int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); -int dsa_port_host_mdb_add(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb); -int dsa_port_host_mdb_del(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb); +int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); int dsa_port_pre_bridge_flags(const struct dsa_port *dp, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); @@ -234,6 +276,11 @@ int dsa_port_vlan_add(struct dsa_port *dp, struct netlink_ext_ack *extack); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); +int dsa_port_host_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); +int dsa_port_host_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan); int dsa_port_mrp_add(const struct dsa_port *dp, const struct switchdev_obj_mrp *mrp); int dsa_port_mrp_del(const struct dsa_port *dp, @@ -480,9 +527,25 @@ static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb) int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); +static inline bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) +{ + return ds->ops->port_fdb_add && ds->ops->port_fdb_del && + ds->fdb_isolation && !ds->vlan_filtering_is_global && + !ds->needs_standalone_vlan_filtering; +} + +static inline bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) +{ + return ds->ops->port_mdb_add && ds->ops->port_mdb_del && + ds->fdb_isolation && !ds->vlan_filtering_is_global && + !ds->needs_standalone_vlan_filtering; +} + /* dsa2.c */ -void dsa_lag_map(struct dsa_switch_tree *dst, struct net_device *lag); -void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag); +void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag); +void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag); +struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, + const struct net_device *lag_dev); int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v); int dsa_broadcast(unsigned long e, void *v); int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, @@ -502,10 +565,6 @@ struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst, 
const struct net_device *br); /* tag_8021q.c */ -int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, - struct dsa_notifier_bridge_info *info); -int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, - struct dsa_notifier_bridge_info *info); int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, struct dsa_notifier_tag_8021q_vlan_info *info); int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, diff --git a/net/dsa/master.c b/net/dsa/master.c index 6ac393cc6ea7..991c2930d631 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -260,11 +260,16 @@ static void dsa_netdev_ops_set(struct net_device *dev, dev->dsa_ptr->netdev_ops = ops; } +/* Keep the master always promiscuous if the tagging protocol requires that + * (garbles MAC DA) or if it doesn't support unicast filtering, case in which + * it would revert to promiscuous mode as soon as we call dev_uc_add() on it + * anyway. + */ static void dsa_master_set_promiscuity(struct net_device *dev, int inc) { const struct dsa_device_ops *ops = dev->dsa_ptr->tag_ops; - if (!ops->promisc_on_master) + if ((dev->priv_flags & IFF_UNICAST_FLT) && !ops->promisc_on_master) return; ASSERT_RTNL(); diff --git a/net/dsa/port.c b/net/dsa/port.c index bd78192e0e47..58291df14cdb 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -176,7 +176,7 @@ static int dsa_port_inherit_brport_flags(struct dsa_port *dp, struct netlink_ext_ack *extack) { const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | - BR_BCAST_FLOOD; + BR_BCAST_FLOOD | BR_PORT_LOCKED; struct net_device *brport_dev = dsa_port_to_bridge_port(dp); int flag, err; @@ -200,7 +200,7 @@ static void dsa_port_clear_brport_flags(struct dsa_port *dp) { const unsigned long val = BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | - BR_BCAST_FLOOD; + BR_BCAST_FLOOD | BR_PORT_LOCKED; int flag, err; for_each_set_bit(flag, &mask, 32) { @@ -328,6 +328,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, .tree_index = dp->ds->dst->index, .sw_index = dp->ds->index, .port = dp->index, + .extack = extack, }; struct net_device *dev = dp->slave; struct net_device *brport_dev; @@ -395,10 +396,17 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) .tree_index = dp->ds->dst->index, .sw_index = dp->ds->index, .port = dp->index, - .bridge = *dp->bridge, }; int err; + /* If the port could not be offloaded to begin with, then + * there is nothing to do. + */ + if (!dp->bridge) + return; + + info.bridge = *dp->bridge; + /* Here the port is already unbridged. Reflect the current configuration * so that drivers can program their chips accordingly. */ @@ -422,7 +430,7 @@ int dsa_port_lag_change(struct dsa_port *dp, }; bool tx_enabled; - if (!dp->lag_dev) + if (!dp->lag) return 0; /* On statically configured aggregates (e.g. 
loadbalance @@ -440,27 +448,70 @@ int dsa_port_lag_change(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_LAG_CHANGE, &info); } -int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag, +static int dsa_port_lag_create(struct dsa_port *dp, + struct net_device *lag_dev) +{ + struct dsa_switch *ds = dp->ds; + struct dsa_lag *lag; + + lag = dsa_tree_lag_find(ds->dst, lag_dev); + if (lag) { + refcount_inc(&lag->refcount); + dp->lag = lag; + return 0; + } + + lag = kzalloc(sizeof(*lag), GFP_KERNEL); + if (!lag) + return -ENOMEM; + + refcount_set(&lag->refcount, 1); + mutex_init(&lag->fdb_lock); + INIT_LIST_HEAD(&lag->fdbs); + lag->dev = lag_dev; + dsa_lag_map(ds->dst, lag); + dp->lag = lag; + + return 0; +} + +static void dsa_port_lag_destroy(struct dsa_port *dp) +{ + struct dsa_lag *lag = dp->lag; + + dp->lag = NULL; + dp->lag_tx_enabled = false; + + if (!refcount_dec_and_test(&lag->refcount)) + return; + + WARN_ON(!list_empty(&lag->fdbs)); + dsa_lag_unmap(dp->ds->dst, lag); + kfree(lag); +} + +int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack) { struct dsa_notifier_lag_info info = { .sw_index = dp->ds->index, .port = dp->index, - .lag = lag, .info = uinfo, }; struct net_device *bridge_dev; int err; - dsa_lag_map(dp->ds->dst, lag); - dp->lag_dev = lag; + err = dsa_port_lag_create(dp, lag_dev); + if (err) + goto err_lag_create; + info.lag = *dp->lag; err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_JOIN, &info); if (err) goto err_lag_join; - bridge_dev = netdev_master_upper_dev_get(lag); + bridge_dev = netdev_master_upper_dev_get(lag_dev); if (!bridge_dev || !netif_is_bridge_master(bridge_dev)) return 0; @@ -473,12 +524,12 @@ int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag, err_bridge_join: dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info); err_lag_join: - dp->lag_dev = NULL; - dsa_lag_unmap(dp->ds->dst, lag); + dsa_port_lag_destroy(dp); +err_lag_create: return err; } -void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag) +void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev) { struct net_device *br = dsa_port_bridge_dev_get(dp); @@ -486,17 +537,16 @@ void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag) dsa_port_pre_bridge_leave(dp, br); } -void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) +void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev) { struct net_device *br = dsa_port_bridge_dev_get(dp); struct dsa_notifier_lag_info info = { .sw_index = dp->ds->index, .port = dp->index, - .lag = lag, }; int err; - if (!dp->lag_dev) + if (!dp->lag) return; /* Port might have been part of a LAG that in turn was @@ -505,16 +555,15 @@ void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) if (br) dsa_port_bridge_leave(dp, br); - dp->lag_tx_enabled = false; - dp->lag_dev = NULL; + info.lag = *dp->lag; + + dsa_port_lag_destroy(dp); err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info); if (err) dev_err(dp->ds->dev, "port %d failed to notify DSA_NOTIFIER_LAG_LEAVE: %pe\n", dp->index, ERR_PTR(err)); - - dsa_lag_unmap(dp->ds->dst, lag); } /* Must be called under rcu_read_lock() */ @@ -750,8 +799,19 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, .port = dp->index, .addr = addr, .vid = vid, + .db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }, }; + /* Refcounting takes bridge.num as a key, and should be global for all + * bridges in the 
absence of FDB isolation, and per bridge otherwise. + * Force the bridge.num to zero here in the absence of FDB isolation. + */ + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + return dsa_port_notify(dp, DSA_NOTIFIER_FDB_ADD, &info); } @@ -763,48 +823,154 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, .port = dp->index, .addr = addr, .vid = vid, - + .db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }, }; + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info); } -int dsa_port_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, - u16 vid) +static int dsa_port_host_fdb_add(struct dsa_port *dp, + const unsigned char *addr, u16 vid, + struct dsa_db db) { struct dsa_notifier_fdb_info info = { .sw_index = dp->ds->index, .port = dp->index, .addr = addr, .vid = vid, + .db = db, + }; + + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_ADD, &info); +} + +int dsa_port_standalone_host_fdb_add(struct dsa_port *dp, + const unsigned char *addr, u16 vid) +{ + struct dsa_db db = { + .type = DSA_DB_PORT, + .dp = dp, }; + + return dsa_port_host_fdb_add(dp, addr, vid, db); +} + +int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, + const unsigned char *addr, u16 vid) +{ struct dsa_port *cpu_dp = dp->cpu_dp; + struct dsa_db db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }; int err; - err = dev_uc_add(cpu_dp->master, addr); - if (err) - return err; + /* Avoid a call to __dev_set_promiscuity() on the master, which + * requires rtnl_lock(), since we can't guarantee that is held here, + * and we can't take it either. + */ + if (cpu_dp->master->priv_flags & IFF_UNICAST_FLT) { + err = dev_uc_add(cpu_dp->master, addr); + if (err) + return err; + } - return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_ADD, &info); + return dsa_port_host_fdb_add(dp, addr, vid, db); } -int dsa_port_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, - u16 vid) +static int dsa_port_host_fdb_del(struct dsa_port *dp, + const unsigned char *addr, u16 vid, + struct dsa_db db) { struct dsa_notifier_fdb_info info = { .sw_index = dp->ds->index, .port = dp->index, .addr = addr, .vid = vid, + .db = db, }; + + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_DEL, &info); +} + +int dsa_port_standalone_host_fdb_del(struct dsa_port *dp, + const unsigned char *addr, u16 vid) +{ + struct dsa_db db = { + .type = DSA_DB_PORT, + .dp = dp, + }; + + return dsa_port_host_fdb_del(dp, addr, vid, db); +} + +int dsa_port_bridge_host_fdb_del(struct dsa_port *dp, + const unsigned char *addr, u16 vid) +{ struct dsa_port *cpu_dp = dp->cpu_dp; + struct dsa_db db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }; int err; - err = dev_uc_del(cpu_dp->master, addr); - if (err) - return err; + if (cpu_dp->master->priv_flags & IFF_UNICAST_FLT) { + err = dev_uc_del(cpu_dp->master, addr); + if (err) + return err; + } - return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_DEL, &info); + return dsa_port_host_fdb_del(dp, addr, vid, db); +} + +int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid) +{ + struct dsa_notifier_lag_fdb_info info = { + .lag = dp->lag, + .addr = addr, + .vid = vid, + .db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }, + }; + + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + + return dsa_port_notify(dp, DSA_NOTIFIER_LAG_FDB_ADD, &info); +} + +int 
dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid) +{ + struct dsa_notifier_lag_fdb_info info = { + .lag = dp->lag, + .addr = addr, + .vid = vid, + .db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }, + }; + + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + + return dsa_port_notify(dp, DSA_NOTIFIER_LAG_FDB_DEL, &info); } int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) @@ -825,8 +991,15 @@ int dsa_port_mdb_add(const struct dsa_port *dp, .sw_index = dp->ds->index, .port = dp->index, .mdb = mdb, + .db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }, }; + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, &info); } @@ -837,45 +1010,106 @@ int dsa_port_mdb_del(const struct dsa_port *dp, .sw_index = dp->ds->index, .port = dp->index, .mdb = mdb, + .db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }, }; + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info); } -int dsa_port_host_mdb_add(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb) +static int dsa_port_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db) { struct dsa_notifier_mdb_info info = { .sw_index = dp->ds->index, .port = dp->index, .mdb = mdb, + .db = db, }; + + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_ADD, &info); +} + +int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_db db = { + .type = DSA_DB_PORT, + .dp = dp, + }; + + return dsa_port_host_mdb_add(dp, mdb, db); +} + +int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ struct dsa_port *cpu_dp = dp->cpu_dp; + struct dsa_db db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }; int err; err = dev_mc_add(cpu_dp->master, mdb->addr); if (err) return err; - return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_ADD, &info); + return dsa_port_host_mdb_add(dp, mdb, db); } -int dsa_port_host_mdb_del(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb) +static int dsa_port_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db) { struct dsa_notifier_mdb_info info = { .sw_index = dp->ds->index, .port = dp->index, .mdb = mdb, + .db = db, }; + + if (!dp->ds->fdb_isolation) + info.db.bridge.num = 0; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_DEL, &info); +} + +int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_db db = { + .type = DSA_DB_PORT, + .dp = dp, + }; + + return dsa_port_host_mdb_del(dp, mdb, db); +} + +int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ struct dsa_port *cpu_dp = dp->cpu_dp; + struct dsa_db db = { + .type = DSA_DB_BRIDGE, + .bridge = *dp->bridge, + }; int err; err = dev_mc_del(cpu_dp->master, mdb->addr); if (err) return err; - return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_DEL, &info); + return dsa_port_host_mdb_del(dp, mdb, db); } int dsa_port_vlan_add(struct dsa_port *dp, @@ -904,6 +1138,48 @@ int dsa_port_vlan_del(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } +int dsa_port_host_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct 
netlink_ext_ack *extack) +{ + struct dsa_notifier_vlan_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .vlan = vlan, + .extack = extack, + }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_ADD, &info); + if (err && err != -EOPNOTSUPP) + return err; + + vlan_vid_add(cpu_dp->master, htons(ETH_P_8021Q), vlan->vid); + + return err; +} + +int dsa_port_host_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan) +{ + struct dsa_notifier_vlan_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .vlan = vlan, + }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_DEL, &info); + if (err && err != -EOPNOTSUPP) + return err; + + vlan_vid_del(cpu_dp->master, htons(ETH_P_8021Q), vlan->vid); + + return err; +} + int dsa_port_mrp_add(const struct dsa_port *dp, const struct switchdev_obj_mrp *mrp) { @@ -1011,6 +1287,20 @@ static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, } } +static struct phylink_pcs * +dsa_port_phylink_mac_select_pcs(struct phylink_config *config, + phy_interface_t interface) +{ + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct phylink_pcs *pcs = ERR_PTR(-EOPNOTSUPP); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->phylink_mac_select_pcs) + pcs = ds->ops->phylink_mac_select_pcs(ds, dp->index, interface); + + return pcs; +} + static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) @@ -1077,6 +1367,7 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, + .mac_select_pcs = dsa_port_phylink_mac_select_pcs, .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state, .mac_config = dsa_port_phylink_mac_config, .mac_an_restart = dsa_port_phylink_mac_an_restart, @@ -1194,7 +1485,6 @@ static int dsa_port_phylink_register(struct dsa_port *dp) dp->pl_config.dev = ds->dev; dp->pl_config.type = PHYLINK_DEV; - dp->pl_config.pcs_poll = ds->pcs_poll; err = dsa_port_phylink_create(dp); if (err) @@ -1258,63 +1548,6 @@ void dsa_port_link_unregister_of(struct dsa_port *dp) dsa_port_setup_phy_of(dp, false); } -int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data) -{ - struct phy_device *phydev; - int ret = -EOPNOTSUPP; - - if (of_phy_is_fixed_link(dp->dn)) - return ret; - - phydev = dsa_port_get_phy_device(dp); - if (IS_ERR_OR_NULL(phydev)) - return ret; - - ret = phy_ethtool_get_strings(phydev, data); - put_device(&phydev->mdio.dev); - - return ret; -} -EXPORT_SYMBOL_GPL(dsa_port_get_phy_strings); - -int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data) -{ - struct phy_device *phydev; - int ret = -EOPNOTSUPP; - - if (of_phy_is_fixed_link(dp->dn)) - return ret; - - phydev = dsa_port_get_phy_device(dp); - if (IS_ERR_OR_NULL(phydev)) - return ret; - - ret = phy_ethtool_get_stats(phydev, NULL, data); - put_device(&phydev->mdio.dev); - - return ret; -} -EXPORT_SYMBOL_GPL(dsa_port_get_ethtool_phy_stats); - -int dsa_port_get_phy_sset_count(struct dsa_port *dp) -{ - struct phy_device *phydev; - int ret = -EOPNOTSUPP; - - if (of_phy_is_fixed_link(dp->dn)) - return ret; - - phydev = dsa_port_get_phy_device(dp); - if (IS_ERR_OR_NULL(phydev)) - return ret; - - ret = phy_ethtool_get_sset_count(phydev); - put_device(&phydev->mdio.dev); - - return ret; -} 
-EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count); - int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr) { struct dsa_switch *ds = dp->ds; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2f6caf5d037e..a61a7c54af20 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -23,6 +23,114 @@ #include "dsa_priv.h" +static void dsa_slave_standalone_event_work(struct work_struct *work) +{ + struct dsa_standalone_event_work *standalone_work = + container_of(work, struct dsa_standalone_event_work, work); + const unsigned char *addr = standalone_work->addr; + struct net_device *dev = standalone_work->dev; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct switchdev_obj_port_mdb mdb; + struct dsa_switch *ds = dp->ds; + u16 vid = standalone_work->vid; + int err; + + switch (standalone_work->event) { + case DSA_UC_ADD: + err = dsa_port_standalone_host_fdb_add(dp, addr, vid); + if (err) { + dev_err(ds->dev, + "port %d failed to add %pM vid %d to fdb: %d\n", + dp->index, addr, vid, err); + break; + } + break; + + case DSA_UC_DEL: + err = dsa_port_standalone_host_fdb_del(dp, addr, vid); + if (err) { + dev_err(ds->dev, + "port %d failed to delete %pM vid %d from fdb: %d\n", + dp->index, addr, vid, err); + } + + break; + case DSA_MC_ADD: + ether_addr_copy(mdb.addr, addr); + mdb.vid = vid; + + err = dsa_port_standalone_host_mdb_add(dp, &mdb); + if (err) { + dev_err(ds->dev, + "port %d failed to add %pM vid %d to mdb: %d\n", + dp->index, addr, vid, err); + break; + } + break; + case DSA_MC_DEL: + ether_addr_copy(mdb.addr, addr); + mdb.vid = vid; + + err = dsa_port_standalone_host_mdb_del(dp, &mdb); + if (err) { + dev_err(ds->dev, + "port %d failed to delete %pM vid %d from mdb: %d\n", + dp->index, addr, vid, err); + } + + break; + } + + kfree(standalone_work); +} + +static int dsa_slave_schedule_standalone_work(struct net_device *dev, + enum dsa_standalone_event event, + const unsigned char *addr, + u16 vid) +{ + struct dsa_standalone_event_work *standalone_work; + + standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC); + if (!standalone_work) + return -ENOMEM; + + INIT_WORK(&standalone_work->work, dsa_slave_standalone_event_work); + standalone_work->event = event; + standalone_work->dev = dev; + + ether_addr_copy(standalone_work->addr, addr); + standalone_work->vid = vid; + + dsa_schedule_work(&standalone_work->work); + + return 0; +} + +static int dsa_slave_sync_uc(struct net_device *dev, + const unsigned char *addr) +{ + return dsa_slave_schedule_standalone_work(dev, DSA_UC_ADD, addr, 0); +} + +static int dsa_slave_unsync_uc(struct net_device *dev, + const unsigned char *addr) +{ + return dsa_slave_schedule_standalone_work(dev, DSA_UC_DEL, addr, 0); +} + +static int dsa_slave_sync_mc(struct net_device *dev, + const unsigned char *addr) +{ + return dsa_slave_schedule_standalone_work(dev, DSA_MC_ADD, addr, 0); +} + +static int dsa_slave_unsync_mc(struct net_device *dev, + const unsigned char *addr) +{ + return dsa_slave_schedule_standalone_work(dev, DSA_MC_DEL, addr, 0); +} + /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -67,6 +175,7 @@ static int dsa_slave_open(struct net_device *dev) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; int err; err = dev_open(master, NULL); @@ -75,38 +184,30 @@ static int dsa_slave_open(struct net_device *dev) goto out; } - if 
(!ether_addr_equal(dev->dev_addr, master->dev_addr)) { - err = dev_uc_add(master, dev->dev_addr); - if (err < 0) + if (dsa_switch_supports_uc_filtering(ds)) { + err = dsa_port_standalone_host_fdb_add(dp, dev->dev_addr, 0); + if (err) goto out; } - if (dev->flags & IFF_ALLMULTI) { - err = dev_set_allmulti(master, 1); - if (err < 0) - goto del_unicast; - } - if (dev->flags & IFF_PROMISC) { - err = dev_set_promiscuity(master, 1); + if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) { + err = dev_uc_add(master, dev->dev_addr); if (err < 0) - goto clear_allmulti; + goto del_host_addr; } err = dsa_port_enable_rt(dp, dev->phydev); if (err) - goto clear_promisc; + goto del_unicast; return 0; -clear_promisc: - if (dev->flags & IFF_PROMISC) - dev_set_promiscuity(master, -1); -clear_allmulti: - if (dev->flags & IFF_ALLMULTI) - dev_set_allmulti(master, -1); del_unicast: if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); +del_host_addr: + if (dsa_switch_supports_uc_filtering(ds)) + dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); out: return err; } @@ -115,68 +216,129 @@ static int dsa_slave_close(struct net_device *dev) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; dsa_port_disable_rt(dp); - dev_mc_unsync(master, dev); - dev_uc_unsync(master, dev); - if (dev->flags & IFF_ALLMULTI) - dev_set_allmulti(master, -1); - if (dev->flags & IFF_PROMISC) - dev_set_promiscuity(master, -1); - if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); + if (dsa_switch_supports_uc_filtering(ds)) + dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); + return 0; } +/* Keep flooding enabled towards this port's CPU port as long as it serves at + * least one port in the tree that requires it. + */ +static void dsa_port_manage_cpu_flood(struct dsa_port *dp) +{ + struct switchdev_brport_flags flags = { + .mask = BR_FLOOD | BR_MCAST_FLOOD, + }; + struct dsa_switch_tree *dst = dp->ds->dst; + struct dsa_port *cpu_dp = dp->cpu_dp; + struct dsa_port *other_dp; + int err; + + list_for_each_entry(other_dp, &dst->ports, list) { + if (!dsa_port_is_user(other_dp)) + continue; + + if (other_dp->cpu_dp != cpu_dp) + continue; + + if (other_dp->slave->flags & IFF_ALLMULTI) + flags.val |= BR_MCAST_FLOOD; + if (other_dp->slave->flags & IFF_PROMISC) + flags.val |= BR_FLOOD; + } + + err = dsa_port_pre_bridge_flags(dp, flags, NULL); + if (err) + return; + + dsa_port_bridge_flags(cpu_dp, flags, NULL); +} + static void dsa_slave_change_rx_flags(struct net_device *dev, int change) { struct net_device *master = dsa_slave_to_master(dev); - if (dev->flags & IFF_UP) { - if (change & IFF_ALLMULTI) - dev_set_allmulti(master, - dev->flags & IFF_ALLMULTI ? 1 : -1); - if (change & IFF_PROMISC) - dev_set_promiscuity(master, - dev->flags & IFF_PROMISC ? 1 : -1); - } + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (change & IFF_ALLMULTI) + dev_set_allmulti(master, + dev->flags & IFF_ALLMULTI ? 1 : -1); + if (change & IFF_PROMISC) + dev_set_promiscuity(master, + dev->flags & IFF_PROMISC ? 
1 : -1); + + if (dsa_switch_supports_uc_filtering(ds) && + dsa_switch_supports_mc_filtering(ds)) + dsa_port_manage_cpu_flood(dp); } static void dsa_slave_set_rx_mode(struct net_device *dev) { struct net_device *master = dsa_slave_to_master(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; dev_mc_sync(master, dev); dev_uc_sync(master, dev); + if (dsa_switch_supports_mc_filtering(ds)) + __dev_mc_sync(dev, dsa_slave_sync_mc, dsa_slave_unsync_mc); + if (dsa_switch_supports_uc_filtering(ds)) + __dev_uc_sync(dev, dsa_slave_sync_uc, dsa_slave_unsync_uc); } static int dsa_slave_set_mac_address(struct net_device *dev, void *a) { struct net_device *master = dsa_slave_to_master(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; struct sockaddr *addr = a; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; + /* If the port is down, the address isn't synced yet to hardware or + * to the DSA master, so there is nothing to change. + */ if (!(dev->flags & IFF_UP)) - goto out; + goto out_change_dev_addr; + + if (dsa_switch_supports_uc_filtering(ds)) { + err = dsa_port_standalone_host_fdb_add(dp, addr->sa_data, 0); + if (err) + return err; + } if (!ether_addr_equal(addr->sa_data, master->dev_addr)) { err = dev_uc_add(master, addr->sa_data); if (err < 0) - return err; + goto del_unicast; } if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); -out: + if (dsa_switch_supports_uc_filtering(ds)) + dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); + +out_change_dev_addr: eth_hw_addr_set(dev, addr->sa_data); return 0; + +del_unicast: + if (dsa_switch_supports_uc_filtering(ds)) + dsa_port_standalone_host_fdb_del(dp, addr->sa_data, 0); + + return err; } struct dsa_slave_dump_ctx { @@ -348,9 +510,8 @@ static int dsa_slave_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { - struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - struct switchdev_obj_port_vlan vlan; + struct switchdev_obj_port_vlan *vlan; int err; if (dsa_port_skip_vlan_configuration(dp)) { @@ -358,14 +519,14 @@ static int dsa_slave_vlan_add(struct net_device *dev, return 0; } - vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); + vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); /* Deny adding a bridge VLAN when there is already an 802.1Q upper with * the same VID. */ if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) { rcu_read_lock(); - err = dsa_slave_vlan_check_for_8021q_uppers(dev, &vlan); + err = dsa_slave_vlan_check_for_8021q_uppers(dev, vlan); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG_MOD(extack, @@ -374,21 +535,36 @@ static int dsa_slave_vlan_add(struct net_device *dev, } } - err = dsa_port_vlan_add(dp, &vlan, extack); - if (err) - return err; + return dsa_port_vlan_add(dp, vlan, extack); +} - /* We need the dedicated CPU port to be a member of the VLAN as well. - * Even though drivers often handle CPU membership in special ways, +/* Offload a VLAN installed on the bridge or on a foreign interface by + * installing it as a VLAN towards the CPU port. 
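+ *
+ * For example, a VLAN configured on the bridge device itself, or on a
+ * non-DSA (foreign) port of the same bridge, lands here rather than in
+ * dsa_slave_vlan_add(): the user port is not made a member, only its CPU
+ * port (and the DSA links towards it), via DSA_NOTIFIER_HOST_VLAN_ADD.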
+ */ +static int dsa_slave_host_vlan_add(struct net_device *dev, + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct switchdev_obj_port_vlan vlan; + + /* Do nothing if this is a software bridge */ + if (!dp->bridge) + return -EOPNOTSUPP; + + if (dsa_port_skip_vlan_configuration(dp)) { + NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); + return 0; + } + + vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); + + /* Even though drivers often handle CPU membership in special ways, * it doesn't make sense to program a PVID, so clear this flag. */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; - err = dsa_port_vlan_add(dp->cpu_dp, &vlan, extack); - if (err) - return err; - - return vlan_vid_add(master, htons(ETH_P_8021Q), vlan.vid); + return dsa_port_host_vlan_add(dp, &vlan, extack); } static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx, @@ -412,13 +588,13 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx, if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; - err = dsa_port_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); + err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) - return -EOPNOTSUPP; - - err = dsa_slave_vlan_add(dev, obj, extack); + if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) + err = dsa_slave_vlan_add(dev, obj, extack); + else + err = dsa_slave_host_vlan_add(dev, obj, extack); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) @@ -444,26 +620,33 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx, static int dsa_slave_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { - struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan *vlan; - int err; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); - /* Do not deprogram the CPU port as it may be shared with other user - * ports which can be members of this VLAN as well. 
- */ - err = dsa_port_vlan_del(dp, vlan); - if (err) - return err; + return dsa_port_vlan_del(dp, vlan); +} - vlan_vid_del(master, htons(ETH_P_8021Q), vlan->vid); +static int dsa_slave_host_vlan_del(struct net_device *dev, + const struct switchdev_obj *obj) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct switchdev_obj_port_vlan *vlan; - return 0; + /* Do nothing if this is a software bridge */ + if (!dp->bridge) + return -EOPNOTSUPP; + + if (dsa_port_skip_vlan_configuration(dp)) + return 0; + + vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); + + return dsa_port_host_vlan_del(dp, vlan); } static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx, @@ -486,13 +669,13 @@ static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx, if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; - err = dsa_port_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); + err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) - return -EOPNOTSUPP; - - err = dsa_slave_vlan_del(dev, obj); + if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) + err = dsa_slave_vlan_del(dev, obj); + else + err = dsa_slave_host_vlan_del(dev, obj); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) @@ -1347,7 +1530,6 @@ static int dsa_slave_get_ts_info(struct net_device *dev, static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { - struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan = { .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, @@ -1367,7 +1549,7 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, } /* And CPU port... */ - ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &extack); + ret = dsa_port_host_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, @@ -1375,13 +1557,12 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, return ret; } - return vlan_vid_add(master, proto, vid); + return 0; } static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { - struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan = { .vid = vid, @@ -1390,16 +1571,11 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, }; int err; - /* Do not deprogram the CPU port as it may be shared with other user - * ports which can be members of this VLAN as well. 
- */ err = dsa_port_vlan_del(dp, &vlan); if (err) return err; - vlan_vid_del(master, proto, vid); - - return 0; + return dsa_port_host_vlan_del(dp, &vlan); } static int dsa_slave_restore_vlan(struct net_device *vdev, int vid, void *arg) @@ -1934,6 +2110,8 @@ int dsa_slave_create(struct dsa_port *port) else eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; + if (dsa_switch_supports_uc_filtering(ds)) + slave_dev->priv_flags |= IFF_UNICAST_FLT; slave_dev->netdev_ops = &dsa_slave_netdev_ops; if (ds->ops->port_max_mtu) slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); @@ -2120,7 +2298,7 @@ dsa_slave_lag_changeupper(struct net_device *dev, continue; dp = dsa_slave_to_port(lower); - if (!dp->lag_dev) + if (!dp->lag) /* Software LAG */ continue; @@ -2149,7 +2327,7 @@ dsa_slave_lag_prechangeupper(struct net_device *dev, continue; dp = dsa_slave_to_port(lower); - if (!dp->lag_dev) + if (!dp->lag) /* Software LAG */ continue; @@ -2359,43 +2537,40 @@ static void dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) { struct switchdev_notifier_fdb_info info = {}; - struct dsa_switch *ds = switchdev_work->ds; - struct dsa_port *dp; - - if (!dsa_is_user_port(ds, switchdev_work->port)) - return; info.addr = switchdev_work->addr; info.vid = switchdev_work->vid; info.offloaded = true; - dp = dsa_to_port(ds, switchdev_work->port); call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, - dp->slave, &info.info, NULL); + switchdev_work->orig_dev, &info.info, NULL); } static void dsa_slave_switchdev_event_work(struct work_struct *work) { struct dsa_switchdev_event_work *switchdev_work = container_of(work, struct dsa_switchdev_event_work, work); - struct dsa_switch *ds = switchdev_work->ds; + const unsigned char *addr = switchdev_work->addr; + struct net_device *dev = switchdev_work->dev; + u16 vid = switchdev_work->vid; + struct dsa_switch *ds; struct dsa_port *dp; int err; - dp = dsa_to_port(ds, switchdev_work->port); + dp = dsa_slave_to_port(dev); + ds = dp->ds; switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: if (switchdev_work->host_addr) - err = dsa_port_host_fdb_add(dp, switchdev_work->addr, - switchdev_work->vid); + err = dsa_port_bridge_host_fdb_add(dp, addr, vid); + else if (dp->lag) + err = dsa_port_lag_fdb_add(dp, addr, vid); else - err = dsa_port_fdb_add(dp, switchdev_work->addr, - switchdev_work->vid); + err = dsa_port_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", - dp->index, switchdev_work->addr, - switchdev_work->vid, err); + dp->index, addr, vid, err); break; } dsa_fdb_offload_notify(switchdev_work); @@ -2403,16 +2578,15 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) case SWITCHDEV_FDB_DEL_TO_DEVICE: if (switchdev_work->host_addr) - err = dsa_port_host_fdb_del(dp, switchdev_work->addr, - switchdev_work->vid); + err = dsa_port_bridge_host_fdb_del(dp, addr, vid); + else if (dp->lag) + err = dsa_port_lag_fdb_del(dp, addr, vid); else - err = dsa_port_fdb_del(dp, switchdev_work->addr, - switchdev_work->vid); + err = dsa_port_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", - dp->index, switchdev_work->addr, - switchdev_work->vid, err); + dp->index, addr, vid, err); } break; @@ -2450,19 +2624,17 @@ static int dsa_slave_fdb_event(struct net_device *dev, if (ctx && ctx != dp) return 0; - if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) - return -EOPNOTSUPP; + if 
(switchdev_fdb_is_dynamically_learned(fdb_info)) { + if (dsa_port_offloads_bridge_port(dp, orig_dev)) + return 0; - if (dsa_slave_dev_check(orig_dev) && - switchdev_fdb_is_dynamically_learned(fdb_info)) - return 0; - - /* FDB entries learned by the software bridge should be installed as - * host addresses only if the driver requests assisted learning. - */ - if (switchdev_fdb_is_dynamically_learned(fdb_info) && - !ds->assisted_learning_on_cpu_port) - return 0; + /* FDB entries learned by the software bridge or by foreign + * bridge ports should be installed as host addresses only if + * the driver requests assisted learning. + */ + if (!ds->assisted_learning_on_cpu_port) + return 0; + } /* Also treat FDB entries on foreign interfaces bridged with us as host * addresses. @@ -2470,6 +2642,18 @@ static int dsa_slave_fdb_event(struct net_device *dev, if (dsa_foreign_dev_check(dev, orig_dev)) host_addr = true; + /* Check early that we're not doing work in vain. + * Host addresses on LAG ports still require regular FDB ops, + * since the CPU port isn't in a LAG. + */ + if (dp->lag && !host_addr) { + if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del) + return -EOPNOTSUPP; + } else { + if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) + return -EOPNOTSUPP; + } + switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return -ENOMEM; @@ -2480,10 +2664,9 @@ static int dsa_slave_fdb_event(struct net_device *dev, host_addr ? " as host address" : ""); INIT_WORK(&switchdev_work->work, dsa_slave_switchdev_event_work); - switchdev_work->ds = ds; - switchdev_work->port = dp->index; switchdev_work->event = event; switchdev_work->dev = dev; + switchdev_work->orig_dev = orig_dev; ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; @@ -2512,8 +2695,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, err = switchdev_handle_fdb_event_to_device(dev, event, ptr, dsa_slave_dev_check, dsa_foreign_dev_check, - dsa_slave_fdb_event, - NULL); + dsa_slave_fdb_event); return notifier_from_errno(err); default: return NOTIFY_DONE; @@ -2530,14 +2712,16 @@ static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused, switch (event) { case SWITCHDEV_PORT_OBJ_ADD: - err = switchdev_handle_port_obj_add(dev, ptr, - dsa_slave_dev_check, - dsa_slave_port_obj_add); + err = switchdev_handle_port_obj_add_foreign(dev, ptr, + dsa_slave_dev_check, + dsa_foreign_dev_check, + dsa_slave_port_obj_add); return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: - err = switchdev_handle_port_obj_del(dev, ptr, - dsa_slave_dev_check, - dsa_slave_port_obj_del); + err = switchdev_handle_port_obj_del_foreign(dev, ptr, + dsa_slave_dev_check, + dsa_foreign_dev_check, + dsa_slave_port_obj_del); return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 4866b58649e4..d25cd1da3eb3 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -96,7 +96,8 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds, return -EOPNOTSUPP; err = ds->ops->port_bridge_join(ds, info->port, info->bridge, - &info->tx_fwd_offload); + &info->tx_fwd_offload, + info->extack); if (err) return err; } @@ -105,12 +106,13 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds, ds->ops->crosschip_bridge_join) { err = ds->ops->crosschip_bridge_join(ds, info->tree_index, info->sw_index, - info->port, info->bridge); + info->port, info->bridge, + info->extack); if 
(err) return err; } - return dsa_tag_8021q_bridge_join(ds, info); + return 0; } static int dsa_switch_sync_vlan_filtering(struct dsa_switch *ds, @@ -186,7 +188,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, return err; } - return dsa_tag_8021q_bridge_leave(ds, info); + return 0; } /* Matches for all upstream-facing ports (the CPU port and all upstream-facing @@ -211,20 +213,22 @@ static bool dsa_port_host_address_match(struct dsa_port *dp, } static struct dsa_mac_addr *dsa_mac_addr_find(struct list_head *addr_list, - const unsigned char *addr, - u16 vid) + const unsigned char *addr, u16 vid, + struct dsa_db db) { struct dsa_mac_addr *a; list_for_each_entry(a, addr_list, list) - if (ether_addr_equal(a->addr, addr) && a->vid == vid) + if (ether_addr_equal(a->addr, addr) && a->vid == vid && + dsa_db_equal(&a->db, &db)) return a; return NULL; } static int dsa_port_do_mdb_add(struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb) + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db) { struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; @@ -233,11 +237,11 @@ static int dsa_port_do_mdb_add(struct dsa_port *dp, /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) - return ds->ops->port_mdb_add(ds, port, mdb); + return ds->ops->port_mdb_add(ds, port, mdb, db); mutex_lock(&dp->addr_lists_lock); - a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid, db); if (a) { refcount_inc(&a->refcount); goto out; @@ -249,7 +253,7 @@ static int dsa_port_do_mdb_add(struct dsa_port *dp, goto out; } - err = ds->ops->port_mdb_add(ds, port, mdb); + err = ds->ops->port_mdb_add(ds, port, mdb, db); if (err) { kfree(a); goto out; @@ -257,6 +261,7 @@ static int dsa_port_do_mdb_add(struct dsa_port *dp, ether_addr_copy(a->addr, mdb->addr); a->vid = mdb->vid; + a->db = db; refcount_set(&a->refcount, 1); list_add_tail(&a->list, &dp->mdbs); @@ -267,7 +272,8 @@ out: } static int dsa_port_do_mdb_del(struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb) + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db) { struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; @@ -276,11 +282,11 @@ static int dsa_port_do_mdb_del(struct dsa_port *dp, /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) - return ds->ops->port_mdb_del(ds, port, mdb); + return ds->ops->port_mdb_del(ds, port, mdb, db); mutex_lock(&dp->addr_lists_lock); - a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid, db); if (!a) { err = -ENOENT; goto out; @@ -289,7 +295,7 @@ static int dsa_port_do_mdb_del(struct dsa_port *dp, if (!refcount_dec_and_test(&a->refcount)) goto out; - err = ds->ops->port_mdb_del(ds, port, mdb); + err = ds->ops->port_mdb_del(ds, port, mdb, db); if (err) { refcount_set(&a->refcount, 1); goto out; @@ -305,7 +311,7 @@ out: } static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, - u16 vid) + u16 vid, struct dsa_db db) { struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; @@ -314,11 +320,11 @@ static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) - return ds->ops->port_fdb_add(ds, port, addr, vid); + return ds->ops->port_fdb_add(ds, port, addr, vid, db); mutex_lock(&dp->addr_lists_lock); - a = 
dsa_mac_addr_find(&dp->fdbs, addr, vid); + a = dsa_mac_addr_find(&dp->fdbs, addr, vid, db); if (a) { refcount_inc(&a->refcount); goto out; @@ -330,7 +336,7 @@ static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, goto out; } - err = ds->ops->port_fdb_add(ds, port, addr, vid); + err = ds->ops->port_fdb_add(ds, port, addr, vid, db); if (err) { kfree(a); goto out; @@ -338,6 +344,7 @@ static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, ether_addr_copy(a->addr, addr); a->vid = vid; + a->db = db; refcount_set(&a->refcount, 1); list_add_tail(&a->list, &dp->fdbs); @@ -348,7 +355,7 @@ out: } static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr, - u16 vid) + u16 vid, struct dsa_db db) { struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; @@ -357,11 +364,11 @@ static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr, /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) - return ds->ops->port_fdb_del(ds, port, addr, vid); + return ds->ops->port_fdb_del(ds, port, addr, vid, db); mutex_lock(&dp->addr_lists_lock); - a = dsa_mac_addr_find(&dp->fdbs, addr, vid); + a = dsa_mac_addr_find(&dp->fdbs, addr, vid, db); if (!a) { err = -ENOENT; goto out; @@ -370,7 +377,7 @@ static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr, if (!refcount_dec_and_test(&a->refcount)) goto out; - err = ds->ops->port_fdb_del(ds, port, addr, vid); + err = ds->ops->port_fdb_del(ds, port, addr, vid, db); if (err) { refcount_set(&a->refcount, 1); goto out; @@ -385,6 +392,77 @@ out: return err; } +static int dsa_switch_do_lag_fdb_add(struct dsa_switch *ds, struct dsa_lag *lag, + const unsigned char *addr, u16 vid, + struct dsa_db db) +{ + struct dsa_mac_addr *a; + int err = 0; + + mutex_lock(&lag->fdb_lock); + + a = dsa_mac_addr_find(&lag->fdbs, addr, vid, db); + if (a) { + refcount_inc(&a->refcount); + goto out; + } + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) { + err = -ENOMEM; + goto out; + } + + err = ds->ops->lag_fdb_add(ds, *lag, addr, vid, db); + if (err) { + kfree(a); + goto out; + } + + ether_addr_copy(a->addr, addr); + a->vid = vid; + refcount_set(&a->refcount, 1); + list_add_tail(&a->list, &lag->fdbs); + +out: + mutex_unlock(&lag->fdb_lock); + + return err; +} + +static int dsa_switch_do_lag_fdb_del(struct dsa_switch *ds, struct dsa_lag *lag, + const unsigned char *addr, u16 vid, + struct dsa_db db) +{ + struct dsa_mac_addr *a; + int err = 0; + + mutex_lock(&lag->fdb_lock); + + a = dsa_mac_addr_find(&lag->fdbs, addr, vid, db); + if (!a) { + err = -ENOENT; + goto out; + } + + if (!refcount_dec_and_test(&a->refcount)) + goto out; + + err = ds->ops->lag_fdb_del(ds, *lag, addr, vid, db); + if (err) { + refcount_set(&a->refcount, 1); + goto out; + } + + list_del(&a->list); + kfree(a); + +out: + mutex_unlock(&lag->fdb_lock); + + return err; +} + static int dsa_switch_host_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { @@ -397,7 +475,8 @@ static int dsa_switch_host_fdb_add(struct dsa_switch *ds, dsa_switch_for_each_port(dp, ds) { if (dsa_port_host_address_match(dp, info->sw_index, info->port)) { - err = dsa_port_do_fdb_add(dp, info->addr, info->vid); + err = dsa_port_do_fdb_add(dp, info->addr, info->vid, + info->db); if (err) break; } @@ -418,7 +497,8 @@ static int dsa_switch_host_fdb_del(struct dsa_switch *ds, dsa_switch_for_each_port(dp, ds) { if (dsa_port_host_address_match(dp, info->sw_index, info->port)) { - err = 
dsa_port_do_fdb_del(dp, info->addr, info->vid); + err = dsa_port_do_fdb_del(dp, info->addr, info->vid, + info->db); if (err) break; } @@ -436,7 +516,7 @@ static int dsa_switch_fdb_add(struct dsa_switch *ds, if (!ds->ops->port_fdb_add) return -EOPNOTSUPP; - return dsa_port_do_fdb_add(dp, info->addr, info->vid); + return dsa_port_do_fdb_add(dp, info->addr, info->vid, info->db); } static int dsa_switch_fdb_del(struct dsa_switch *ds, @@ -448,7 +528,43 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds, if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - return dsa_port_do_fdb_del(dp, info->addr, info->vid); + return dsa_port_do_fdb_del(dp, info->addr, info->vid, info->db); +} + +static int dsa_switch_lag_fdb_add(struct dsa_switch *ds, + struct dsa_notifier_lag_fdb_info *info) +{ + struct dsa_port *dp; + + if (!ds->ops->lag_fdb_add) + return -EOPNOTSUPP; + + /* Notify switch only if it has a port in this LAG */ + dsa_switch_for_each_port(dp, ds) + if (dsa_port_offloads_lag(dp, info->lag)) + return dsa_switch_do_lag_fdb_add(ds, info->lag, + info->addr, info->vid, + info->db); + + return 0; +} + +static int dsa_switch_lag_fdb_del(struct dsa_switch *ds, + struct dsa_notifier_lag_fdb_info *info) +{ + struct dsa_port *dp; + + if (!ds->ops->lag_fdb_del) + return -EOPNOTSUPP; + + /* Notify switch only if it has a port in this LAG */ + dsa_switch_for_each_port(dp, ds) + if (dsa_port_offloads_lag(dp, info->lag)) + return dsa_switch_do_lag_fdb_del(ds, info->lag, + info->addr, info->vid, + info->db); + + return 0; } static int dsa_switch_lag_change(struct dsa_switch *ds, @@ -501,7 +617,7 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, if (!ds->ops->port_mdb_add) return -EOPNOTSUPP; - return dsa_port_do_mdb_add(dp, info->mdb); + return dsa_port_do_mdb_add(dp, info->mdb, info->db); } static int dsa_switch_mdb_del(struct dsa_switch *ds, @@ -513,7 +629,7 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds, if (!ds->ops->port_mdb_del) return -EOPNOTSUPP; - return dsa_port_do_mdb_del(dp, info->mdb); + return dsa_port_do_mdb_del(dp, info->mdb, info->db); } static int dsa_switch_host_mdb_add(struct dsa_switch *ds, @@ -528,7 +644,7 @@ static int dsa_switch_host_mdb_add(struct dsa_switch *ds, dsa_switch_for_each_port(dp, ds) { if (dsa_port_host_address_match(dp, info->sw_index, info->port)) { - err = dsa_port_do_mdb_add(dp, info->mdb); + err = dsa_port_do_mdb_add(dp, info->mdb, info->db); if (err) break; } @@ -549,7 +665,7 @@ static int dsa_switch_host_mdb_del(struct dsa_switch *ds, dsa_switch_for_each_port(dp, ds) { if (dsa_port_host_address_match(dp, info->sw_index, info->port)) { - err = dsa_port_do_mdb_del(dp, info->mdb); + err = dsa_port_do_mdb_del(dp, info->mdb, info->db); if (err) break; } @@ -558,6 +674,7 @@ static int dsa_switch_host_mdb_del(struct dsa_switch *ds, return err; } +/* Port VLANs match on the targeted port and on all DSA ports */ static bool dsa_port_vlan_match(struct dsa_port *dp, struct dsa_notifier_vlan_info *info) { @@ -570,6 +687,126 @@ static bool dsa_port_vlan_match(struct dsa_port *dp, return false; } +/* Host VLANs match on the targeted port's CPU port, and on all DSA ports + * (upstream and downstream) of that switch and its upstream switches. 
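+ *
+ * For example, in a daisy chain where sw0 holds the CPU port and sw2 hangs
+ * off sw1, a host VLAN targeted at a user port of sw2 is installed on sw0's
+ * CPU port and on the DSA ports of sw0, sw1 and sw2, but not on switches
+ * that are not upstream of sw2.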
+ */ +static bool dsa_port_host_vlan_match(struct dsa_port *dp, + struct dsa_notifier_vlan_info *info) +{ + struct dsa_port *targeted_dp, *cpu_dp; + struct dsa_switch *targeted_ds; + + targeted_ds = dsa_switch_find(dp->ds->dst->index, info->sw_index); + targeted_dp = dsa_to_port(targeted_ds, info->port); + cpu_dp = targeted_dp->cpu_dp; + + if (dsa_switch_is_upstream_of(dp->ds, targeted_ds)) + return dsa_port_is_dsa(dp) || dp == cpu_dp; + + return false; +} + +static struct dsa_vlan *dsa_vlan_find(struct list_head *vlan_list, + const struct switchdev_obj_port_vlan *vlan) +{ + struct dsa_vlan *v; + + list_for_each_entry(v, vlan_list, list) + if (v->vid == vlan->vid) + return v; + + return NULL; +} + +static int dsa_port_do_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + struct dsa_vlan *v; + int err = 0; + + /* No need to bother with refcounting for user ports. */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_vlan_add(ds, port, vlan, extack); + + /* No need to propagate on shared ports the existing VLANs that were + * re-notified after just the flags have changed. This would cause a + * refcount bump which we need to avoid, since it unbalances the + * additions with the deletions. + */ + if (vlan->changed) + return 0; + + mutex_lock(&dp->vlans_lock); + + v = dsa_vlan_find(&dp->vlans, vlan); + if (v) { + refcount_inc(&v->refcount); + goto out; + } + + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) { + err = -ENOMEM; + goto out; + } + + err = ds->ops->port_vlan_add(ds, port, vlan, extack); + if (err) { + kfree(v); + goto out; + } + + v->vid = vlan->vid; + refcount_set(&v->refcount, 1); + list_add_tail(&v->list, &dp->vlans); + +out: + mutex_unlock(&dp->vlans_lock); + + return err; +} + +static int dsa_port_do_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + struct dsa_vlan *v; + int err = 0; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_vlan_del(ds, port, vlan); + + mutex_lock(&dp->vlans_lock); + + v = dsa_vlan_find(&dp->vlans, vlan); + if (!v) { + err = -ENOENT; + goto out; + } + + if (!refcount_dec_and_test(&v->refcount)) + goto out; + + err = ds->ops->port_vlan_del(ds, port, vlan); + if (err) { + refcount_set(&v->refcount, 1); + goto out; + } + + list_del(&v->list); + kfree(v); + +out: + mutex_unlock(&dp->vlans_lock); + + return err; +} + static int dsa_switch_vlan_add(struct dsa_switch *ds, struct dsa_notifier_vlan_info *info) { @@ -581,8 +818,8 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, dsa_switch_for_each_port(dp, ds) { if (dsa_port_vlan_match(dp, info)) { - err = ds->ops->port_vlan_add(ds, dp->index, info->vlan, - info->extack); + err = dsa_port_do_vlan_add(dp, info->vlan, + info->extack); if (err) return err; } @@ -594,15 +831,61 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, static int dsa_switch_vlan_del(struct dsa_switch *ds, struct dsa_notifier_vlan_info *info) { + struct dsa_port *dp; + int err; + if (!ds->ops->port_vlan_del) return -EOPNOTSUPP; - if (ds->index == info->sw_index) - return ds->ops->port_vlan_del(ds, info->port, info->vlan); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_vlan_match(dp, info)) { + err = dsa_port_do_vlan_del(dp, info->vlan); + if (err) + return err; + } + } + + return 0; +} + +static int 
dsa_switch_host_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_vlan_info *info) +{ + struct dsa_port *dp; + int err; + + if (!ds->ops->port_vlan_add) + return -EOPNOTSUPP; + + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_vlan_match(dp, info)) { + err = dsa_port_do_vlan_add(dp, info->vlan, + info->extack); + if (err) + return err; + } + } + + return 0; +} + +static int dsa_switch_host_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_vlan_info *info) +{ + struct dsa_port *dp; + int err; + + if (!ds->ops->port_vlan_del) + return -EOPNOTSUPP; + + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_vlan_match(dp, info)) { + err = dsa_port_do_vlan_del(dp, info->vlan); + if (err) + return err; + } + } - /* Do not deprogram the DSA links as they may be used as conduit - * for other VLAN members in the fabric. - */ return 0; } @@ -737,6 +1020,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_HOST_FDB_DEL: err = dsa_switch_host_fdb_del(ds, info); break; + case DSA_NOTIFIER_LAG_FDB_ADD: + err = dsa_switch_lag_fdb_add(ds, info); + break; + case DSA_NOTIFIER_LAG_FDB_DEL: + err = dsa_switch_lag_fdb_del(ds, info); + break; case DSA_NOTIFIER_LAG_CHANGE: err = dsa_switch_lag_change(ds, info); break; @@ -764,6 +1053,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_VLAN_DEL: err = dsa_switch_vlan_del(ds, info); break; + case DSA_NOTIFIER_HOST_VLAN_ADD: + err = dsa_switch_host_vlan_add(ds, info); + break; + case DSA_NOTIFIER_HOST_VLAN_DEL: + err = dsa_switch_host_vlan_del(ds, info); + break; case DSA_NOTIFIER_MTU: err = dsa_switch_mtu(ds, info); break; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 27712a81c967..a786569203f0 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -16,15 +16,11 @@ * * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | * +-----------+-----+-----------------+-----------+-----------------------+ - * | DIR | VBID| SWITCH_ID | VBID | PORT | + * | RSV | VBID| SWITCH_ID | VBID | PORT | * +-----------+-----+-----------------+-----------+-----------------------+ * - * DIR - VID[11:10]: - * Direction flags. - * * 1 (0b01) for RX VLAN, - * * 2 (0b10) for TX VLAN. - * These values make the special VIDs of 0, 1 and 4095 to be left - * unused by this coding scheme. + * RSV - VID[11:10]: + * Reserved. Must be set to 3 (0b11). * * SWITCH_ID - VID[8:6]: * Index of switch within DSA tree. Must be between 0 and 7. @@ -32,18 +28,17 @@ * VBID - { VID[9], VID[5:4] }: * Virtual bridge ID. If between 1 and 7, packet targets the broadcast * domain of a bridge. If transmitted as zero, packet targets a single - * port. Field only valid on transmit, must be ignored on receive. + * port. * * PORT - VID[3:0]: * Index of switch port. Must be between 0 and 15. 
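 *
 * For example, the standalone VID of port 2 on switch 1 is
 * 0xC00 (RSV) | 0x040 (SWITCH_ID 1) | 0x002 (PORT 2) = 0xC42, and the
 * bridge VID for bridge number 1 is 0xC00 | 0x010 (VBID 1 in VID[5:4]) = 0xC10.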
*/ -#define DSA_8021Q_DIR_SHIFT 10 -#define DSA_8021Q_DIR_MASK GENMASK(11, 10) -#define DSA_8021Q_DIR(x) (((x) << DSA_8021Q_DIR_SHIFT) & \ - DSA_8021Q_DIR_MASK) -#define DSA_8021Q_DIR_RX DSA_8021Q_DIR(1) -#define DSA_8021Q_DIR_TX DSA_8021Q_DIR(2) +#define DSA_8021Q_RSV_VAL 3 +#define DSA_8021Q_RSV_SHIFT 10 +#define DSA_8021Q_RSV_MASK GENMASK(11, 10) +#define DSA_8021Q_RSV ((DSA_8021Q_RSV_VAL << DSA_8021Q_RSV_SHIFT) & \ + DSA_8021Q_RSV_MASK) #define DSA_8021Q_SWITCH_ID_SHIFT 6 #define DSA_8021Q_SWITCH_ID_MASK GENMASK(8, 6) @@ -67,34 +62,24 @@ #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ DSA_8021Q_PORT_MASK) -u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num) +u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num) { /* The VBID value of 0 is reserved for precise TX, but it is also * reserved/invalid for the bridge_num, so all is well. */ - return DSA_8021Q_DIR_TX | DSA_8021Q_VBID(bridge_num); + return DSA_8021Q_RSV | DSA_8021Q_VBID(bridge_num); } -EXPORT_SYMBOL_GPL(dsa_8021q_bridge_tx_fwd_offload_vid); - -/* Returns the VID to be inserted into the frame from xmit for switch steering - * instructions on egress. Encodes switch ID and port ID. - */ -u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp) -{ - return DSA_8021Q_DIR_TX | DSA_8021Q_SWITCH_ID(dp->ds->index) | - DSA_8021Q_PORT(dp->index); -} -EXPORT_SYMBOL_GPL(dsa_tag_8021q_tx_vid); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_vid); /* Returns the VID that will be installed as pvid for this switch port, sent as * tagged egress towards the CPU port and decoded by the rcv function. */ -u16 dsa_tag_8021q_rx_vid(const struct dsa_port *dp) +u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp) { - return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(dp->ds->index) | + return DSA_8021Q_RSV | DSA_8021Q_SWITCH_ID(dp->ds->index) | DSA_8021Q_PORT(dp->index); } -EXPORT_SYMBOL_GPL(dsa_tag_8021q_rx_vid); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_standalone_vid); /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) @@ -110,21 +95,20 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); -bool vid_is_dsa_8021q_rxvlan(u16 vid) +/* Returns the decoded VBID from the RX VID. 
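+ * E.g. VID 0xC10 (bridge 1) decodes to VBID 1, while a standalone VID such
+ * as 0xC42 decodes to VBID 0, i.e. a precise source port with no bridge.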
*/ +static int dsa_tag_8021q_rx_vbid(u16 vid) { - return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX; -} -EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_rxvlan); + u16 vbid_hi = (vid & DSA_8021Q_VBID_HI_MASK) >> DSA_8021Q_VBID_HI_SHIFT; + u16 vbid_lo = (vid & DSA_8021Q_VBID_LO_MASK) >> DSA_8021Q_VBID_LO_SHIFT; -bool vid_is_dsa_8021q_txvlan(u16 vid) -{ - return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX; + return (vbid_hi << 2) | vbid_lo; } -EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_txvlan); bool vid_is_dsa_8021q(u16 vid) { - return vid_is_dsa_8021q_rxvlan(vid) || vid_is_dsa_8021q_txvlan(vid); + u16 rsv = (vid & DSA_8021Q_RSV_MASK) >> DSA_8021Q_RSV_SHIFT; + + return rsv == DSA_8021Q_RSV_VAL; } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); @@ -242,12 +226,8 @@ int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, u16 flags = 0; if (dsa_port_is_user(dp)) - flags |= BRIDGE_VLAN_INFO_UNTAGGED; - - if (vid_is_dsa_8021q_rxvlan(info->vid) && - dsa_8021q_rx_switch_id(info->vid) == ds->index && - dsa_8021q_rx_source_port(info->vid) == dp->index) - flags |= BRIDGE_VLAN_INFO_PVID; + flags |= BRIDGE_VLAN_INFO_UNTAGGED | + BRIDGE_VLAN_INFO_PVID; err = dsa_port_do_tag_8021q_vlan_add(dp, info->vid, flags); @@ -279,162 +259,78 @@ int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, return 0; } -/* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single - * front-panel switch port (here swp0). +/* There are 2 ways of offloading tag_8021q VLANs. * - * Port identification through VLAN (802.1Q) tags has different requirements - * for it to work effectively: - * - On RX (ingress from network): each front-panel port must have a pvid - * that uniquely identifies it, and the egress of this pvid must be tagged - * towards the CPU port, so that software can recover the source port based - * on the VID in the frame. But this would only work for standalone ports; - * if bridged, this VLAN setup would break autonomous forwarding and would - * force all switched traffic to pass through the CPU. So we must also make - * the other front-panel ports members of this VID we're adding, albeit - * we're not making it their PVID (they'll still have their own). - * - On TX (ingress from CPU and towards network) we are faced with a problem. - * If we were to tag traffic (from within DSA) with the port's pvid, all - * would be well, assuming the switch ports were standalone. Frames would - * have no choice but to be directed towards the correct front-panel port. - * But because we also want the RX VLAN to not break bridging, then - * inevitably that means that we have to give them a choice (of what - * front-panel port to go out on), and therefore we cannot steer traffic - * based on the RX VID. So what we do is simply install one more VID on the - * front-panel and CPU ports, and profit off of the fact that steering will - * work just by virtue of the fact that there is only one other port that's - * a member of the VID we're tagging the traffic with - the desired one. + * One is to use a hardware TCAM to push the port's standalone VLAN into the + * frame when forwarding it to the CPU, as an egress modification rule on the + * CPU port. This is preferable because it has no side effects for the + * autonomous forwarding path, and accomplishes tag_8021q's primary goal of + * identifying the source port of each packet based on VLAN ID. * - * So at the end, each front-panel port will have one RX VID (also the PVID), - * the RX VID of all other front-panel ports that are in the same bridge, and - * one TX VID. 
Whereas the CPU port will have the RX and TX VIDs of all - * front-panel ports, and on top of that, is also tagged-input and - * tagged-output (VLAN trunk). + * The other is to commit the tag_8021q VLAN as a PVID to the VLAN table, and + * to configure the port as VLAN-unaware. This is less preferable because + * unique source port identification can only be done for standalone ports; + * under a VLAN-unaware bridge, all ports share the same tag_8021q VLAN as + * PVID, and under a VLAN-aware bridge, packets received by software will not + * have tag_8021q VLANs appended, just bridge VLANs. * - * CPU port CPU port - * +-------------+-----+-------------+ +-------------+-----+-------------+ - * | RX VID | | | | TX VID | | | - * | of swp0 | | | | of swp0 | | | - * | +-----+ | | +-----+ | - * | ^ T | | | Tagged | - * | | | | | ingress | - * | +-------+---+---+-------+ | | +-----------+ | - * | | | | | | | | Untagged | - * | | U v U v U v | | v egress | - * | +-----+ +-----+ +-----+ +-----+ | | +-----+ +-----+ +-----+ +-----+ | - * | | | | | | | | | | | | | | | | | | | | - * | |PVID | | | | | | | | | | | | | | | | | | - * +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+ - * swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3 + * For tag_8021q implementations of the second type, this method is used to + * replace the standalone tag_8021q VLAN of a port with the tag_8021q VLAN to + * be used for VLAN-unaware bridging. */ -static bool -dsa_port_tag_8021q_bridge_match(struct dsa_port *dp, - struct dsa_notifier_bridge_info *info) +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, + struct dsa_bridge bridge) { - /* Don't match on self */ - if (dp->ds->dst->index == info->tree_index && - dp->ds->index == info->sw_index && - dp->index == info->port) - return false; - - if (dsa_port_is_user(dp)) - return dsa_port_offloads_bridge(dp, &info->bridge); - - return false; -} - -int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, - struct dsa_notifier_bridge_info *info) -{ - struct dsa_switch *targeted_ds; - struct dsa_port *targeted_dp; - struct dsa_port *dp; - u16 targeted_rx_vid; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 standalone_vid, bridge_vid; int err; - if (!ds->tag_8021q_ctx) - return 0; - - targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); - targeted_dp = dsa_to_port(targeted_ds, info->port); - targeted_rx_vid = dsa_tag_8021q_rx_vid(targeted_dp); - - dsa_switch_for_each_port(dp, ds) { - u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - - if (!dsa_port_tag_8021q_bridge_match(dp, info)) - continue; + /* Delete the standalone VLAN of the port and replace it with a + * bridging VLAN + */ + standalone_vid = dsa_tag_8021q_standalone_vid(dp); + bridge_vid = dsa_tag_8021q_bridge_vid(bridge.num); - /* Install the RX VID of the targeted port in our VLAN table */ - err = dsa_port_tag_8021q_vlan_add(dp, targeted_rx_vid, true); - if (err) - return err; + err = dsa_port_tag_8021q_vlan_add(dp, bridge_vid, true); + if (err) + return err; - /* Install our RX VID into the targeted port's VLAN table */ - err = dsa_port_tag_8021q_vlan_add(targeted_dp, rx_vid, true); - if (err) - return err; - } + dsa_port_tag_8021q_vlan_del(dp, standalone_vid, false); return 0; } +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_join); -int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, - struct dsa_notifier_bridge_info *info) +void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_bridge bridge) { - struct dsa_switch *targeted_ds; - struct dsa_port *targeted_dp; - struct 
dsa_port *dp; - u16 targeted_rx_vid; - - if (!ds->tag_8021q_ctx) - return 0; - - targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); - targeted_dp = dsa_to_port(targeted_ds, info->port); - targeted_rx_vid = dsa_tag_8021q_rx_vid(targeted_dp); - - dsa_switch_for_each_port(dp, ds) { - u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - - if (!dsa_port_tag_8021q_bridge_match(dp, info)) - continue; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 standalone_vid, bridge_vid; + int err; - /* Remove the RX VID of the targeted port from our VLAN table */ - dsa_port_tag_8021q_vlan_del(dp, targeted_rx_vid, true); + /* Delete the bridging VLAN of the port and replace it with a + * standalone VLAN + */ + standalone_vid = dsa_tag_8021q_standalone_vid(dp); + bridge_vid = dsa_tag_8021q_bridge_vid(bridge.num); - /* Remove our RX VID from the targeted port's VLAN table */ - dsa_port_tag_8021q_vlan_del(targeted_dp, rx_vid, true); + err = dsa_port_tag_8021q_vlan_add(dp, standalone_vid, false); + if (err) { + dev_err(ds->dev, + "Failed to delete tag_8021q standalone VLAN %d from port %d: %pe\n", + standalone_vid, port, ERR_PTR(err)); } - return 0; -} - -int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, - struct dsa_bridge bridge) -{ - u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); - - return dsa_port_tag_8021q_vlan_add(dsa_to_port(ds, port), tx_vid, - true); + dsa_port_tag_8021q_vlan_del(dp, bridge_vid, true); } -EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_offload); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_leave); -void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, - struct dsa_bridge bridge) -{ - u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); - - dsa_port_tag_8021q_vlan_del(dsa_to_port(ds, port), tx_vid, true); -} -EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_unoffload); - -/* Set up a port's tag_8021q RX and TX VLAN for standalone mode operation */ +/* Set up a port's standalone tag_8021q VLAN */ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) { struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; struct dsa_port *dp = dsa_to_port(ds, port); - u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 vid = dsa_tag_8021q_standalone_vid(dp); struct net_device *master; int err; @@ -446,30 +342,16 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) master = dp->cpu_dp->master; - /* Add this user port's RX VID to the membership list of all others - * (including itself). This is so that bridging will not be hindered. - * L2 forwarding rules still take precedence when there are no VLAN - * restrictions, so there are no concerns about leaking traffic. - */ - err = dsa_port_tag_8021q_vlan_add(dp, rx_vid, false); + err = dsa_port_tag_8021q_vlan_add(dp, vid, false); if (err) { dev_err(ds->dev, - "Failed to apply RX VID %d to port %d: %pe\n", - rx_vid, port, ERR_PTR(err)); + "Failed to apply standalone VID %d to port %d: %pe\n", + vid, port, ERR_PTR(err)); return err; } - /* Add @rx_vid to the master's RX filter. */ - vlan_vid_add(master, ctx->proto, rx_vid); - - /* Finally apply the TX VID on this port and on the CPU port */ - err = dsa_port_tag_8021q_vlan_add(dp, tx_vid, false); - if (err) { - dev_err(ds->dev, - "Failed to apply TX VID %d on port %d: %pe\n", - tx_vid, port, ERR_PTR(err)); - return err; - } + /* Add the VLAN to the master's RX filter. 
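+ * Frames reach the DSA master still tagged with this VID, so without
+ * this step a master with VLAN RX filtering enabled would drop them
+ * before the tagger gets to see them.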
*/ + vlan_vid_add(master, ctx->proto, vid); return err; } @@ -478,8 +360,7 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port) { struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; struct dsa_port *dp = dsa_to_port(ds, port); - u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 vid = dsa_tag_8021q_standalone_vid(dp); struct net_device *master; /* The CPU port is implicitly configured by @@ -490,11 +371,9 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port) master = dp->cpu_dp->master; - dsa_port_tag_8021q_vlan_del(dp, rx_vid, false); + dsa_port_tag_8021q_vlan_del(dp, vid, false); - vlan_vid_del(master, ctx->proto, rx_vid); - - dsa_port_tag_8021q_vlan_del(dp, tx_vid, false); + vlan_vid_del(master, ctx->proto, vid); } static int dsa_tag_8021q_setup(struct dsa_switch *ds) @@ -573,23 +452,57 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id) +struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, + int vbid) +{ + struct dsa_port *cpu_dp = master->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct dsa_port *dp; + + if (WARN_ON(!vbid)) + return NULL; + + dsa_tree_for_each_user_port(dp, dst) { + if (!dp->bridge) + continue; + + if (dp->stp_state != BR_STATE_LEARNING && + dp->stp_state != BR_STATE_FORWARDING) + continue; + + if (dp->cpu_dp != cpu_dp) + continue; + + if (dsa_port_bridge_num_get(dp) == vbid) + return dp->slave; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(dsa_tag_8021q_find_port_by_vbid); + +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *vbid) { u16 vid, tci; - skb_push_rcsum(skb, ETH_HLEN); if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); __vlan_hwaccel_clear_tag(skb); } else { + skb_push_rcsum(skb, ETH_HLEN); __skb_vlan_pop(skb, &tci); + skb_pull_rcsum(skb, ETH_HLEN); } - skb_pull_rcsum(skb, ETH_HLEN); vid = tci & VLAN_VID_MASK; *source_port = dsa_8021q_rx_source_port(vid); *switch_id = dsa_8021q_rx_switch_id(vid); + + if (vbid) + *vbid = dsa_tag_8021q_rx_vbid(vid); + skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } EXPORT_SYMBOL_GPL(dsa_8021q_rcv); diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 8abf39dcac64..e4b6e3f2a3db 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -127,6 +127,7 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, u8 extra) { struct dsa_port *dp = dsa_slave_to_port(dev); + struct net_device *br_dev; u8 tag_dev, tag_port; enum dsa_cmd cmd; u8 *dsa_header; @@ -149,7 +150,16 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, tag_port = dp->index; } - if (skb->protocol == htons(ETH_P_8021Q)) { + br_dev = dsa_port_bridge_dev_get(dp); + + /* If frame is already 802.1Q tagged, we can convert it to a DSA + * tag (avoiding a memmove), but only if the port is standalone + * (in which case we always send FROM_CPU) or if the port's + * bridge has VLAN filtering enabled (in which case the CPU port + * will be a member of the VLAN). 
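+ * Under a VLAN-unaware bridge the frame's VID is not programmed into
+ * the VTU, so in that case we leave the 802.1Q header in the payload
+ * and prepend a separate DSA tag carrying MV88E6XXX_VID_BRIDGED instead.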
+ */ + if (skb->protocol == htons(ETH_P_8021Q) && + (!br_dev || br_vlan_enabled(br_dev))) { if (extra) { skb_push(skb, extra); dsa_alloc_etype_header(skb, extra); @@ -166,10 +176,9 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, dsa_header[2] &= ~0x10; } } else { - struct net_device *br = dsa_port_bridge_dev_get(dp); u16 vid; - vid = br ? MV88E6XXX_VID_BRIDGED : MV88E6XXX_VID_STANDALONE; + vid = br_dev ? MV88E6XXX_VID_BRIDGED : MV88E6XXX_VID_STANDALONE; skb_push(skb, DSA_HLEN + extra); dsa_alloc_etype_header(skb, DSA_HLEN + extra); @@ -246,12 +255,14 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, if (trunk) { struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_lag *lag; /* The exact source port is not available in the tag, * so we inject the frame directly on the upper * team/bond. */ - skb->dev = dsa_lag_dev(cpu_dp->dst, source_port); + lag = dsa_lag_by_id(cpu_dp->dst, source_port + 1); + skb->dev = lag ? lag->dev : NULL; } else { skb->dev = dsa_master_find_slave(dev, source_device, source_port); diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index cb548188f813..98d7d7120bab 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -77,7 +77,6 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev) { - __be16 *lan9303_tag; u16 lan9303_tag1; unsigned int source_port; @@ -87,14 +86,15 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev) return NULL; } - lan9303_tag = dsa_etype_header_pos_rx(skb); - - if (lan9303_tag[0] != htons(ETH_P_8021Q)) { - dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid VLAN marker\n"); - return NULL; + if (skb_vlan_tag_present(skb)) { + lan9303_tag1 = skb_vlan_tag_get(skb); + __vlan_hwaccel_clear_tag(skb); + } else { + skb_push_rcsum(skb, ETH_HLEN); + __skb_vlan_pop(skb, &lan9303_tag1); + skb_pull_rcsum(skb, ETH_HLEN); } - lan9303_tag1 = ntohs(lan9303_tag[1]); source_port = lan9303_tag1 & 0x3; skb->dev = dsa_master_find_slave(dev, 0, source_port); @@ -103,13 +103,6 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev) return NULL; } - /* remove the special VLAN tag between the MAC addresses - * and the current ethertype field. - */ - skb_pull_rcsum(skb, 2 + 2); - - dsa_strip_etype_header(skb, LAN9303_TAG_LEN); - if (!(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU)) dsa_default_offload_fwd_mark(skb); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 68982b2789a5..37ccf00404ea 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -32,6 +32,13 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp, if (!xmit_work_fn || !xmit_worker) return NULL; + /* PTP over IP packets need UDP checksumming. We may have inherited + * NETIF_F_HW_CSUM from the DSA master, but these packets are not sent + * through the DSA master, so calculate the checksum here. 
+ */ + if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) + return NULL; + xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); if (!xmit_work) return NULL; @@ -55,7 +62,7 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct dsa_port *dp = dsa_slave_to_port(netdev); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 tx_vid = dsa_tag_8021q_standalone_vid(dp); struct ethhdr *hdr = eth_hdr(skb); if (ocelot_ptp_rew_op(skb) || is_link_local_ether_addr(hdr->h_dest)) @@ -70,7 +77,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, { int src_port, switch_id; - dsa_8021q_rcv(skb, &src_port, &switch_id); + dsa_8021q_rcv(skb, &src_port, &switch_id, NULL); skb->dev = dsa_master_find_slave(netdev, switch_id, src_port); if (!skb->dev) diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c index 02686ad4045d..a593ead7ff26 100644 --- a/net/dsa/tag_rtl8_4.c +++ b/net/dsa/tag_rtl8_4.c @@ -7,13 +7,8 @@ * NOTE: Currently only supports protocol "4" found in the RTL8365MB, hence * named tag_rtl8_4. * - * This tag header has the following format: + * This tag has the following format: * - * ------------------------------------------- - * | MAC DA | MAC SA | 8 byte tag | Type | ... - * ------------------------------------------- - * _______________/ \______________________________________ - * / \ * 0 7|8 15 * |-----------------------------------+-----------------------------------|--- * | (16-bit) | ^ @@ -58,6 +53,24 @@ * TX/RX | TX (switch->CPU): port number the packet was received on * | RX (CPU->switch): forwarding port mask (if ALLOW=0) * | allowance port mask (if ALLOW=1) + * + * The tag can be positioned before Ethertype, using tag "rtl8_4": + * + * +--------+--------+------------+------+----- + * | MAC DA | MAC SA | 8 byte tag | Type | ... + * +--------+--------+------------+------+----- + * + * The tag can also appear between the end of the payload and before the CRC, + * using tag "rtl8_4t": + * + * +--------+--------+------+-----+---------+------------+-----+ + * | MAC DA | MAC SA | TYPE | ... | payload | 8-byte tag | CRC | + * +--------+--------+------+-----+---------+------------+-----+ + * + * The added bytes after the payload will break most checksums, either in + * software or hardware. To avoid this issue, if the checksum is still pending, + * this tagger checksums the packet in software before adding the tag. 
+ * */ #include <linux/bitfield.h> @@ -84,87 +97,133 @@ #define RTL8_4_TX GENMASK(3, 0) #define RTL8_4_RX GENMASK(10, 0) -static struct sk_buff *rtl8_4_tag_xmit(struct sk_buff *skb, - struct net_device *dev) +static void rtl8_4_write_tag(struct sk_buff *skb, struct net_device *dev, + void *tag) { struct dsa_port *dp = dsa_slave_to_port(dev); - __be16 *tag; - - skb_push(skb, RTL8_4_TAG_LEN); - - dsa_alloc_etype_header(skb, RTL8_4_TAG_LEN); - tag = dsa_etype_header_pos_tx(skb); + __be16 tag16[RTL8_4_TAG_LEN / 2]; /* Set Realtek EtherType */ - tag[0] = htons(ETH_P_REALTEK); + tag16[0] = htons(ETH_P_REALTEK); /* Set Protocol; zero REASON */ - tag[1] = htons(FIELD_PREP(RTL8_4_PROTOCOL, RTL8_4_PROTOCOL_RTL8365MB)); + tag16[1] = htons(FIELD_PREP(RTL8_4_PROTOCOL, RTL8_4_PROTOCOL_RTL8365MB)); /* Zero FID_EN, FID, PRI_EN, PRI, KEEP; set LEARN_DIS */ - tag[2] = htons(FIELD_PREP(RTL8_4_LEARN_DIS, 1)); + tag16[2] = htons(FIELD_PREP(RTL8_4_LEARN_DIS, 1)); /* Zero ALLOW; set RX (CPU->switch) forwarding port mask */ - tag[3] = htons(FIELD_PREP(RTL8_4_RX, BIT(dp->index))); + tag16[3] = htons(FIELD_PREP(RTL8_4_RX, BIT(dp->index))); + + memcpy(tag, tag16, RTL8_4_TAG_LEN); +} + +static struct sk_buff *rtl8_4_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + skb_push(skb, RTL8_4_TAG_LEN); + + dsa_alloc_etype_header(skb, RTL8_4_TAG_LEN); + + rtl8_4_write_tag(skb, dev, dsa_etype_header_pos_tx(skb)); return skb; } -static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb, - struct net_device *dev) +static struct sk_buff *rtl8_4t_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Calculate the checksum here if not done yet as trailing tags will + * break either software or hardware based checksum + */ + if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) + return NULL; + + rtl8_4_write_tag(skb, dev, skb_put(skb, RTL8_4_TAG_LEN)); + + return skb; +} + +static int rtl8_4_read_tag(struct sk_buff *skb, struct net_device *dev, + void *tag) { - __be16 *tag; + __be16 tag16[RTL8_4_TAG_LEN / 2]; u16 etype; u8 reason; u8 proto; u8 port; - if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN))) - return NULL; - - tag = dsa_etype_header_pos_rx(skb); + memcpy(tag16, tag, RTL8_4_TAG_LEN); /* Parse Realtek EtherType */ - etype = ntohs(tag[0]); + etype = ntohs(tag16[0]); if (unlikely(etype != ETH_P_REALTEK)) { dev_warn_ratelimited(&dev->dev, "non-realtek ethertype 0x%04x\n", etype); - return NULL; + return -EPROTO; } /* Parse Protocol */ - proto = FIELD_GET(RTL8_4_PROTOCOL, ntohs(tag[1])); + proto = FIELD_GET(RTL8_4_PROTOCOL, ntohs(tag16[1])); if (unlikely(proto != RTL8_4_PROTOCOL_RTL8365MB)) { dev_warn_ratelimited(&dev->dev, "unknown realtek protocol 0x%02x\n", proto); - return NULL; + return -EPROTO; } /* Parse REASON */ - reason = FIELD_GET(RTL8_4_REASON, ntohs(tag[1])); + reason = FIELD_GET(RTL8_4_REASON, ntohs(tag16[1])); /* Parse TX (switch->CPU) */ - port = FIELD_GET(RTL8_4_TX, ntohs(tag[3])); + port = FIELD_GET(RTL8_4_TX, ntohs(tag16[3])); skb->dev = dsa_master_find_slave(dev, 0, port); if (!skb->dev) { dev_warn_ratelimited(&dev->dev, "could not find slave for port %d\n", port); - return NULL; + return -ENOENT; } + if (reason != RTL8_4_REASON_TRAP) + dsa_default_offload_fwd_mark(skb); + + return 0; +} + +static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN))) + return NULL; + + if (unlikely(rtl8_4_read_tag(skb, dev, dsa_etype_header_pos_rx(skb)))) + return NULL; + /* Remove tag and recalculate checksum */ 
skb_pull_rcsum(skb, RTL8_4_TAG_LEN); dsa_strip_etype_header(skb, RTL8_4_TAG_LEN); - if (reason != RTL8_4_REASON_TRAP) - dsa_default_offload_fwd_mark(skb); + return skb; +} + +static struct sk_buff *rtl8_4t_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + if (skb_linearize(skb)) + return NULL; + + if (unlikely(rtl8_4_read_tag(skb, dev, skb_tail_pointer(skb) - RTL8_4_TAG_LEN))) + return NULL; + + if (pskb_trim_rcsum(skb, skb->len - RTL8_4_TAG_LEN)) + return NULL; return skb; } +/* Ethertype version */ static const struct dsa_device_ops rtl8_4_netdev_ops = { .name = "rtl8_4", .proto = DSA_TAG_PROTO_RTL8_4, @@ -172,7 +231,28 @@ static const struct dsa_device_ops rtl8_4_netdev_ops = { .rcv = rtl8_4_tag_rcv, .needed_headroom = RTL8_4_TAG_LEN, }; -module_dsa_tag_driver(rtl8_4_netdev_ops); -MODULE_LICENSE("GPL"); +DSA_TAG_DRIVER(rtl8_4_netdev_ops); + MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4); + +/* Tail version */ +static const struct dsa_device_ops rtl8_4t_netdev_ops = { + .name = "rtl8_4t", + .proto = DSA_TAG_PROTO_RTL8_4T, + .xmit = rtl8_4t_tag_xmit, + .rcv = rtl8_4t_tag_rcv, + .needed_tailroom = RTL8_4_TAG_LEN, +}; + +DSA_TAG_DRIVER(rtl8_4t_netdev_ops); + +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4T); + +static struct dsa_tag_driver *dsa_tag_drivers[] = { + &DSA_TAG_DRIVER_NAME(rtl8_4_netdev_ops), + &DSA_TAG_DRIVER_NAME(rtl8_4t_netdev_ops), +}; +module_dsa_tag_drivers(dsa_tag_drivers); + +MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 72d5e0ef8dcf..83e4136516b0 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -226,7 +226,7 @@ static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb, * TX VLAN that targets the bridge's entire broadcast domain, * instead of just the specific port. */ - tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); + tx_vid = dsa_tag_8021q_bridge_vid(bridge_num); return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), tx_vid); } @@ -267,7 +267,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct dsa_port *dp = dsa_slave_to_port(netdev); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 tx_vid = dsa_tag_8021q_standalone_vid(dp); if (skb->offload_fwd_mark) return sja1105_imprecise_xmit(skb, netdev); @@ -295,7 +295,7 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb, struct dsa_port *dp = dsa_slave_to_port(netdev); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 tx_vid = dsa_tag_8021q_standalone_vid(dp); __be32 *tx_trailer; __be16 *tx_header; int trailer_pos; @@ -509,7 +509,7 @@ static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb) * packet. 
*/ static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port, - int *switch_id, u16 *vid) + int *switch_id, int *vbid, u16 *vid) { struct vlan_ethhdr *hdr = (struct vlan_ethhdr *)skb_mac_header(skb); u16 vlan_tci; @@ -519,8 +519,8 @@ static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port, else vlan_tci = ntohs(hdr->h_vlan_TCI); - if (vid_is_dsa_8021q_rxvlan(vlan_tci & VLAN_VID_MASK)) - return dsa_8021q_rcv(skb, source_port, switch_id); + if (vid_is_dsa_8021q(vlan_tci & VLAN_VID_MASK)) + return dsa_8021q_rcv(skb, source_port, switch_id, vbid); /* Try our best with imprecise RX */ *vid = vlan_tci & VLAN_VID_MASK; @@ -529,7 +529,7 @@ static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port, static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev) { - int source_port = -1, switch_id = -1; + int source_port = -1, switch_id = -1, vbid = -1; struct sja1105_meta meta = {0}; struct ethhdr *hdr; bool is_link_local; @@ -542,7 +542,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, if (sja1105_skb_has_tag_8021q(skb)) { /* Normal traffic path. */ - sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vbid, &vid); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -561,7 +561,9 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } - if (source_port == -1 || switch_id == -1) + if (vbid >= 1) + skb->dev = dsa_tag_8021q_find_port_by_vbid(netdev, vbid); + else if (source_port == -1 || switch_id == -1) skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); else skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); @@ -686,7 +688,7 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, static struct sk_buff *sja1110_rcv(struct sk_buff *skb, struct net_device *netdev) { - int source_port = -1, switch_id = -1; + int source_port = -1, switch_id = -1, vbid = -1; bool host_only = false; u16 vid = 0; @@ -700,9 +702,11 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb, /* Packets with in-band control extensions might still have RX VLANs */ if (likely(sja1105_skb_has_tag_8021q(skb))) - sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vbid, &vid); - if (source_port == -1 || switch_id == -1) + if (vbid >= 1) + skb->dev = dsa_tag_8021q_find_port_by_vbid(netdev, vbid); + else if (source_port == -1 || switch_id == -1) skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); else skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 75856db299e9..29d01662a48b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -363,7 +363,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; -extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_RX_BUF_LEN + 1]; +extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_CQE_SIZE + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy 
ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 18a5035d3bee..9f33c9689b56 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -54,7 +54,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ - nla_total_size(sizeof(u8)); /* _RINGS_TCP_DATA_SPLIT */ + nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ + nla_total_size(sizeof(u32)); /* _RINGS_CQE_SIZE */ } static int rings_fill_reply(struct sk_buff *skb, @@ -91,7 +92,9 @@ static int rings_fill_reply(struct sk_buff *skb, (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || (kr->tcp_data_split && (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, - kr->tcp_data_split)))) + kr->tcp_data_split))) || + (kr->cqe_size && + (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size)))) return -EMSGSIZE; return 0; @@ -119,6 +122,7 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), }; int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) @@ -159,6 +163,8 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); + ethnl_update_u32(&kernel_ringparam.cqe_size, + tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ret = 0; if (!mod) goto out_ops; @@ -190,6 +196,15 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) goto out_ops; } + if (kernel_ringparam.cqe_size && + !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { + ret = -EOPNOTSUPP; + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_CQE_SIZE], + "setting cqe size not supported"); + goto out_ops; + } + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); if (ret < 0) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index b3c6ffa1894d..584e21788799 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -20,6 +20,13 @@ #include "hsr_framereg.h" #include "hsr_netlink.h" +#ifdef CONFIG_LOCKDEP +int lockdep_hsr_is_held(spinlock_t *lock) +{ + return lockdep_is_held(lock); +} +#endif + u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr) { u32 hash = jhash(addr, ETH_ALEN, hsr->hash_seed); @@ -27,11 +34,12 @@ u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr) return reciprocal_scale(hash, hsr->hash_buckets); } -struct hsr_node *hsr_node_get_first(struct hlist_head *head) +struct hsr_node *hsr_node_get_first(struct hlist_head *head, spinlock_t *lock) { struct hlist_node *first; - first = rcu_dereference(hlist_first_rcu(head)); + first = rcu_dereference_bh_check(hlist_first_rcu(head), + lockdep_hsr_is_held(lock)); if (first) return hlist_entry(first, struct hsr_node, mac_list); @@ -59,7 +67,7 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { struct hsr_node *node; - node = hsr_node_get_first(&hsr->self_node_db); + node = hsr_node_get_first(&hsr->self_node_db, &hsr->list_lock); if (!node) { WARN_ONCE(1, "HSR: No self node\n"); return false; @@ 
-106,7 +114,7 @@ int hsr_create_self_node(struct hsr_priv *hsr, ether_addr_copy(node->macaddress_B, addr_b); spin_lock_bh(&hsr->list_lock); - oldnode = hsr_node_get_first(self_node_db); + oldnode = hsr_node_get_first(self_node_db, &hsr->list_lock); if (oldnode) { hlist_replace_rcu(&oldnode->mac_list, &node->mac_list); spin_unlock_bh(&hsr->list_lock); @@ -125,7 +133,7 @@ void hsr_del_self_node(struct hsr_priv *hsr) struct hsr_node *node; spin_lock_bh(&hsr->list_lock); - node = hsr_node_get_first(self_node_db); + node = hsr_node_get_first(self_node_db, &hsr->list_lock); if (node) { hlist_del_rcu(&node->mac_list); kfree_rcu(node, rcu_head); @@ -191,7 +199,7 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, spin_lock_bh(&hsr->list_lock); hlist_for_each_entry_rcu(node, node_db, mac_list, - lockdep_is_held(&hsr->list_lock)) { + lockdep_hsr_is_held(&hsr->list_lock)) { if (ether_addr_equal(node->macaddress_A, addr)) goto out; if (ether_addr_equal(node->macaddress_B, addr)) @@ -597,7 +605,8 @@ void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, hash = hsr_mac_hash(hsr, addr); if (!_pos) { - node = hsr_node_get_first(&hsr->node_db[hash]); + node = hsr_node_get_first(&hsr->node_db[hash], + &hsr->list_lock); if (node) ether_addr_copy(addr, node->macaddress_A); return node; diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index d7cce6b161e3..f3762e9e42b5 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -28,8 +28,14 @@ struct hsr_frame_info { bool is_from_san; }; +#ifdef CONFIG_LOCKDEP +int lockdep_hsr_is_held(spinlock_t *lock); +#else +#define lockdep_hsr_is_held(lock) 1 +#endif + u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr); -struct hsr_node *hsr_node_get_first(struct hlist_head *head); +struct hsr_node *hsr_node_get_first(struct hlist_head *head, spinlock_t *lock); void hsr_del_self_node(struct hsr_priv *hsr); void hsr_del_nodes(struct hlist_head *node_db); struct hsr_node *hsr_get_node(struct hsr_port *port, struct hlist_head *node_db, diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index ca556bda3467..b158ba409f9a 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -45,22 +45,6 @@ /* PRP V1 life redundancy box MAC address */ #define PRP_TLV_REDBOX_MAC 30 -/* HSR Tag. - * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB, - * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest, - * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr, - * encapsulated protocol } instead. - * - * Field names as defined in the IEC:2010 standard for HSR. 
- */ -struct hsr_tag { - __be16 path_and_LSDU_size; - __be16 sequence_nr; - __be16 encap_proto; -} __packed; - -#define HSR_HLEN 6 - #define HSR_V1_SUP_LSDUSIZE 52 #define HSR_HSIZE_SHIFT 8 diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index be6f06adefe0..a91283d1e5bf 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -130,6 +130,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, goto err; fq->q.stamp = skb->tstamp; + fq->q.mono_delivery_time = skb->mono_delivery_time; if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9c465bac1eb0..72fde2888ad2 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1376,8 +1376,11 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, } ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && ops->callbacks.gso_segment)) + if (likely(ops && ops->callbacks.gso_segment)) { segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; + } if (IS_ERR_OR_NULL(segs)) goto out; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 4db0325f6e1a..2d0c05ca9c6f 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -293,7 +293,7 @@ static int arp_constructor(struct neighbour *neigh) static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { dst_link_failure(skb); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); } /* Create and send an arp packet. */ @@ -1116,13 +1116,18 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) return err; } -static int arp_invalidate(struct net_device *dev, __be32 ip) +int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { + if ((neigh->nud_state & NUD_VALID) && !force) { + neigh_release(neigh); + return 0; + } + if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| @@ -1169,7 +1174,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, if (!dev) return -EINVAL; } - return arp_invalidate(dev, ip); + return arp_invalidate(dev, ip, true); } /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index fba2bffd65f7..53a6b14dc50a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -104,6 +104,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, + [IFA_PROTO] = { .type = NLA_U8 }, }; struct inet_fill_args { @@ -889,6 +890,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); + if (tb[IFA_PROTO]) + ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); + if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; @@ -1625,6 +1629,7 @@ static size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } @@ -1699,6 +1704,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && 
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || + (ifa->ifa_proto && + nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 851f542928a3..70e6c87fbe3d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -446,6 +446,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -455,6 +456,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; @@ -671,7 +676,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index d87f02a6e934..935026f4c807 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -110,8 +110,7 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); + return skb_eth_gso_segment(skb, features, htons(ETH_P_IP)); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, @@ -160,6 +159,9 @@ static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } + if (proto == IPPROTO_IPV6) + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; + __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 54811728d906..7408051632ac 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -437,6 +437,9 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (net->ipv4.fib_has_custom_local_routes || fib4_has_custom_rules(net)) goto full_check; + /* Within the same container, it is regarded as a martian source, + * and the same host but different containers are not. + */ if (inet_lookup_ifaddr_rcu(net, src)) return -EINVAL; @@ -1121,9 +1124,11 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) return; /* Add broadcast address, if it is explicitly assigned. 
*/ - if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) + if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); + arp_invalidate(dev, ifa->ifa_broadcast, false); + } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { @@ -1137,6 +1142,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); + arp_invalidate(dev, prefix | ~mask, false); } } } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index a63014b54809..f9b9e26c32c1 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -17,10 +17,9 @@ struct fib_alias { u8 fa_slen; u32 tb_id; s16 fa_default; - u8 offload:1, - trap:1, - offload_failed:1, - unused:5; + u8 offload; + u8 trap; + u8 offload_failed; struct rcu_head rcu; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c9c4f2f66b38..c5a29703185a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -526,9 +526,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, fri.dst_len = dst_len; fri.tos = inet_dscp_to_dsfield(fa->fa_dscp); fri.type = fa->fa_type; - fri.offload = fa->offload; - fri.trap = fa->trap; - fri.offload_failed = fa->offload_failed; + fri.offload = READ_ONCE(fa->offload); + fri.trap = READ_ONCE(fa->trap); + fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c05cd105e95e..2af2b99e0bea 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1052,19 +1052,23 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) if (!fa_match) goto out; - if (fa_match->offload == fri->offload && fa_match->trap == fri->trap && - fa_match->offload_failed == fri->offload_failed) + /* These are paired with the WRITE_ONCE() happening in this function. + * The reason is that we are only protected by RCU at this point. + */ + if (READ_ONCE(fa_match->offload) == fri->offload && + READ_ONCE(fa_match->trap) == fri->trap && + READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; - fa_match->offload = fri->offload; - fa_match->trap = fri->trap; + WRITE_ONCE(fa_match->offload, fri->offload); + WRITE_ONCE(fa_match->trap, fri->trap); /* 2 means send notifications only if offload_failed was changed. 
*/ if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 && - fa_match->offload_failed == fri->offload_failed) + READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; - fa_match->offload_failed = fri->offload_failed; + WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); if (!net->ipv4.sysctl_fib_notify_on_flag_change) goto out; @@ -2306,9 +2310,9 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, fri.dst_len = KEYLENGTH - fa->fa_slen; fri.tos = inet_dscp_to_dsfield(fa->fa_dscp); fri.type = fa->fa_type; - fri.offload = fa->offload; - fri.trap = fa->trap; - fri.offload_failed = fa->offload_failed; + fri.offload = READ_ONCE(fa->offload); + fri.trap = READ_ONCE(fa->trap); + fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 341096807100..63948f6aeca0 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -572,6 +572,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, skb_mark_not_on_list(head); head->prev = NULL; head->tstamp = q->stamp; + head->mono_delivery_time = q->mono_delivery_time; } EXPORT_SYMBOL(inet_frag_reasm_finish); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 00ec819f949b..92ba3350274b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -79,7 +79,7 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s if (unlikely(opt->optlen)) ip_forward_options(skb); - skb->tstamp = 0; + skb_clear_tstamp(skb); return dst_output(net, sk, skb); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fad803d2d711..fb153569889e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -349,6 +349,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; + qp->q.mono_delivery_time = skb->mono_delivery_time; qp->q.meat += skb->len; qp->ecn |= ecn; add_frag_mem_limit(qp->q.fqdir, skb->truesize); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d94f9f7e60c3..95f7bb052784 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -226,6 +226,7 @@ resubmit: static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0c0574eb5f5b..00b4bf26fd93 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -233,7 +233,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; } @@ -317,7 +317,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk case NET_XMIT_CN: return __ip_finish_output(net, sk, skb) ? 
: ret; default: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } @@ -337,7 +337,7 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk, case NET_XMIT_SUCCESS: break; default: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } @@ -536,7 +536,7 @@ packet_routed: no_route: rcu_read_unlock(); IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES); return -EHOSTUNREACH; } EXPORT_SYMBOL(__ip_queue_xmit); @@ -761,6 +761,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, { struct iphdr *iph; struct sk_buff *skb2; + bool mono_delivery_time = skb->mono_delivery_time; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; @@ -852,7 +853,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } } - skb->tstamp = tstamp; + skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = output(net, sk, skb); if (!err) @@ -908,7 +909,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb2->tstamp = tstamp; + skb_set_delivery_time(skb2, tstamp, mono_delivery_time); err = output(net, sk, skb2); if (err) goto fail; @@ -991,7 +992,7 @@ static int __ip_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -1727,6 +1728,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; + nskb->mono_delivery_time = !!transmit_time; ip_push_pending_frames(sk, &fl4); } out: diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index bcf7bc71cb56..3ee947557b88 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -172,16 +172,22 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) struct sock *sk = NULL; struct inet_sock *isk; struct hlist_nulls_node *hnode; - int dif = skb->dev->ifindex; + int dif, sdif; if (skb->protocol == htons(ETH_P_IP)) { + dif = inet_iif(skb); + sdif = inet_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", (int)ident, &ip_hdr(skb)->daddr, dif); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { + dif = inet6_iif(skb); + sdif = inet6_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", (int)ident, &ipv6_hdr(skb)->daddr, dif); #endif + } else { + return NULL; } read_lock_bh(&ping_table.lock); @@ -221,7 +227,7 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != inet_sdif(skb)) + sk->sk_bound_dev_if != sdif) continue; sock_hold(sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 202d6b1fff43..f444f5983405 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3400,8 +3400,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fa->fa_dscp == inet_dsfield_to_dscp(fri.tos) && fa->fa_info == res.fi && fa->fa_type == fri.type) { - fri.offload = fa->offload; - fri.trap = fa->trap; + fri.offload = READ_ONCE(fa->offload); + fri.trap = READ_ONCE(fa->trap); break; } } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1cae27b5dcd8..ad80d180b60b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ 
b/net/ipv4/sysctl_net_ipv4.c @@ -1272,6 +1272,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ONE, }, { + .procname = "tcp_tso_rtt_log", + .data = &init_net.ipv4.sysctl_tcp_tso_rtt_log, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + }, + { .procname = "tcp_min_rtt_wlen", .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 760e8221d321..cf18fbcbf123 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -688,7 +688,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, return skb->len < size_goal && sock_net(sk)->ipv4.sysctl_tcp_autocorking && !tcp_rtx_queue_empty(sk) && - refcount_read(&sk->sk_wmem_alloc) > skb->truesize; + refcount_read(&sk->sk_wmem_alloc) > skb->truesize && + tcp_skb_can_collapse_to(skb); } void tcp_push(struct sock *sk, int flags, int mss_now, @@ -1683,11 +1684,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (!copied) copied = used; break; - } else if (used <= len) { - seq += used; - copied += used; - offset += used; } + if (WARN_ON_ONCE(used > len)) + used = len; + seq += used; + copied += used; + offset += used; + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it @@ -4431,6 +4434,73 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke } EXPORT_SYMBOL(tcp_md5_hash_key); +/* Called with rcu_read_lock() */ +enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif) +{ + /* + * This gets called for each TCP segment that arrives + * so we want to be efficient. + * We have 3 drop cases: + * o No MD5 hash and one expected. + * o MD5 hash and we're not expecting one. + * o MD5 hash and its wrong. + */ + const __u8 *hash_location = NULL; + struct tcp_md5sig_key *hash_expected; + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + int genhash, l3index; + u8 newhash[16]; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family); + hash_location = tcp_parse_md5sig_option(th); + + /* We've parsed the options - do we have a hash? */ + if (!hash_expected && !hash_location) + return SKB_NOT_DROPPED_YET; + + if (hash_expected && !hash_location) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + return SKB_DROP_REASON_TCP_MD5NOTFOUND; + } + + if (!hash_expected && hash_location) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + return SKB_DROP_REASON_TCP_MD5UNEXPECTED; + } + + /* check the signature */ + genhash = tp->af_specific->calc_md5_hash(newhash, hash_expected, + NULL, skb); + + if (genhash || memcmp(hash_location, newhash, 16) != 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); + if (family == AF_INET) { + net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", + saddr, ntohs(th->source), + daddr, ntohs(th->dest), + genhash ? " tcp_v4_calc_md5_hash failed" + : "", l3index); + } else { + net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n", + genhash ? 
"failed" : "mismatch", + saddr, ntohs(th->source), + daddr, ntohs(th->dest), l3index); + } + return SKB_DROP_REASON_TCP_MD5FAILURE; + } + return SKB_NOT_DROPPED_YET; +} +EXPORT_SYMBOL(tcp_inbound_md5_hash); + #endif void tcp_done(struct sock *sk) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 92e65d56dc2c..2088f93fa37b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4684,10 +4684,16 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, return res; } -static void tcp_drop(struct sock *sk, struct sk_buff *skb) +static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason reason) { sk_drops_add(sk, skb); - __kfree_skb(skb); + kfree_skb_reason(skb, reason); +} + +static void tcp_drop(struct sock *sk, struct sk_buff *skb) +{ + tcp_drop_reason(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED); } /* This one checks to see if we can put data from the @@ -4773,7 +4779,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); sk->sk_data_ready(sk); - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM); return; } @@ -4836,7 +4842,8 @@ coalesce_done: /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, + SKB_DROP_REASON_TCP_OFOMERGE); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4855,7 +4862,8 @@ coalesce_done: TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb1); + tcp_drop_reason(sk, skb1, + SKB_DROP_REASON_TCP_OFOMERGE); goto merge_right; } } else if (tcp_ooo_try_coalesce(sk, skb1, @@ -4883,7 +4891,7 @@ merge_right: tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb1); + tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE); } /* If there is no skb after us, we are the last_skb ! */ if (!skb1) @@ -4982,6 +4990,7 @@ void tcp_data_ready(struct sock *sk) static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + enum skb_drop_reason reason; bool fragstolen; int eaten; @@ -5000,6 +5009,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); + reason = SKB_DROP_REASON_NOT_SPECIFIED; tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. @@ -5008,6 +5018,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (tcp_receive_window(tp) == 0) { + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } @@ -5017,6 +5028,7 @@ queue_and_out: if (skb_queue_len(&sk->sk_receive_queue) == 0) sk_forced_mem_schedule(sk, skb->truesize); else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + reason = SKB_DROP_REASON_PROTO_MEM; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); sk->sk_data_ready(sk); goto drop; @@ -5053,6 +5065,7 @@ queue_and_out: if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { tcp_rcv_spurious_retrans(sk, skb); /* A retransmit, 2nd most common case. Force an immediate ack. 
*/ + reason = SKB_DROP_REASON_TCP_OLD_DATA; NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -5060,13 +5073,16 @@ out_of_window: tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_schedule_ack(sk); drop: - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, reason); return; } /* Out of window. F.e. zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) + if (!before(TCP_SKB_CB(skb)->seq, + tp->rcv_nxt + tcp_receive_window(tp))) { + reason = SKB_DROP_REASON_TCP_OVERWINDOW; goto out_of_window; + } if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ @@ -5076,6 +5092,7 @@ drop: * remembering D-SACK for its head made in previous line. */ if (!tcp_receive_window(tp)) { + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } @@ -5781,6 +5798,7 @@ discard: */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct tcphdr *th = (const struct tcphdr *)skb->data; struct tcp_sock *tp = tcp_sk(sk); unsigned int len = skb->len; @@ -5869,6 +5887,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ + reason = SKB_DROP_REASON_PKT_TOO_SMALL; TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -5924,8 +5943,10 @@ slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; - if (!th->ack && !th->rst && !th->syn) + if (!th->ack && !th->rst && !th->syn) { + reason = SKB_DROP_REASON_TCP_FLAGS; goto discard; + } /* * Standard slow path. @@ -5951,12 +5972,13 @@ step5: return; csum_error: + reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); discard: - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, reason); } EXPORT_SYMBOL(tcp_rcv_established); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6873f46fc8ba..f9cec624068d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1409,72 +1409,6 @@ EXPORT_SYMBOL(tcp_v4_md5_hash_skb); #endif -/* Called with rcu_read_lock() */ -static bool tcp_v4_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb, - int dif, int sdif) -{ -#ifdef CONFIG_TCP_MD5SIG - /* - * This gets called for each TCP segment that arrives - * so we want to be efficient. - * We have 3 drop cases: - * o No MD5 hash and one expected. - * o MD5 hash and we're not expecting one. - * o MD5 hash and its wrong. - */ - const __u8 *hash_location = NULL; - struct tcp_md5sig_key *hash_expected; - const struct iphdr *iph = ip_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); - const union tcp_md5_addr *addr; - unsigned char newhash[16]; - int genhash, l3index; - - /* sdif set, means packet ingressed via a device - * in an L3 domain and dif is set to the l3mdev - */ - l3index = sdif ? dif : 0; - - addr = (union tcp_md5_addr *)&iph->saddr; - hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); - hash_location = tcp_parse_md5sig_option(th); - - /* We've parsed the options - do we have a hash? 
*/ - if (!hash_expected && !hash_location) - return false; - - if (hash_expected && !hash_location) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); - return true; - } - - if (!hash_expected && hash_location) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); - return true; - } - - /* Okay, so this is hash_expected and hash_location - - * so we need to calculate the checksum. - */ - genhash = tcp_v4_md5_hash_skb(newhash, - hash_expected, - NULL, skb); - - if (genhash || memcmp(hash_location, newhash, 16) != 0) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", - &iph->saddr, ntohs(th->source), - &iph->daddr, ntohs(th->dest), - genhash ? " tcp_v4_calc_md5_hash failed" - : "", l3index); - return true; - } - return false; -#endif - return false; -} - static void tcp_v4_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) @@ -1704,6 +1638,7 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, */ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason; struct sock *rsk; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ @@ -1726,6 +1661,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } + reason = SKB_DROP_REASON_NOT_SPECIFIED; if (tcp_checksum_complete(skb)) goto csum_err; @@ -1753,7 +1689,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: tcp_v4_send_reset(rsk, skb); discard: - kfree_skb(skb); + kfree_skb_reason(skb, reason); /* Be careful here. If this function gets more complicated and * gcc suffers from register pressure on the x86, sk (in %ebx) * might be destroyed here. This current version compiles correctly, @@ -1762,6 +1698,7 @@ discard: return 0; csum_err: + reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); @@ -1807,7 +1744,8 @@ int tcp_v4_early_demux(struct sk_buff *skb) return 0; } -bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason) { u32 limit, tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; @@ -1833,6 +1771,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); trace_tcp_bad_csum(skb); + *reason = SKB_DROP_REASON_TCP_CSUM; __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); return true; @@ -1921,6 +1860,7 @@ no_coalesce: if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); + *reason = SKB_DROP_REASON_SOCKET_BACKLOG; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); return true; } @@ -1971,13 +1911,13 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + enum skb_drop_reason drop_reason; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; struct sock *sk; - int drop_reason; int ret; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -2025,7 +1965,10 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { + drop_reason = tcp_inbound_md5_hash(sk, skb, + &iph->saddr, &iph->daddr, + AF_INET, dif, sdif); + if (unlikely(drop_reason)) { sk_drops_add(sk, skb); 
reqsk_put(req); goto discard_it; @@ -2057,6 +2000,8 @@ process: iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + } else { + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); @@ -2092,10 +2037,14 @@ process: } } - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; + } - if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) + drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, + &iph->daddr, AF_INET, dif, sdif); + if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); @@ -2124,7 +2073,7 @@ process: if (!sock_owned_by_user(sk)) { ret = tcp_v4_do_rcv(sk, skb); } else { - if (tcp_add_backlog(sk, skb)) + if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); @@ -2166,6 +2115,7 @@ discard_and_relse: do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } @@ -3187,6 +3137,7 @@ static int __net_init tcp_sk_init(struct net *net) /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; + net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e76bf1e9251e..81aaa7da3e8c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1253,7 +1253,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); if (clone_it) { oskb = skb; @@ -1589,7 +1589,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, skb_split(skb, buff, len); - buff->tstamp = skb->tstamp; + skb_set_delivery_time(buff, skb->tstamp, true); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); @@ -1951,25 +1951,34 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, } /* Return how many segs we'd like on a TSO packet, - * to send one TSO packet per ms + * depending on current pacing rate, and how close the peer is. + * + * Rationale is: + * - For close peers, we rather send bigger packets to reduce + * cpu costs, because occasional losses will be repaired fast. + * - For long distance/rtt flows, we would like to get ACK clocking + * with 1 ACK per ms. + * + * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting + * in bigger TSO bursts. We we cut the RTT-based allowance in half + * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance + * is below 1500 bytes after 6 * ~500 usec = 3ms. */ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs) { - u32 bytes, segs; + unsigned long bytes; + u32 r; - bytes = min_t(unsigned long, - sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - sk->sk_gso_max_size); + bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift); - /* Goal is to send at least one packet per ms, - * not one big TSO packet every 100 ms. - * This preserves ACK clocking and is consistent - * with tcp_tso_should_defer() heuristic. 
- */ - segs = max_t(u32, bytes / mss_now, min_tso_segs); + r = tcp_min_rtt(tcp_sk(sk)) >> sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log; + if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) + bytes += sk->sk_gso_max_size >> r; + + bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size); - return segs; + return max_t(u32, bytes / mss_now, min_tso_segs); } /* Return the number of segments we want in the skb we are transmitting. @@ -2616,7 +2625,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ - skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; + tp->tcp_wstamp_ns = tp->tcp_clock_cache; + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ @@ -3541,11 +3551,12 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) - skb->skb_mstamp_ns = cookie_init_timestamp(req, now); + skb_set_delivery_time(skb, cookie_init_timestamp(req, now), + true); else #endif { - skb->skb_mstamp_ns = now; + skb_set_delivery_time(skb, now, true); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3594,7 +3605,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); - skb->skb_mstamp_ns = now; + skb_set_delivery_time(skb, now, true); tcp_add_tx_delay(skb, tp); return skb; @@ -3771,7 +3782,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - syn->skb_mstamp_ns = syn_data->skb_mstamp_ns; + skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index b91003538d87..bc3a043a5d5c 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -846,7 +846,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; - if (node->dev != dev) + if (list_entry_is_head(node, &info->shared->devices, list)) return; list_del(&node->list); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 02d31d4fcab3..b22504176588 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -400,16 +400,16 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) /* We refer to the device */ dev_hold_track(dev, &ndev->dev_tracker, GFP_KERNEL); - if (dev != blackhole_netdev) { - if (snmp6_alloc_dev(ndev) < 0) { - netdev_dbg(dev, "%s: cannot allocate memory for statistics\n", - __func__); - neigh_parms_release(&nd_tbl, ndev->nd_parms); - dev_put_track(dev, &ndev->dev_tracker); - kfree(ndev); - return ERR_PTR(err); - } + if (snmp6_alloc_dev(ndev) < 0) { + netdev_dbg(dev, "%s: cannot allocate memory for statistics\n", + __func__); + neigh_parms_release(&nd_tbl, ndev->nd_parms); + dev_put_track(dev, &ndev->dev_tracker); + kfree(ndev); + return ERR_PTR(err); + } + if (dev != blackhole_netdev) { if (snmp6_register_dev(ndev) < 0) { netdev_dbg(dev, 
"%s: cannot create /proc/net/dev_snmp6/%s\n", __func__, dev->name); @@ -1115,6 +1115,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, ifa->prefix_len = cfg->plen; ifa->rt_priority = cfg->rt_priority; ifa->flags = cfg->ifa_flags; + ifa->ifa_proto = cfg->ifa_proto; /* No need to add the TENTATIVE flag for addresses with NODAD */ if (!(cfg->ifa_flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; @@ -1836,8 +1837,8 @@ out: } EXPORT_SYMBOL(ipv6_dev_get_saddr); -int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, - u32 banned_flags) +static int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, + u32 banned_flags) { struct inet6_ifaddr *ifp; int err = -EADDRNOTAVAIL; @@ -2593,6 +2594,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, .valid_lft = valid_lft, .preferred_lft = prefered_lft, .scope = addr_type & IPV6_ADDR_SCOPE_MASK, + .ifa_proto = IFAPROT_KERNEL_RA }; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD @@ -3077,7 +3079,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) } static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, - int plen, int scope) + int plen, int scope, u8 proto) { struct inet6_ifaddr *ifp; struct ifa6_config cfg = { @@ -3086,7 +3088,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, .ifa_flags = IFA_F_PERMANENT, .valid_lft = INFINITY_LIFE_TIME, .preferred_lft = INFINITY_LIFE_TIME, - .scope = scope + .scope = scope, + .ifa_proto = proto }; ifp = ipv6_add_addr(idev, &cfg, true, NULL); @@ -3131,7 +3134,7 @@ static void add_v4_addrs(struct inet6_dev *idev) } if (addr.s6_addr32[3]) { - add_addr(idev, &addr, plen, scope); + add_addr(idev, &addr, plen, scope, IFAPROT_UNSPEC); addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags, GFP_KERNEL); return; @@ -3154,7 +3157,8 @@ static void add_v4_addrs(struct inet6_dev *idev) flag |= IFA_HOST; } - add_addr(idev, &addr, plen, flag); + add_addr(idev, &addr, plen, flag, + IFAPROT_UNSPEC); addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags, GFP_KERNEL); } @@ -3177,7 +3181,7 @@ static void init_loopback(struct net_device *dev) return; } - add_addr(idev, &in6addr_loopback, 128, IFA_HOST); + add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFAPROT_KERNEL_LO); } void addrconf_add_linklocal(struct inet6_dev *idev, @@ -3189,7 +3193,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, .ifa_flags = flags | IFA_F_PERMANENT, .valid_lft = INFINITY_LIFE_TIME, .preferred_lft = INFINITY_LIFE_TIME, - .scope = IFA_LINK + .scope = IFA_LINK, + .ifa_proto = IFAPROT_KERNEL_LL }; struct inet6_ifaddr *ifp; @@ -3725,6 +3730,7 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; bool keep_addr = false; + bool was_ready; int state, i; ASSERT_RTNL(); @@ -3790,7 +3796,10 @@ restart: addrconf_del_rs_timer(idev); - /* Step 2: clear flags for stateless addrconf */ + /* Step 2: clear flags for stateless addrconf, repeated down + * detection + */ + was_ready = idev->if_flags & IF_READY; if (!unregister) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); @@ -3864,7 +3873,7 @@ restart: if (unregister) { ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); - } else { + } else if (was_ready) { ipv6_mc_down(idev); } @@ -4627,6 +4636,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { [IFA_FLAGS] = { .len = sizeof(u32) }, [IFA_RT_PRIORITY] = { .len = sizeof(u32) }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, + [IFA_PROTO] = { .type = NLA_U8 
}, }; static int @@ -4752,6 +4762,7 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, ifp->tstamp = jiffies; ifp->valid_lft = cfg->valid_lft; ifp->prefered_lft = cfg->preferred_lft; + ifp->ifa_proto = cfg->ifa_proto; if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority) ifp->rt_priority = cfg->rt_priority; @@ -4845,6 +4856,9 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFA_RT_PRIORITY]) cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); + if (tb[IFA_PROTO]) + cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]); + cfg.valid_lft = INFINITY_LIFE_TIME; cfg.preferred_lft = INFINITY_LIFE_TIME; @@ -4948,6 +4962,7 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(16) /* IFA_ADDRESS */ + nla_total_size(sizeof(struct ifa_cacheinfo)) + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */; } @@ -4985,6 +5000,7 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto error; + spin_lock_bh(&ifa->lock); if (!((ifa->flags&IFA_F_PERMANENT) && (ifa->prefered_lft == INFINITY_LIFE_TIME))) { preferred = ifa->prefered_lft; @@ -5006,6 +5022,7 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } + spin_unlock_bh(&ifa->lock); if (!ipv6_addr_any(&ifa->peer_addr)) { if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 || @@ -5025,6 +5042,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0) goto error; + if (ifa->ifa_proto && + nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) + goto error; + nlmsg_end(skb, nlh); return 0; @@ -7187,7 +7208,7 @@ static void __net_exit addrconf_exit_net(struct net *net) kfree(net->ipv6.devconf_all); net->ipv6.devconf_all = NULL; - cancel_delayed_work(&net->ipv6.addr_chk_work); + cancel_delayed_work_sync(&net->ipv6.addr_chk_work); /* * Check hash table, then free it. */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8fe7900f1949..7d7b7523d126 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -441,11 +441,14 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; u32 flags = BIND_WITH_LOCK; + const struct proto *prot; int err = 0; + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); /* If the socket has its own bind function then use it. */ - if (sk->sk_prot->bind) - return sk->sk_prot->bind(sk, uaddr, addr_len); + if (prot->bind) + return prot->bind(sk, uaddr, addr_len); if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -555,6 +558,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; struct net *net = sock_net(sk); + const struct proto *prot; switch (cmd) { case SIOCADDRT: @@ -572,9 +576,11 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFDSTADDR: return addrconf_set_dstaddr(net, argp); default: - if (!sk->sk_prot->ioctl) + /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ + prot = READ_ONCE(sk->sk_prot); + if (!prot->ioctl) return -ENOIOCTLCMD; - return sk->sk_prot->ioctl(sk, cmd, arg); + return prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ return 0; @@ -636,11 +642,14 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; + const struct proto *prot; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; - return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + return INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, sk, msg, size); } @@ -650,13 +659,16 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; + const struct proto *prot; int addr_len = 0; int err; if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); - err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + err = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bb2c407b46b..b0ffbcd5432d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -482,6 +482,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -490,6 +491,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; @@ -707,7 +712,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index ba5e81cd569c..3a293838a91d 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -145,8 +145,7 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); + return skb_eth_gso_segment(skb, features, htons(ETH_P_IPV6)); } static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, @@ -199,6 +198,9 @@ static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, ipv6_skip_exthdr(skb, 0, &proto, &frag); } + if (proto == IPPROTO_IPIP) + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; + __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index e159eb4328a8..1098131ed90c 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -635,7 +635,8 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_schema *sc, u8 sclen, bool is_input) { - struct __kernel_sock_timeval ts; + struct timespec64 ts; + ktime_t tstamp; u64 raw64; u32 raw32; u16 raw16; @@ 
-680,10 +681,9 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (!skb->dev) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { - if (!skb->tstamp) - __net_timestamp(skb); + tstamp = skb_tstamp_cond(skb, true); + ts = ktime_to_timespec64(tstamp); - skb_get_new_timestamp(skb, &ts); *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec); } data += sizeof(__be32); @@ -694,13 +694,12 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (!skb->dev) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { - if (!skb->tstamp) - __net_timestamp(skb); + if (!trace->type.bit2) { + tstamp = skb_tstamp_cond(skb, true); + ts = ktime_to_timespec64(tstamp); + } - if (!trace->type.bit2) - skb_get_new_timestamp(skb, &ts); - - *(__be32 *)data = cpu_to_be32((u32)ts.tv_usec); + *(__be32 *)data = cpu_to_be32((u32)(ts.tv_nsec / NSEC_PER_USEC)); } data += sizeof(__be32); } diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index aa673a6a7e43..ceb85c67ce39 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -450,8 +450,10 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, err = -EINVAL; goto done; } - if (fl_shared_exclusive(fl) || fl->opt) + if (fl_shared_exclusive(fl) || fl->opt) { + WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1); static_branch_deferred_inc(&ipv6_flowlabel_exclusive); + } return fl; done: diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index d4b1e2c5aa76..5b5ea35635f9 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -459,6 +459,7 @@ discard: static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + skb_clear_delivery_time(skb); rcu_read_lock(); ip6_protocol_deliver_rcu(net, skb, 0, false); rcu_read_unlock(); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index d37a79a8554e..c4fc03c1ac99 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -114,6 +114,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (likely(ops && ops->callbacks.gso_segment)) { skb_reset_transport_header(skb); segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0c6c971ce0a5..e69fac576970 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -130,7 +130,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * rcu_read_unlock_bh(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; } @@ -202,7 +202,7 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s case NET_XMIT_CN: return __ip6_finish_output(net, sk, skb) ? 
: ret; default: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } @@ -217,7 +217,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (unlikely(idev->cnf.disable_ipv6)) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } @@ -440,7 +440,7 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, } #endif - skb->tstamp = 0; + skb_clear_tstamp(skb); return dst_output(net, sk, skb); } @@ -813,6 +813,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; + bool mono_delivery_time = skb->mono_delivery_time; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; ktime_t tstamp = skb->tstamp; @@ -903,7 +904,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb->tstamp = tstamp; + skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -962,7 +963,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - frag->tstamp = tstamp; + skb_set_delivery_time(frag, tstamp, mono_delivery_time); err = output(net, sk, frag); if (err) goto fail; @@ -1406,8 +1407,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (np->frag_size) mtu = np->frag_size; } - if (mtu < IPV6_MIN_MTU) - return -EINVAL; cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; @@ -1464,14 +1463,12 @@ static int __ip6_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - - sizeof(struct frag_hdr); headersize = sizeof(struct ipv6hdr) + (opt ? 
opt->opt_flen + opt->opt_nflen : 0) + @@ -1479,6 +1476,13 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; + if (mtu < fragheaderlen || + ((mtu - fragheaderlen) & ~7) + fragheaderlen < sizeof(struct frag_hdr)) + goto emsgsize; + + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); + /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit * the first fragment */ @@ -1487,6 +1491,7 @@ static int __ip6_append_data(struct sock *sk, if (cork->length + length > mtu - headersize && ipc6->dontfrag && (sk->sk_protocol == IPPROTO_UDP || + sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { ipv6_local_rxpmtu(sk, fl6, mtu - headersize + sizeof(struct ipv6hdr)); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0ebaaec3faf9..a9775c830194 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1040,7 +1040,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, int ret; #ifdef CONFIG_IPV6_PIMSM_V2 - if (assert == MRT6MSG_WHOLEPKT) + if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else @@ -1056,7 +1056,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 - if (assert == MRT6MSG_WHOLEPKT) { + if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. And all this only to mangle msg->im6_msgtype and @@ -1068,8 +1068,11 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; - msg->im6_msgtype = MRT6MSG_WHOLEPKT; - msg->im6_mif = mrt->mroute_reg_vif_num; + msg->im6_msgtype = assert; + if (assert == MRT6MSG_WRMIFWHOLE) + msg->im6_mif = mifi; + else + msg->im6_mif = mrt->mroute_reg_vif_num; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; @@ -1650,6 +1653,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; + bool do_wrmifwhole; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) @@ -1763,12 +1767,15 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; + + do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE); v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; + mrt->mroute_do_wrvifwhole = do_wrmifwhole; } rtnl_unlock(); return ret; @@ -2144,6 +2151,9 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); + if (mrt->mroute_do_wrvifwhole) + ip6mr_cache_report(mrt, skb, true_vifi, + MRT6MSG_WRMIFWHOLE); } goto dont_forward; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a733803a710c..222f6bf220ba 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -475,7 +475,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); - sk->sk_prot = &tcp_prot; + /* Paired with READ_ONCE(sk->sk_prot) in 
net/ipv6/af_inet6.c */ + WRITE_ONCE(sk->sk_prot, &tcp_prot); icsk->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; @@ -489,7 +490,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); - sk->sk_prot = prot; + /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ + WRITE_ONCE(sk->sk_prot, prot); sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index bed8155508c8..909f937befd7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1371,27 +1371,23 @@ static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, } /* called with rcu_read_lock() */ -int igmp6_event_query(struct sk_buff *skb) +void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) @@ -1542,27 +1538,23 @@ static void mld_query_work(struct work_struct *work) } /* called with rcu_read_lock() */ -int igmp6_event_report(struct sk_buff *skb) +void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) @@ -1759,7 +1751,7 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); - if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. 
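The net/ipv6/mcast.c hunk above reworks igmp6_event_query() and igmp6_event_report(): instead of returning an error code, each now either hands the skb to a bounded per-device queue (kicking the MLD workqueue) or frees it. The ownership rule is carried by a single exit path: the local skb pointer is set to NULL once the packet has been queued, so the final kfree_skb() only frees packets that were not taken (kfree_skb(NULL) is a no-op). A minimal sketch of that queue-or-free pattern follows, using hypothetical names and an illustrative cap rather than the kernel's actual structures:

#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#define EXAMPLE_MAX_SKBS 64	/* illustrative cap, playing the role of MLD_MAX_SKBS */

struct example_dev {
	bool dead;
	spinlock_t queue_lock;
	struct sk_buff_head queue;
	struct work_struct work;
};

static void example_event_input(struct example_dev *edev, struct sk_buff *skb)
{
	if (!edev || edev->dead)
		goto out;

	spin_lock_bh(&edev->queue_lock);
	if (skb_queue_len(&edev->queue) < EXAMPLE_MAX_SKBS) {
		__skb_queue_tail(&edev->queue, skb);
		schedule_work(&edev->work);
		skb = NULL;	/* queued: the worker owns it now */
	}
	spin_unlock_bh(&edev->queue_lock);
out:
	kfree_skb(skb);		/* NULL-safe; frees only packets that were not queued */
}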
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f03b597e4121..fcb288b0ae13 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -466,9 +466,8 @@ static void ip6_nd_hdr(struct sk_buff *skb, hdr->daddr = *daddr; } -static void ndisc_send_skb(struct sk_buff *skb, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); @@ -515,6 +514,7 @@ static void ndisc_send_skb(struct sk_buff *skb, rcu_read_unlock(); } +EXPORT_SYMBOL(ndisc_send_skb); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, @@ -598,22 +598,16 @@ static void ndisc_send_unsol_na(struct net_device *dev) in6_dev_put(idev); } -void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr, - u64 nonce) +struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, + const struct in6_addr *saddr, u64 nonce) { - struct sk_buff *skb; - struct in6_addr addr_buf; int inc_opt = dev->addr_len; - int optlen = 0; + struct sk_buff *skb; struct nd_msg *msg; + int optlen = 0; - if (!saddr) { - if (ipv6_get_lladdr(dev, &addr_buf, - (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))) - return; - saddr = &addr_buf; - } + if (!saddr) + return NULL; if (ipv6_addr_any(saddr)) inc_opt = false; @@ -625,7 +619,7 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) - return; + return NULL; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { @@ -647,7 +641,28 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, memcpy(opt + 2, &nonce, 6); } - ndisc_send_skb(skb, daddr, saddr); + return skb; +} +EXPORT_SYMBOL(ndisc_ns_create); + +void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, + const struct in6_addr *daddr, const struct in6_addr *saddr, + u64 nonce) +{ + struct in6_addr addr_buf; + struct sk_buff *skb; + + if (!saddr) { + if (ipv6_get_lladdr(dev, &addr_buf, + (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC))) + return; + saddr = &addr_buf; + } + + skb = ndisc_ns_create(dev, solicit, saddr, nonce); + + if (skb) + ndisc_send_skb(skb, daddr, saddr); } void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, @@ -1337,8 +1352,12 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } neigh->flags |= NTF_ROUTER; - } else if (rt) { + } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) { + struct nl_info nlinfo = { + .nl_net = net, + }; rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE); } if (rt) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 6ab710b5a1a8..1da332450d98 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -121,6 +121,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + bool mono_delivery_time = skb->mono_delivery_time; ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; @@ -186,7 +187,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb->tstamp = tstamp; + skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = 
output(net, sk, data, skb); if (err || !iter.frag) break; @@ -219,7 +220,7 @@ slow_path: goto blackhole; } - skb2->tstamp = tstamp; + skb_set_delivery_time(skb2, tstamp, mono_delivery_time); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 5c47be29b9ee..7dd3629dd19e 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -264,6 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; + fq->q.mono_delivery_time = skb->mono_delivery_time; fq->q.meat += skb->len; fq->ecn |= ecn; if (payload_len > fq->q.max_size) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index d5544cf67ffe..ff033d16549e 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -101,11 +101,21 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.sockc.tsflags = sk->sk_tsflags; ipc6.sockc.mark = sk->sk_mark; - err = sock_cmsg_send(sk, msg, &ipc6.sockc); - if (err) - return err; + if (msg->msg_controllen) { + struct ipv6_txoptions opt = {}; + + opt.tot_len = sizeof(opt); + ipc6.opt = &opt; - /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); + if (err < 0) + return err; + + /* Changes to txoptions and flow info are not implemented, yet. + * Drop the options, fl6 is wiped below. + */ + ipc6.opt = NULL; + } memset(&fl6, 0, sizeof(fl6)); @@ -140,7 +150,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) pfh.wcheck = 0; pfh.family = AF_INET6; - ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + if (ipc6.hlimit < 0) + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 28e44782c94d..ff866f2a879e 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -194,6 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; + fq->q.mono_delivery_time = skb->mono_delivery_time; fq->q.meat += skb->len; fq->ecn |= ecn; add_frag_mem_limit(fq->q.fqdir, skb->truesize); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 46c7a1b36075..6188712f24b0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5760,11 +5760,11 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, } if (!dst) { - if (rt->offload) + if (READ_ONCE(rt->offload)) rtm->rtm_flags |= RTM_F_OFFLOAD; - if (rt->trap) + if (READ_ONCE(rt->trap)) rtm->rtm_flags |= RTM_F_TRAP; - if (rt->offload_failed) + if (READ_ONCE(rt->offload_failed)) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; } @@ -6222,19 +6222,20 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, struct sk_buff *skb; int err; - if (f6i->offload == offload && f6i->trap == trap && - f6i->offload_failed == offload_failed) + if (READ_ONCE(f6i->offload) == offload && + READ_ONCE(f6i->trap) == trap && + READ_ONCE(f6i->offload_failed) == offload_failed) return; - f6i->offload = offload; - f6i->trap = trap; + WRITE_ONCE(f6i->offload, offload); + WRITE_ONCE(f6i->trap, trap); /* 2 means send notifications only if offload_failed was changed. 
*/ if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && - f6i->offload_failed == offload_failed) + READ_ONCE(f6i->offload_failed) == offload_failed) return; - f6i->offload_failed = offload_failed; + WRITE_ONCE(f6i->offload_failed, offload_failed); if (!rcu_access_pointer(f6i->fib6_node)) /* The route was removed from the tree, do not send diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0c648bf07f39..13678d3908fa 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -773,57 +773,6 @@ clear_hash_noput: #endif -static bool tcp_v6_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb, - int dif, int sdif) -{ -#ifdef CONFIG_TCP_MD5SIG - const __u8 *hash_location = NULL; - struct tcp_md5sig_key *hash_expected; - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); - int genhash, l3index; - u8 newhash[16]; - - /* sdif set, means packet ingressed via a device - * in an L3 domain and dif is set to the l3mdev - */ - l3index = sdif ? dif : 0; - - hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr, l3index); - hash_location = tcp_parse_md5sig_option(th); - - /* We've parsed the options - do we have a hash? */ - if (!hash_expected && !hash_location) - return false; - - if (hash_expected && !hash_location) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); - return true; - } - - if (!hash_expected && hash_location) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); - return true; - } - - /* check the signature */ - genhash = tcp_v6_md5_hash_skb(newhash, - hash_expected, - NULL, skb); - - if (genhash || memcmp(hash_location, newhash, 16) != 0) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n", - genhash ? "failed" : "mismatch", - &ip6h->saddr, ntohs(th->source), - &ip6h->daddr, ntohs(th->dest), l3index); - return true; - } -#endif - return false; -} - static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) @@ -921,12 +870,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 } #endif - buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, - GFP_ATOMIC); + buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (!buff) return; - skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len); + skb_reserve(buff, MAX_TCP_HEADER); t1 = skb_push(buff, tot_len); skb_reset_transport_header(buff); @@ -992,7 +940,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 } else { mark = sk->sk_mark; } - buff->tstamp = tcp_transmit_time(sk); + skb_set_delivery_time(buff, tcp_transmit_time(sk), true); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; @@ -1472,6 +1420,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL; + enum skb_drop_reason reason; struct tcp_sock *tp; /* Imagine: socket is IPv6. 
IPv4 packet arrives, @@ -1506,6 +1455,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (np->rxopt.all) opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + reason = SKB_DROP_REASON_NOT_SPECIFIED; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; @@ -1559,9 +1509,10 @@ reset: discard: if (opt_skb) __kfree_skb(opt_skb); - kfree_skb(skb); + kfree_skb_reason(skb, reason); return 0; csum_err: + reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); @@ -1627,6 +1578,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { + enum skb_drop_reason drop_reason; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; @@ -1636,6 +1588,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) int ret; struct net *net = dev_net(skb->dev); + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -1649,8 +1602,10 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; - if (unlikely(th->doff < sizeof(struct tcphdr)/4)) + if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { + drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto bad_packet; + } if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; @@ -1677,7 +1632,10 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) { + drop_reason = tcp_inbound_md5_hash(sk, skb, + &hdr->saddr, &hdr->daddr, + AF_INET6, dif, sdif); + if (drop_reason) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1706,6 +1664,8 @@ process: hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + } else { + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); @@ -1741,14 +1701,20 @@ process: } } - if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; + } - if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) + drop_reason = tcp_inbound_md5_hash(sk, skb, &hdr->saddr, &hdr->daddr, + AF_INET6, dif, sdif); + if (drop_reason) goto discard_and_relse; - if (tcp_filter(sk, skb)) + if (tcp_filter(sk, skb)) { + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto discard_and_relse; + } th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); @@ -1769,7 +1735,7 @@ process: if (!sock_owned_by_user(sk)) { ret = tcp_v6_do_rcv(sk, skb); } else { - if (tcp_add_backlog(sk, skb)) + if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); @@ -1779,6 +1745,7 @@ put_and_return: return ret ? 
-1 : 0; no_tcp_socket: + drop_reason = SKB_DROP_REASON_NO_SOCKET; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; @@ -1786,6 +1753,7 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: + drop_reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: @@ -1795,7 +1763,7 @@ bad_packet: } discard_it: - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); return 0; discard_and_relse: @@ -1806,6 +1774,7 @@ discard_and_relse: do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c6872596b408..7f0fa9bd9ffe 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -912,6 +912,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); struct udphdr *uh; @@ -988,6 +989,8 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, return udp6_unicast_rcv_skb(sk, skb, uh); } + reason = SKB_DROP_REASON_NO_SOCKET; + if (!uh->check) goto report_csum_error; @@ -1000,10 +1003,12 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); - kfree_skb(skb); + kfree_skb_reason(skb, reason); return 0; short_packet: + if (reason == SKB_DROP_REASON_NOT_SPECIFIED) + reason = SKB_DROP_REASON_PKT_TOO_SMALL; net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n", proto == IPPROTO_UDPLITE ? 
"-Lite" : "", saddr, ntohs(uh->source), @@ -1014,10 +1019,12 @@ short_packet: report_csum_error: udp6_csum_zero_error(skb); csum_error: + if (reason == SKB_DROP_REASON_NOT_SPECIFIED) + reason = SKB_DROP_REASON_UDP_CSUM; __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); - kfree_skb(skb); + kfree_skb_reason(skb, reason); return 0; } diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index d0d280077721..ad07904642ca 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -45,6 +45,19 @@ static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buf return xfrm_output(sk, skb); } +static int xfrm6_noneed_fragment(struct sk_buff *skb) +{ + struct frag_hdr *fh; + u8 prevhdr = ipv6_hdr(skb)->nexthdr; + + if (prevhdr != NEXTHDR_FRAGMENT) + return 0; + fh = (struct frag_hdr *)(skb->data + sizeof(struct ipv6hdr)); + if (fh->nexthdr == NEXTHDR_ESP || fh->nexthdr == NEXTHDR_AUTH) + return 1; + return 0; +} + static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -73,6 +86,9 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; + } else if (toobig && xfrm6_noneed_fragment(skb)) { + skb->ignore_df = 1; + goto skip_frag; } else if (!skb->ignore_df && toobig && skb->sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 8f4d49a7d3e8..eb0295d90039 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -319,7 +319,7 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm) */ static int __iucv_query_maxconn(void *param, unsigned long *max_pathid) { - unsigned long reg1 = (unsigned long)param; + unsigned long reg1 = virt_to_phys(param); int cc; asm volatile ( diff --git a/net/key/af_key.c b/net/key/af_key.c index de24a7d474df..9bf52a09b5ff 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2623,7 +2623,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL); + kma ? 
&k : NULL, net, NULL, 0); out: return err; diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 74a878f213d3..1deb3d874a4b 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation */ #include <linux/ieee80211.h> @@ -626,6 +626,14 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, return -EINVAL; } + if (test_sta_flag(sta, WLAN_STA_MFP) && + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { + ht_dbg(sdata, + "MFP STA not authorized - deny BA session request %pM tid %d\n", + sta->sta.addr, tid); + return -EINVAL; + } + /* * 802.11n-2009 11.5.1.1: If the initiating STA is an HT STA, is a * member of an IBSS, and has no other existing Block Ack agreement diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b07ccfc44115..01b69109402e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -378,7 +378,7 @@ struct ieee80211_mgd_auth_data { u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; - bool done; + bool done, waiting; bool peer_confirmed; bool timeout_started; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 29bfce6b3561..476b21365058 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -37,6 +37,7 @@ #define IEEE80211_AUTH_TIMEOUT_SAE (HZ * 2) #define IEEE80211_AUTH_MAX_TRIES 3 #define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) +#define IEEE80211_AUTH_WAIT_SAE_RETRY (HZ * 2) #define IEEE80211_ASSOC_TIMEOUT (HZ / 5) #define IEEE80211_ASSOC_TIMEOUT_LONG (HZ / 2) #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) @@ -734,7 +735,7 @@ static void ieee80211_add_eht_ie(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_eht_cap(pos, he_cap, eht_cap, pos + eht_cap_size); } -static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) +static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -754,6 +755,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_prep_tx_info info = {}; + int ret; /* we know it's writable, cast away the const */ if (assoc_data->ie_len) @@ -767,7 +769,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); - return; + return -EINVAL; } chan = chanctx_conf->def.chan; rcu_read_unlock(); @@ -818,7 +820,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) (iftd ? 
iftd->vendor_elems.len : 0), GFP_KERNEL); if (!skb) - return; + return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); @@ -1104,15 +1106,22 @@ skip_rates: skb_put_data(skb, assoc_data->ie + offset, noffset - offset); } - if (assoc_data->fils_kek_len && - fils_encrypt_assoc_req(skb, assoc_data) < 0) { - dev_kfree_skb(skb); - return; + if (assoc_data->fils_kek_len) { + ret = fils_encrypt_assoc_req(skb, assoc_data); + if (ret < 0) { + dev_kfree_skb(skb); + return ret; + } } pos = skb_tail_pointer(skb); kfree(ifmgd->assoc_req_ies); ifmgd->assoc_req_ies = kmemdup(ie_start, pos - ie_start, GFP_ATOMIC); + if (!ifmgd->assoc_req_ies) { + dev_kfree_skb(skb); + return -ENOMEM; + } + ifmgd->assoc_req_ies_len = pos - ie_start; drv_mgd_prepare_tx(local, sdata, &info); @@ -1122,6 +1131,8 @@ skip_rates: IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_tx_skb(sdata, skb); + + return 0; } void ieee80211_send_pspoll(struct ieee80211_local *local, @@ -3074,8 +3085,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, (status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED || (auth_transaction == 1 && (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || - status_code == WLAN_STATUS_SAE_PK)))) + status_code == WLAN_STATUS_SAE_PK)))) { + /* waiting for userspace now */ + ifmgd->auth_data->waiting = true; + ifmgd->auth_data->timeout = + jiffies + IEEE80211_AUTH_WAIT_SAE_RETRY; + ifmgd->auth_data->timeout_started = true; + run_again(sdata, ifmgd->auth_data->timeout); goto notify_driver; + } sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); @@ -4586,6 +4604,7 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; struct ieee80211_local *local = sdata->local; + int ret; sdata_assert_lock(sdata); @@ -4606,7 +4625,9 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) sdata_info(sdata, "associate with %pM (try %d/%d)\n", assoc_data->bss->bssid, assoc_data->tries, IEEE80211_ASSOC_MAX_TRIES); - ieee80211_send_assoc(sdata); + ret = ieee80211_send_assoc(sdata); + if (ret) + return ret; if (!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; @@ -4679,10 +4700,10 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) if (ifmgd->auth_data && ifmgd->auth_data->timeout_started && time_after(jiffies, ifmgd->auth_data->timeout)) { - if (ifmgd->auth_data->done) { + if (ifmgd->auth_data->done || ifmgd->auth_data->waiting) { /* - * ok ... we waited for assoc but userspace didn't, - * so let's just kill the auth data + * ok ... we waited for assoc or continuation but + * userspace didn't do it, so kill the auth data */ ieee80211_destroy_auth_data(sdata, false); } else if (ieee80211_auth(sdata)) { diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d54a4d98c648..beb6b92eb780 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2607,7 +2607,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, * address, so that the authenticator (e.g. hostapd) will see * the frame, but bridge won't forward it anywhere else. Note * that due to earlier filtering, the only other address can - * be the PAE group address. + * be the PAE group address, unless the hardware allowed them + * through in 802.3 offloaded mode. 
*/ if (unlikely(skb->protocol == sdata->control_port_protocol && !ether_addr_equal(ehdr->h_dest, sdata->vif.addr))) @@ -2922,13 +2923,13 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) ether_addr_equal(sdata->vif.addr, hdr->addr3)) return RX_CONTINUE; - ac = ieee80211_select_queue_80211(sdata, skb, hdr); + ac = ieee802_1d_to_ac[skb->priority]; q = sdata->vif.hw_queue[ac]; if (ieee80211_queue_stopped(&local->hw, q)) { IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion); return RX_DROP_MONITOR; } - skb_set_queue_mapping(skb, q); + skb_set_queue_mapping(skb, ac); if (!--mesh_hdr->ttl) { if (!is_multicast_ether_addr(hdr->addr1)) @@ -4514,12 +4515,7 @@ static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, /* deliver to local stack */ skb->protocol = eth_type_trans(skb, fast_rx->dev); - memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->list) - list_add_tail(&skb->list, rx->list); - else - netif_receive_skb(skb); - + ieee80211_deliver_skb_to_local_stack(skb, rx); } static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, diff --git a/net/mctp/device.c b/net/mctp/device.c index 02ddc0f1bd3e..f49be882e98e 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -25,12 +25,25 @@ struct mctp_dump_cb { size_t a_idx; }; -/* unlocked: caller must hold rcu_read_lock */ +/* unlocked: caller must hold rcu_read_lock. + * Returned mctp_dev has its refcount incremented, or NULL if unset. + */ struct mctp_dev *__mctp_dev_get(const struct net_device *dev) { - return rcu_dereference(dev->mctp_ptr); + struct mctp_dev *mdev = rcu_dereference(dev->mctp_ptr); + + /* RCU guarantees that any mdev is still live. + * Zero refcount implies a pending free, return NULL. + */ + if (mdev) + if (!refcount_inc_not_zero(&mdev->refs)) + return NULL; + return mdev; } +/* Returned mctp_dev does not have refcount incremented. The returned pointer + * remains live while rtnl_lock is held, as that prevents mctp_unregister() + */ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->mctp_ptr); @@ -107,7 +120,7 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) struct ifaddrmsg *hdr; struct mctp_dev *mdev; int ifindex; - int idx, rc; + int idx = 0, rc; hdr = nlmsg_data(cb->nlh); // filter by ifindex if requested @@ -124,6 +137,7 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) if (mdev) { rc = mctp_dump_dev_addrinfo(mdev, skb, cb); + mctp_dev_put(mdev); // Error indicates full buffer, this // callback will get retried. if (rc < 0) @@ -209,7 +223,7 @@ static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!mdev) return -ENODEV; - if (!mctp_address_ok(addr->s_addr)) + if (!mctp_address_unicast(addr->s_addr)) return -EINVAL; /* Prevent duplicates. 
Under RTNL so don't need to lock for reading */ @@ -298,7 +312,7 @@ void mctp_dev_hold(struct mctp_dev *mdev) void mctp_dev_put(struct mctp_dev *mdev) { - if (refcount_dec_and_test(&mdev->refs)) { + if (mdev && refcount_dec_and_test(&mdev->refs)) { dev_put(mdev->dev); kfree_rcu(mdev, rcu); } @@ -370,6 +384,7 @@ static size_t mctp_get_link_af_size(const struct net_device *dev, if (!mdev) return 0; ret = nla_total_size(4); /* IFLA_MCTP_NET */ + mctp_dev_put(mdev); return ret; } @@ -413,10 +428,10 @@ static void mctp_unregister(struct net_device *dev) struct mctp_dev *mdev; mdev = mctp_dev_get_rtnl(dev); - if (mctp_known(dev) != (bool)mdev) { + if (mdev && !mctp_known(dev)) { // Sanity check, should match what was set in mctp_register - netdev_warn(dev, "%s: mdev pointer %d but type (%d) match is %d", - __func__, (bool)mdev, mctp_known(dev), dev->type); + netdev_warn(dev, "%s: BUG mctp_ptr set for unknown type %d", + __func__, dev->type); return; } if (!mdev) @@ -440,7 +455,7 @@ static int mctp_register(struct net_device *dev) if (mdev) { if (!mctp_known(dev)) - netdev_warn(dev, "%s: mctp_dev set for unknown type %d", + netdev_warn(dev, "%s: BUG mctp_ptr set for unknown type %d", __func__, dev->type); return 0; } diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c index 6ad3e33bd4d4..ffa0f9e0983f 100644 --- a/net/mctp/neigh.c +++ b/net/mctp/neigh.c @@ -143,7 +143,7 @@ static int mctp_rtm_newneigh(struct sk_buff *skb, struct nlmsghdr *nlh, } eid = nla_get_u8(tb[NDA_DST]); - if (!mctp_address_ok(eid)) { + if (!mctp_address_unicast(eid)) { NL_SET_ERR_MSG(extack, "Invalid neighbour EID"); return -EINVAL; } diff --git a/net/mctp/route.c b/net/mctp/route.c index 17e3482aa770..d5e7db83fe9d 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -419,13 +419,14 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * this function. 
*/ rc = mctp_key_add(key, msk); - if (rc) + if (rc) { kfree(key); + } else { + trace_mctp_key_acquire(key); - trace_mctp_key_acquire(key); - - /* we don't need to release key->lock on exit */ - mctp_key_unref(key); + /* we don't need to release key->lock on exit */ + mctp_key_unref(key); + } key = NULL; } else { @@ -455,7 +456,6 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * the reassembly/response key */ if (!rc && flags & MCTP_HDR_FLAG_EOM) { - msk = container_of(key->sk, struct mctp_sock, sk); sock_queue_rcv_skb(key->sk, key->reasm_head); key->reasm_head = NULL; __mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED); @@ -835,9 +835,8 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct mctp_skb_cb *cb = mctp_cb(skb); - struct mctp_route tmp_rt; + struct mctp_route tmp_rt = {0}; struct mctp_sk_key *key; - struct net_device *dev; struct mctp_hdr *hdr; unsigned long flags; unsigned int mtu; @@ -850,12 +849,12 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, if (rt) { ext_rt = false; - dev = NULL; - if (WARN_ON(!rt->dev)) goto out_release; } else if (cb->ifindex) { + struct net_device *dev; + ext_rt = true; rt = &tmp_rt; @@ -865,7 +864,6 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, rcu_read_unlock(); return rc; } - rt->dev = __mctp_dev_get(dev); rcu_read_unlock(); @@ -946,10 +944,9 @@ out_release: if (!ext_rt) mctp_route_release(rt); - dev_put(dev); + mctp_dev_put(tmp_rt.dev); return rc; - } /* route management */ @@ -961,7 +958,7 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, struct net *net = dev_net(mdev->dev); struct mctp_route *rt, *ert; - if (!mctp_address_ok(daddr_start)) + if (!mctp_address_unicast(daddr_start)) return -EINVAL; if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255) @@ -1091,6 +1088,17 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX) goto err_drop; + /* source must be valid unicast or null; drop reserved ranges and + * broadcast + */ + if (!(mctp_address_unicast(mh->src) || mctp_address_null(mh->src))) + goto err_drop; + + /* dest address: as above, but allow broadcast */ + if (!(mctp_address_unicast(mh->dest) || mctp_address_null(mh->dest) || + mctp_address_broadcast(mh->dest))) + goto err_drop; + /* MCTP drivers must populate halen/haddr */ if (dev->type == ARPHRD_MCTP) { cb = mctp_cb(skb); @@ -1112,11 +1120,13 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, rt->output(rt, skb); mctp_route_release(rt); + mctp_dev_put(mdev); return NET_RX_SUCCESS; err_drop: kfree_skb(skb); + mctp_dev_put(mdev); return NET_RX_DROP; } diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index 7b7918702592..e03ba66bbe18 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -54,7 +54,6 @@ struct mctp_test_dev *mctp_test_create_dev(void) rcu_read_lock(); dev->mdev = __mctp_dev_get(ndev); - mctp_dev_hold(dev->mdev); rcu_read_unlock(); return dev; diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 3240b72271a7..e55d3dfbee0c 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -35,17 +35,23 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), + SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", 
MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), + SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX), SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX), + SNMP_MIB_ITEM("MPFastcloseTx", MPTCP_MIB_MPFASTCLOSETX), + SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), + SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), + SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index ecd3d8b117e0..00576179a619 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -28,17 +28,23 @@ enum linux_mptcp_mib_field { MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */ + MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */ MPTCP_MIB_RMADDR, /* Received RM_ADDR */ + MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ MPTCP_MIB_MPFAILTX, /* Transmit a MP_FAIL */ MPTCP_MIB_MPFAILRX, /* Received a MP_FAIL */ + MPTCP_MIB_MPFASTCLOSETX, /* Transmit a MP_FASTCLOSE */ + MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */ + MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */ + MPTCP_MIB_MPRSTRX, /* Received a MP_RST */ MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 3e82ac24d548..325383646f5c 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -323,6 +323,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; + pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key); break; case MPTCPOPT_RST: @@ -355,8 +356,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, } } -void mptcp_get_options(const struct sock *sk, - const struct sk_buff *skb, +void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); @@ -653,7 +653,6 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * bool drop_other_suboptions = false; unsigned int opt_size = *size; bool echo; - bool port; int len; /* add addr will strip the existing options, be sure to avoid 
breaking @@ -662,12 +661,12 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, - &echo, &port, &drop_other_suboptions)) + &echo, &drop_other_suboptions)) return false; if (drop_other_suboptions) remaining += opt_size; - len = mptcp_add_addr_len(opts->addr.family, echo, port); + len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; @@ -834,11 +833,13 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); } return true; } @@ -1086,8 +1087,7 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", - msk, (unsigned long long)hmac, - (unsigned long long)mp_opt->ahmac); + msk, hmac, mp_opt->ahmac); return hmac == mp_opt->ahmac; } @@ -1114,7 +1114,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return true; } - mptcp_get_options(sk, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); /* The subflow can be in close state only if check_fully_established() * just sent a reset. If so, tell the caller to ignore the current packet. @@ -1127,6 +1127,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) msk->local_key == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); } if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && @@ -1161,6 +1162,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) subflow->reset_seen = 1; subflow->reset_reason = mp_opt.reset_reason; subflow->reset_transient = mp_opt.reset_transient; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX); } if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 696b2c4613a7..01809eef29b4 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -213,13 +213,15 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; + } else { + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) + const struct mptcp_addr_info *addr) { struct mptcp_pm_data *pm = &msk->pm; @@ -253,8 +255,10 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); - mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED); - pm->rm_list_rx = *rm_list; + if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED)) + pm->rm_list_rx = *rm_list; + else + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP); spin_unlock_bh(&pm->lock); } @@ -275,14 +279,15 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) /* path manager helpers */ -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, 
struct sk_buff *skb, +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, - bool *port, bool *drop_other_suboptions) + bool *drop_other_suboptions) { int ret = false; u8 add_addr; u8 family; + bool port; spin_lock_bh(&msk->pm.lock); @@ -300,10 +305,10 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, } *echo = mptcp_pm_should_add_signal_echo(msk); - *port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); + port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); family = *echo ? msk->pm.remote.family : msk->pm.local.family; - if (remaining < mptcp_add_addr_len(family, *echo, *port)) + if (remaining < mptcp_add_addr_len(family, *echo, port)) goto out_unlock; if (*echo) { diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index e4fd54fff1d2..800515fe5e1d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -83,16 +83,6 @@ static bool addresses_equal(const struct mptcp_addr_info *a, return a->port == b->port; } -static bool address_zero(const struct mptcp_addr_info *addr) -{ - struct mptcp_addr_info zero; - - memset(&zero, 0, sizeof(zero)); - zero.family = addr->family; - - return addresses_equal(addr, &zero, true); -} - static void local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { @@ -120,7 +110,7 @@ static void remote_address(const struct sock_common *skc, } static bool lookup_subflow_by_saddr(const struct list_head *list, - struct mptcp_addr_info *saddr) + const struct mptcp_addr_info *saddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; @@ -138,7 +128,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list, } static bool lookup_subflow_by_daddr(const struct list_head *list, - struct mptcp_addr_info *daddr) + const struct mptcp_addr_info *daddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; @@ -157,10 +147,10 @@ static bool lookup_subflow_by_daddr(const struct list_head *list, static struct mptcp_pm_addr_entry * select_local_address(const struct pm_nl_pernet *pernet, - struct mptcp_sock *msk) + const struct mptcp_sock *msk) { + const struct sock *sk = (const struct sock *)msk; struct mptcp_pm_addr_entry *entry, *ret = NULL; - struct sock *sk = (struct sock *)msk; msk_owned_by_me(msk); @@ -190,7 +180,7 @@ select_local_address(const struct pm_nl_pernet *pernet, } static struct mptcp_pm_addr_entry * -select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk) +select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *ret = NULL; @@ -214,16 +204,16 @@ select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk) return ret; } -unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) { - struct pm_nl_pernet *pernet; + const struct pm_nl_pernet *pernet; - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + pernet = net_generic(sock_net((const struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->add_addr_signal_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); -unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet; @@ -232,7 +222,7 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock 
*msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); -unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet; @@ -241,7 +231,7 @@ unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); -unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet; @@ -264,8 +254,8 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) } struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) +mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; @@ -346,7 +336,7 @@ out: struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, bool check_id) + const struct mptcp_addr_info *addr, bool check_id) { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; @@ -364,7 +354,7 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, } static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - struct mptcp_pm_addr_entry *entry) + const struct mptcp_pm_addr_entry *entry) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; @@ -410,8 +400,8 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) } } -static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr, - struct mptcp_addr_info *addr) +static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int nr, + const struct mptcp_addr_info *addr) { int i; @@ -493,9 +483,9 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, } static int -lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr) +lookup_id_by_addr(const struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr) { - struct mptcp_pm_addr_entry *entry; + const struct mptcp_pm_addr_entry *entry; int ret = -1; rcu_read_lock(); @@ -546,6 +536,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (msk->pm.add_addr_signaled < add_addr_signal_max) { local = select_signal_address(pernet, msk); + /* due to racing events on both ends we can reach here while + * previous add address is still running: if we invoke now + * mptcp_pm_announce_addr(), that will fail and the + * corresponding id will be marked as used. + * Instead let the PM machinery reschedule us when the + * current address announce will be completed. 
+ */ + if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) + return; + if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); @@ -650,6 +650,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) unsigned int add_addr_accept_max; struct mptcp_addr_info remote; unsigned int subflows_max; + bool reset_port = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); @@ -659,15 +660,19 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); - if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote)) + remote = msk->pm.remote; + if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) goto add_addr_echo; + /* pick id 0 port, if none is provided the remote address */ + if (!remote.port) { + reset_port = true; + remote.port = sk->sk_dport; + } + /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ - remote = msk->pm.remote; - if (!remote.port) - remote.port = sk->sk_dport; nr = fill_local_addresses_vec(msk, addrs); msk->pm.add_addr_accepted++; @@ -680,8 +685,12 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) __mptcp_subflow_connect(sk, &addrs[i], &remote); spin_lock_bh(&msk->pm.lock); + /* be sure to echo exactly the received address */ + if (reset_port) + remote.port = 0; + add_addr_echo: - mptcp_pm_announce_addr(msk, &msk->pm.remote, true); + mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_nl_addr_send_ack(msk); } @@ -858,10 +867,18 @@ static bool address_use_port(struct mptcp_pm_addr_entry *entry) MPTCP_PM_ADDR_FLAG_SIGNAL; } +/* caller must ensure the RCU grace period is already elapsed */ +static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) +{ + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); +} + static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry) { - struct mptcp_pm_addr_entry *cur; + struct mptcp_pm_addr_entry *cur, *del_entry = NULL; unsigned int addr_max; int ret = -EINVAL; @@ -882,8 +899,22 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, list_for_each_entry(cur, &pernet->local_addr_list, list) { if (addresses_equal(&cur->addr, &entry->addr, address_use_port(entry) && - address_use_port(cur))) - goto out; + address_use_port(cur))) { + /* allow replacing the exiting endpoint only if such + * endpoint is an implicit one and the user-space + * did not provide an endpoint id + */ + if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) + goto out; + if (entry->addr.id) + goto out; + + pernet->addrs--; + entry->addr.id = cur->addr.id; + list_del_rcu(&cur->list); + del_entry = cur; + break; + } } if (!entry->addr.id) { @@ -919,6 +950,12 @@ find_next: out: spin_unlock_bh(&pernet->lock); + + /* just replaced an existing entry, free it */ + if (del_entry) { + synchronize_rcu(); + __mptcp_pm_release_addr_entry(del_entry); + } return ret; } @@ -992,9 +1029,6 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) if (addresses_equal(&msk_local, &skc_local, false)) return 0; - if (address_zero(&skc_local)) - return 0; - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); rcu_read_lock(); @@ -1017,7 +1051,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) entry->addr.id = 0; entry->addr.port = 0; entry->ifindex = 0; - entry->flags = 0; + 
entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) @@ -1230,6 +1264,17 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && + addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh"); + return -EINVAL; + } + + if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { + GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); + return -EINVAL; + } + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { GENL_SET_ERR_MSG(info, "can't allocate addr"); @@ -1281,7 +1326,7 @@ int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, } static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) + const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; @@ -1296,7 +1341,7 @@ static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, } static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, + const struct mptcp_addr_info *addr, bool force) { struct mptcp_rm_list list = { .nr = 0 }; @@ -1314,11 +1359,12 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, } static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, - struct mptcp_addr_info *addr) + const struct mptcp_pm_addr_entry *entry) { - struct mptcp_sock *msk; - long s_slot = 0, s_num = 0; + const struct mptcp_addr_info *addr = &entry->addr; struct mptcp_rm_list list = { .nr = 0 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; pr_debug("remove_id=%d", addr->id); @@ -1335,7 +1381,8 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, lock_sock(sk); remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); - mptcp_pm_remove_anno_addr(msk, addr, remove_subflow); + mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && + !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); if (remove_subflow) mptcp_pm_remove_subflow(msk, &list); release_sock(sk); @@ -1348,14 +1395,6 @@ next: return 0; } -/* caller must ensure the RCU grace period is already elapsed */ -static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) -{ - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); -} - static int mptcp_nl_remove_id_zero_address(struct net *net, struct mptcp_addr_info *addr) { @@ -1432,7 +1471,7 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); - mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr); + mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); synchronize_rcu(); __mptcp_pm_release_addr_entry(entry); @@ -1447,14 +1486,12 @@ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, list_for_each_entry(entry, rm_list, list) { if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - alist.nr < MPTCP_RM_IDS_MAX && - slist.nr < MPTCP_RM_IDS_MAX) { - alist.ids[alist.nr++] = entry->addr.id; + slist.nr < MPTCP_RM_IDS_MAX) slist.ids[slist.nr++] = entry->addr.id; - } else if (remove_anno_list_by_saddr(msk, &entry->addr) && - alist.nr < MPTCP_RM_IDS_MAX) { + + if (remove_anno_list_by_saddr(msk, &entry->addr) && + alist.nr < MPTCP_RM_IDS_MAX) alist.ids[alist.nr++] = entry->addr.id; - } } if (alist.nr) { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f60f01b14fac..101aeebeb9eb 100644 
--- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -117,6 +117,9 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; + + /* This is the first subflow, always with id 0 */ + subflow->local_id_valid = 1; mptcp_sock_graft(msk->first, sk->sk_socket); return 0; @@ -466,9 +469,12 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) static void mptcp_set_datafin_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + u32 retransmits; + + retransmits = min_t(u32, icsk->icsk_retransmits, + ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); - mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, - TCP_RTO_MIN << icsk->icsk_retransmits); + mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) @@ -1353,6 +1359,7 @@ alloc_skb: out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); + trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; } @@ -3294,6 +3301,17 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) return 0; delta = msk->write_seq - v; + if (__mptcp_check_fallback(msk) && msk->first) { + struct tcp_sock *tp = tcp_sk(msk->first); + + /* the first subflow is disconnected after close - see + * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq + * so ignore that status, too. + */ + if (!((1 << msk->first->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) + delta += READ_ONCE(tp->write_seq) - tp->snd_una; + } if (delta > INT_MAX) delta = INT_MAX; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 85317ce38e3f..3c1a3036550f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -442,7 +442,8 @@ struct mptcp_subflow_context { rx_eof : 1, can_ack : 1, /* only after processing the remote a key */ disposable : 1, /* ctx can be free at ulp release time */ - stale : 1; /* unable to snd/rcv data, do not use for xmit */ + stale : 1, /* unable to snd/rcv data, do not use for xmit */ + local_id_valid : 1; /* local_id is correctly initialized */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -468,9 +469,7 @@ struct mptcp_subflow_context { struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; - void (*tcp_data_ready)(struct sock *sk); void (*tcp_state_change)(struct sock *sk); - void (*tcp_write_space)(struct sock *sk); void (*tcp_error_report)(struct sock *sk); struct rcu_head rcu; @@ -614,9 +613,9 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { - sk->sk_data_ready = ctx->tcp_data_ready; + sk->sk_data_ready = sock_def_readable; sk->sk_state_change = ctx->tcp_state_change; - sk->sk_write_space = ctx->tcp_write_space; + sk->sk_write_space = sk_stream_write_space; sk->sk_error_report = ctx->tcp_error_report; inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; @@ -643,8 +642,7 @@ int __init mptcp_proto_v6_init(void); struct sock *mptcp_sk_clone(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct request_sock *req); -void mptcp_get_options(const struct sock *sk, - const struct sk_buff *skb, +void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); @@ -743,7 +741,7 @@ void 
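Editorial aside: the mptcp_set_datafin_timeout() hunk above bounds the exponential-backoff shift count before applying it, instead of shifting first and clamping the result with min(); an unbounded shift count is undefined once it reaches the width of the type. A standalone sketch of the same arithmetic follows; HZ, TCP_RTO_MIN and TCP_RTO_MAX are assumed to be the common values (HZ=1000, HZ/5, 120*HZ), only the clamp logic is the point.

/* datafin_backoff.c - clamp the backoff shift count, then shift. */
#include <stdio.h>
#include <stdint.h>

#define HZ		1000u
#define TCP_RTO_MIN	(HZ / 5)	/* 200 ms in jiffies (assumption) */
#define TCP_RTO_MAX	(120u * HZ)	/* 120 s in jiffies (assumption) */

static unsigned int ilog2_u32(uint32_t v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int max_shift = ilog2_u32(TCP_RTO_MAX / TCP_RTO_MIN);

	printf("max shift: %u (TCP_RTO_MIN << %u = %u jiffies <= TCP_RTO_MAX = %u)\n",
	       max_shift, max_shift, TCP_RTO_MIN << max_shift, TCP_RTO_MAX);

	for (unsigned int retransmits = 0; retransmits <= 12; retransmits++) {
		unsigned int shift = retransmits < max_shift ? retransmits : max_shift;

		printf("retransmits=%2u -> timer_ival=%6u jiffies\n",
		       retransmits, TCP_RTO_MIN << shift);
	}
	return 0;
}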
mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, void mptcp_pm_add_addr_received(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, - struct mptcp_addr_info *addr); + const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, @@ -754,10 +752,10 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, bool check_id); + const struct mptcp_addr_info *addr, bool check_id); struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr); +mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, u8 *flags, int *ifindex); @@ -816,10 +814,10 @@ static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; } -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, - bool *port, bool *drop_other_suboptions); + bool *drop_other_suboptions); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); @@ -830,10 +828,10 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index dacf3cee0027..f949d22f52bd 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -343,6 +343,8 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_RCVLOWAT: case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: case SO_BUSY_POLL: case SO_PREFER_BUSY_POLL: case SO_BUSY_POLL_BUDGET: diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bea47a1180dc..aba260f547da 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -153,7 +153,7 @@ static int subflow_check_req(struct request_sock *req, return -EINVAL; #endif - mptcp_get_options(sk_listener, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & 
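Editorial aside: the sockopt.c hunk adds SO_SNDTIMEO_OLD/NEW to the same accepted-options branch that already covered SO_RCVTIMEO, so setting a send timeout on the MPTCP-level socket is no longer rejected. A minimal userspace sketch, assuming IPPROTO_MPTCP (protocol number 262) is available on the running kernel; error handling is trimmed to the essentials.

/* sndtimeo_mptcp.c - set a send timeout on an MPTCP socket. */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <netinet/in.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* may be missing from older headers */
#endif

int main(void)
{
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0) {
		perror("socket(IPPROTO_MPTCP)");
		return 1;
	}

	/* with this change the MPTCP socket accepts the option */
	if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) < 0) {
		perror("setsockopt(SO_SNDTIMEO)");
		close(fd);
		return 1;
	}

	printf("SO_SNDTIMEO set to %ld s on MPTCP socket\n", (long)tv.tv_sec);
	close(fd);
	return 0;
}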
OPTIONS_MPTCP_MPC); opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); @@ -250,7 +250,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, int err; subflow_init_req(req, sk_listener); - mptcp_get_options(sk_listener, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); @@ -344,9 +344,7 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) thmac = get_unaligned_be64(hmac); pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n", - subflow, subflow->token, - (unsigned long long)thmac, - (unsigned long long)subflow->thmac); + subflow, subflow->token, thmac, subflow->thmac); return thmac == subflow->thmac; } @@ -410,7 +408,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); - mptcp_get_options(sk, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { MPTCP_INC_STATS(sock_net(sk), @@ -483,9 +481,53 @@ do_reset: mptcp_subflow_reset(sk); } +static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) +{ + subflow->local_id = local_id; + subflow->local_id_valid = 1; +} + +static int subflow_chk_local_id(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + int err; + + if (likely(subflow->local_id_valid)) + return 0; + + err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); + if (err < 0) + return err; + + subflow_set_local_id(subflow, err); + return 0; +} + +static int subflow_rebuild_header(struct sock *sk) +{ + int err = subflow_chk_local_id(sk); + + if (unlikely(err < 0)) + return err; + + return inet_sk_rebuild_header(sk); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int subflow_v6_rebuild_header(struct sock *sk) +{ + int err = subflow_chk_local_id(sk); + + if (unlikely(err < 0)) + return err; + + return inet6_sk_rebuild_header(sk); +} +#endif + struct request_sock_ops mptcp_subflow_request_sock_ops; -EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops); -static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; +static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { @@ -506,9 +548,9 @@ drop: } #if IS_ENABLED(CONFIG_MPTCP_IPV6) -static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; -static struct inet_connection_sock_af_ops subflow_v6_specific; -static struct inet_connection_sock_af_ops subflow_v6m_specific; +static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; +static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; +static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; static struct proto tcpv6_prot_override; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) @@ -663,7 +705,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * reordered MPC will cause fallback, but we don't have other * options. 
*/ - mptcp_get_options(sk, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { fallback = true; goto create_child; @@ -673,7 +715,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { - mptcp_get_options(sk, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { @@ -790,7 +832,7 @@ dispose_child: return child; } -static struct inet_connection_sock_af_ops subflow_specific; +static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; static struct proto tcp_prot_override; enum mapping_status { @@ -1107,7 +1149,7 @@ static bool subflow_check_data_avail(struct sock *ssk) struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); if (subflow->data_avail) return true; @@ -1172,7 +1214,7 @@ fallback: subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return true; } @@ -1185,7 +1227,7 @@ fallback: subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return false; } @@ -1207,7 +1249,7 @@ bool mptcp_subflow_data_available(struct sock *sk) if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); pr_debug("Done with mapping: seq=%u data_len=%u", subflow->map_subflow_seq, @@ -1311,7 +1353,7 @@ static void subflow_write_space(struct sock *ssk) mptcp_write_space(sk); } -static struct inet_connection_sock_af_ops * +static const struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1326,7 +1368,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_connection_sock_af_ops *target; + const struct inet_connection_sock_af_ops *target; target = mapped ? 
&subflow_v6m_specific : subflow_default_af_ops(sk); @@ -1401,13 +1443,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); - if (!local_id) { - err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk); - if (err < 0) - goto failed; - - local_id = err; - } + if (local_id) + subflow_set_local_id(subflow, local_id); mptcp_pm_get_flags_and_ifindex_by_id(sock_net(sk), local_id, &flags, &ifindex); @@ -1432,7 +1469,6 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; - subflow->local_id = local_id; subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); @@ -1657,10 +1693,12 @@ static int subflow_ulp_init(struct sock *sk) tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = subflow_default_af_ops(sk); - ctx->tcp_data_ready = sk->sk_data_ready; ctx->tcp_state_change = sk->sk_state_change; - ctx->tcp_write_space = sk->sk_write_space; ctx->tcp_error_report = sk->sk_error_report; + + WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable); + WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space); + sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; @@ -1715,9 +1753,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; - new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; new_ctx->tcp_state_change = old_ctx->tcp_state_change; - new_ctx->tcp_write_space = old_ctx->tcp_write_space; new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk; @@ -1731,15 +1767,22 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->token = subflow_req->token; new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->idsn = subflow_req->idsn; + + /* this is the first subflow, id is always 0 */ + new_ctx->local_id_valid = 1; } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; new_ctx->fully_established = 1; new_ctx->backup = subflow_req->backup; - new_ctx->local_id = subflow_req->local_id; new_ctx->remote_id = subflow_req->remote_id; new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; + + /* the subflow req id is valid, fetched via subflow_check_req() + * and subflow_token_join_request() + */ + subflow_set_local_id(new_ctx, subflow_req->local_id); } } @@ -1792,6 +1835,7 @@ void __init mptcp_subflow_init(void) subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_specific.rebuild_header = subflow_rebuild_header; tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; @@ -1804,6 +1848,7 @@ void __init mptcp_subflow_init(void) subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; @@ -1811,6 +1856,7 @@ void __init 
mptcp_subflow_init(void) subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.net_frag_header_len = 0; + subflow_v6m_specific.rebuild_header = subflow_rebuild_header; tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index d1c9dfbb11fa..9a4feb922cf6 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -428,14 +428,15 @@ static int __nf_register_net_hook(struct net *net, int pf, p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); - if (!IS_ERR(new_hooks)) + if (!IS_ERR(new_hooks)) { + hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); + } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); - hooks_validate(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index d2e5a8f644b8..029171379884 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -610,7 +610,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, nf_reset_ct(skb); skb_forward_csum(skb); if (skb->dev) - skb->tstamp = 0; + skb_clear_tstamp(skb); } return ret; } @@ -652,7 +652,7 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, if (!local) { skb_forward_csum(skb); if (skb->dev) - skb->tstamp = 0; + skb_clear_tstamp(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -674,7 +674,7 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); if (skb->dev) - skb->tstamp = 0; + skb_clear_tstamp(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a579e59ee5c5..7873bd1389c3 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -19,7 +19,7 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) skb_push(skb, skb->mac_len); skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); dev_queue_xmit(skb); } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 889cf88d3dba..f1d387129f02 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -376,7 +376,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, nf_flow_nat_ip(flow, skb, thoff, dir, iph); ip_decrease_ttl(iph); - skb->tstamp = 0; + skb_clear_tstamp(skb); if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); @@ -611,7 +611,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, nf_flow_nat_ipv6(flow, skb, dir, ip6h); ip6h->hop_limit--; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b561e0a44a45..fc4265acd9c4 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -110,7 +110,11 @@ static int nf_flow_rule_match(struct nf_flow_match *match, nf_flow_rule_lwt_match(match, tun_info); } - key->meta.ingress_ifindex = tuple->iifidx; + if (tuple->xmit_type == 
FLOW_OFFLOAD_XMIT_TC) + key->meta.ingress_ifindex = tuple->tc.iifidx; + else + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) && diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 6d12afabfe8a..63d1516816b1 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,6 +46,15 @@ void nf_unregister_queue_handler(void) } EXPORT_SYMBOL(nf_unregister_queue_handler); +static void nf_queue_sock_put(struct sock *sk) +{ +#ifdef CONFIG_INET + sock_gen_put(sk); +#else + sock_put(sk); +#endif +} + static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -54,7 +63,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(state->in); dev_put(state->out); if (state->sk) - sock_put(state->sk); + nf_queue_sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_put(entry->physin); @@ -87,19 +96,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) } /* Bump dev refs so they don't vanish while packet is out */ -void nf_queue_entry_get_refs(struct nf_queue_entry *entry) +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; + if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) + return false; + dev_hold(state->in); dev_hold(state->out); - if (state->sk) - sock_hold(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_hold(entry->physin); dev_hold(entry->physout); #endif + return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -169,6 +180,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, break; } + if (skb_sk_is_prefetched(skb)) { + struct sock *sk = skb->sk; + + if (!sk_is_refcounted(sk)) { + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + return -ENOTCONN; + + /* drop refcount on skb_orphan */ + skb->destructor = sock_edemux; + } + } + entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); if (!entry) return -ENOMEM; @@ -187,7 +210,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, __nf_queue_entry_init_physdevs(entry); - nf_queue_entry_get_refs(entry); + if (!nf_queue_entry_get_refs(entry)) { + kfree(entry); + return -ENOTCONN; + } switch (entry->state.pf) { case AF_INET: diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5fa16990da95..c86748b3873b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4502,7 +4502,7 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); nft_set_elem_destroy(set, catchall->elem, true); - kfree_rcu(catchall); + kfree_rcu(catchall, rcu); } } @@ -5669,7 +5669,7 @@ static void nft_setelem_catchall_remove(const struct net *net, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { if (catchall->elem == elem->priv) { list_del_rcu(&catchall->list); - kfree_rcu(catchall); + kfree_rcu(catchall, rcu); break; } } @@ -6551,12 +6551,15 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, { struct nft_object *newobj; struct nft_trans *trans; - int err; + int err = -ENOMEM; + + if (!try_module_get(type->owner)) + return -ENOENT; trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, sizeof(struct nft_trans_obj)); if (!trans) - return -ENOMEM; + goto err_trans; newobj = nft_obj_init(ctx, type, attr); if 
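Editorial aside: nf_queue_entry_get_refs() can now fail, because it only keeps the socket when refcount_inc_not_zero() succeeds, so a queued packet can no longer pin a socket whose last reference has already been dropped. The kernel helper works on refcount_t; below is a userspace analogue of the same "take a reference only if the object is still live" idiom built on C11 atomics (the names and types here are illustrative, not the kernel API).

/* inc_not_zero.c - the "get a reference only if still alive" idiom. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_uint refcnt;
};

/* increment refcnt unless it already dropped to zero */
static bool obj_get_unless_zero(struct obj *o)
{
	unsigned int old = atomic_load(&o->refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
			return true;
		/* old was reloaded by the failed CAS, retry */
	}
	return false;
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
		printf("last reference dropped, object may be freed\n");
}

int main(void)
{
	struct obj live = { .refcnt = 2 };
	struct obj dying = { .refcnt = 0 };

	printf("live:  got ref? %s\n", obj_get_unless_zero(&live) ? "yes" : "no");
	printf("dying: got ref? %s\n", obj_get_unless_zero(&dying) ? "yes" : "no");

	obj_put(&live);	/* release the reference we just took */
	return 0;
}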
(IS_ERR(newobj)) { @@ -6573,6 +6576,8 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, err_free_trans: kfree(trans); +err_trans: + module_put(type->owner); return err; } @@ -8185,7 +8190,7 @@ static void nft_obj_commit_update(struct nft_trans *trans) if (obj->ops->update) obj->ops->update(obj, newobj); - kfree(newobj); + nft_obj_destroy(&trans->ctx, newobj); } static void nft_commit_release(struct nft_trans *trans) @@ -8976,7 +8981,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { - kfree(nft_trans_obj_newobj(trans)); + nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans)); nft_trans_destroy(trans); } else { trans->ctx.table->use--; @@ -9636,10 +9641,13 @@ EXPORT_SYMBOL_GPL(__nft_release_basechain); static void __nft_release_hook(struct net *net, struct nft_table *table) { + struct nft_flowtable *flowtable; struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) nf_tables_unregister_hook(net, table, chain); + list_for_each_entry(flowtable, &table->flowtables, list) + nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list); } static void __nft_release_hooks(struct net *net) diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 9656c1646222..2d36952b1392 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -94,7 +94,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { - if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) + if (expr->ops->offload_action && + expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index ae9c0756bba5..d97eb280cb2e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -460,6 +460,7 @@ __build_packet_message(struct nfnl_log_net *log, sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; + ktime_t tstamp; nlh = nfnl_msg_put(inst->skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), @@ -588,9 +589,10 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; } - if (hooknum <= NF_INET_FORWARD && skb->tstamp) { + tstamp = skb_tstamp_cond(skb, false); + if (hooknum <= NF_INET_FORWARD && tstamp) { struct nfulnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(skb->tstamp); + struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 8c15978d9258..a364f8e5e698 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -392,6 +392,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, bool csum_verify; char *secdata = NULL; u32 seclen = 0; + ktime_t tstamp; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) @@ -407,7 +408,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ + nla_total_size(sizeof(u_int32_t)); /* cap_len */ - if (entskb->tstamp) + tstamp = skb_tstamp_cond(entskb, false); + if (tstamp) size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); size += nfqnl_get_bridge_size(entry); @@ -582,9 +584,9 @@ 
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (nfqnl_put_bridge(entry, skb) < 0) goto nla_put_failure; - if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) { + if (entry->state.hook <= NF_INET_FORWARD && tstamp) { struct nfqnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); + struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); @@ -715,9 +717,15 @@ static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) - nf_queue_entry_get_refs(entry); - return entry; + + if (!entry) + return NULL; + + if (nf_queue_entry_get_refs(entry)) + return entry; + + kfree(entry); + return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index bbf3fcba3df4..5b5c607fbf83 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -67,6 +67,11 @@ static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif); } +static bool nft_dup_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + static struct nft_expr_type nft_dup_netdev_type; static const struct nft_expr_ops nft_dup_netdev_ops = { .type = &nft_dup_netdev_type, @@ -75,6 +80,7 @@ static const struct nft_expr_ops nft_dup_netdev_ops = { .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, .offload = nft_dup_netdev_offload, + .offload_action = nft_dup_netdev_offload_action, }; static struct nft_expr_type nft_dup_netdev_type __read_mostly = { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index fa9301ca6033..08e7a289738e 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -79,6 +79,11 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif); } +static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + struct nft_fwd_neigh { u8 sreg_dev; u8 sreg_addr; @@ -140,7 +145,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, return; skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); neigh_xmit(neigh_table, dev, addr, skb); out: regs->verdict.code = verdict; @@ -222,6 +227,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .dump = nft_fwd_netdev_dump, .validate = nft_fwd_validate, .offload = nft_fwd_netdev_offload, + .offload_action = nft_fwd_netdev_offload_action, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 90c64d27ae53..d0f67d325bdf 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -213,6 +213,16 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_immediate_offload_action(const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + return true; + + return false; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -224,7 +234,7 @@ static const struct nft_expr_ops nft_imm_ops = { .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, - .offload_flags = 
NFT_OFFLOAD_F_ACTION, + .offload_action = nft_immediate_offload_action, }; struct nft_expr_type nft_imm_type __read_mostly = { diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index c4f308460dd1..a726b623963d 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -340,11 +340,20 @@ static int nft_limit_obj_pkts_dump(struct sk_buff *skb, return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } +static void nft_limit_obj_pkts_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_limit_priv_pkts *priv = nft_obj_data(obj); + + nft_limit_destroy(ctx, &priv->limit); +} + static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_pkts_ops = { .type = &nft_limit_obj_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .init = nft_limit_obj_pkts_init, + .destroy = nft_limit_obj_pkts_destroy, .eval = nft_limit_obj_pkts_eval, .dump = nft_limit_obj_pkts_dump, }; @@ -378,11 +387,20 @@ static int nft_limit_obj_bytes_dump(struct sk_buff *skb, return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } +static void nft_limit_obj_bytes_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_limit_priv *priv = nft_obj_data(obj); + + nft_limit_destroy(ctx, priv); +} + static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_bytes_ops = { .type = &nft_limit_obj_type, .size = sizeof(struct nft_limit_priv), .init = nft_limit_obj_bytes_init, + .destroy = nft_limit_obj_bytes_destroy, .eval = nft_limit_obj_bytes_eval, .dump = nft_limit_obj_bytes_dump, }; diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index a0109fa1e92d..1133e06f3c40 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -191,8 +191,10 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx, if (err) goto nf_ct_failure; err = nf_synproxy_ipv6_init(snet, ctx->net); - if (err) + if (err) { + nf_synproxy_ipv4_fini(snet, ctx->net); goto nf_ct_failure; + } break; } diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 5e6459e11605..7013f55f05d1 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -220,8 +220,10 @@ static void socket_mt_destroy(const struct xt_mtdtor_param *par) { if (par->family == NFPROTO_IPV4) nf_defrag_ipv4_disable(par->net); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) else if (par->family == NFPROTO_IPV6) - nf_defrag_ipv4_disable(par->net); + nf_defrag_ipv6_disable(par->net); +#endif } static struct xt_match socket_mt_reg[] __read_mostly = { diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index d49d4bf2e37c..c1d9be636933 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -6,7 +6,6 @@ enum llcp_state { LLCP_CONNECTED = 1, /* wait_for_packet() wants that */ LLCP_CONNECTING, - LLCP_DISCONNECTING, LLCP_CLOSED, LLCP_BOUND, LLCP_LISTEN, diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 5ad5157aa9c5..3364caabef8b 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -383,7 +383,7 @@ u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local, pr_debug("WKS %d\n", ssap); /* This is a WKS, let's check if it's free */ - if (local->local_wks & BIT(ssap)) { + if (test_bit(ssap, &local->local_wks)) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; @@ -737,13 +737,6 @@ static void nfc_llcp_tx_work(struct work_struct *work) print_hex_dump_debug("LLCP Tx: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, skb->len, true); - if (ptype == LLCP_PDU_DISC && sk != NULL && - 
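Editorial aside: the offload hunks replace the static NFT_OFFLOAD_F_ACTION ops flag with an offload_action() callback, so an expression decides per instance whether it contributes a hardware action: nft_immediate only does when it targets the verdict register, while dup/fwd always do. A small standalone sketch of that design move follows, with invented expression names standing in for the real ops.

/* offload_action.c - static capability flag vs. per-instance predicate. */
#include <stdbool.h>
#include <stdio.h>

struct expr;

struct expr_ops {
	const char *name;
	/* NULL means "never an action"; otherwise ask the instance */
	bool (*offload_action)(const struct expr *e);
};

struct expr {
	const struct expr_ops *ops;
	int dreg;		/* destination register of this instance */
};

#define REG_VERDICT 0

static bool imm_offload_action(const struct expr *e)
{
	return e->dreg == REG_VERDICT;	/* only verdicts count as actions */
}

static bool fwd_offload_action(const struct expr *e)
{
	return true;			/* forwarding is always an action */
}

static const struct expr_ops imm_ops  = { "immediate", imm_offload_action };
static const struct expr_ops fwd_ops  = { "fwd",       fwd_offload_action };
static const struct expr_ops meta_ops = { "meta",      NULL };

static unsigned int count_actions(const struct expr *rule, unsigned int n)
{
	unsigned int num_actions = 0;

	for (unsigned int i = 0; i < n; i++)
		if (rule[i].ops->offload_action &&
		    rule[i].ops->offload_action(&rule[i]))
			num_actions++;
	return num_actions;
}

int main(void)
{
	struct expr rule[] = {
		{ &meta_ops, 1 },		/* load, not an action */
		{ &imm_ops,  2 },		/* immediate into a data reg */
		{ &imm_ops,  REG_VERDICT },	/* immediate verdict: action */
		{ &fwd_ops,  0 },		/* always an action */
	};

	printf("actions to allocate: %u\n",
	       count_actions(rule, sizeof(rule) / sizeof(rule[0])));
	return 0;
}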
sk->sk_state == LLCP_DISCONNECTING) { - nfc_llcp_sock_unlink(&local->sockets, sk); - sock_orphan(sk); - sock_put(sk); - } - if (ptype == LLCP_PDU_I) copy_skb = skb_copy(skb, GFP_ATOMIC); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 0b93a17b9f11..4ca35791c93b 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -108,21 +108,13 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { - nfc_llcp_local_put(llcp_sock->local); - llcp_sock->local = NULL; - llcp_sock->dev = NULL; ret = -ENOMEM; - goto put_dev; + goto sock_llcp_put_local; } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { - nfc_llcp_local_put(llcp_sock->local); - llcp_sock->local = NULL; - kfree(llcp_sock->service_name); - llcp_sock->service_name = NULL; - llcp_sock->dev = NULL; ret = -EADDRINUSE; - goto put_dev; + goto free_service_name; } llcp_sock->reserved_ssap = llcp_sock->ssap; @@ -132,6 +124,19 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) pr_debug("Socket bound to SAP %d\n", llcp_sock->ssap); sk->sk_state = LLCP_BOUND; + nfc_put_device(dev); + release_sock(sk); + + return 0; + +free_service_name: + kfree(llcp_sock->service_name); + llcp_sock->service_name = NULL; + +sock_llcp_put_local: + nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; + llcp_sock->dev = NULL; put_dev: nfc_put_device(dev); @@ -626,23 +631,16 @@ static int llcp_sock_release(struct socket *sock) } } - if (llcp_sock->reserved_ssap < LLCP_SAP_MAX) - nfc_llcp_put_ssap(llcp_sock->local, llcp_sock->ssap); - - release_sock(sk); - - /* Keep this sock alive and therefore do not remove it from the sockets - * list until the DISC PDU has been actually sent. Otherwise we would - * reply with DM PDUs before sending the DISC one. 
- */ - if (sk->sk_state == LLCP_DISCONNECTING) - return err; - if (sock->type == SOCK_RAW) nfc_llcp_sock_unlink(&local->raw_sockets, sk); else nfc_llcp_sock_unlink(&local->sockets, sk); + if (llcp_sock->reserved_ssap < LLCP_SAP_MAX) + nfc_llcp_put_ssap(llcp_sock->local, llcp_sock->ssap); + + release_sock(sk); + out: sock_orphan(sk); sock_put(sk); @@ -712,10 +710,8 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, llcp_sock->local = nfc_llcp_local_get(local); llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { - nfc_llcp_local_put(llcp_sock->local); - llcp_sock->local = NULL; ret = -ENOMEM; - goto put_dev; + goto sock_llcp_put_local; } llcp_sock->reserved_ssap = llcp_sock->ssap; @@ -760,8 +756,11 @@ sock_unlink: sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); + +sock_llcp_put_local: nfc_llcp_local_put(llcp_sock->local); llcp_sock->local = NULL; + llcp_sock->dev = NULL; put_dev: nfc_put_device(dev); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 076774034bb9..780d9e2246f3 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -423,12 +423,43 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, memcpy(addr, new_addr, sizeof(__be32[4])); } -static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) +static void set_ipv6_dsfield(struct sk_buff *skb, struct ipv6hdr *nh, u8 ipv6_tclass, u8 mask) { + u8 old_ipv6_tclass = ipv6_get_dsfield(nh); + + ipv6_tclass = OVS_MASKED(old_ipv6_tclass, ipv6_tclass, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(old_ipv6_tclass << 12), + (__force __wsum)(ipv6_tclass << 12)); + + ipv6_change_dsfield(nh, ~mask, ipv6_tclass); +} + +static void set_ipv6_fl(struct sk_buff *skb, struct ipv6hdr *nh, u32 fl, u32 mask) +{ + u32 ofl; + + ofl = nh->flow_lbl[0] << 16 | nh->flow_lbl[1] << 8 | nh->flow_lbl[2]; + fl = OVS_MASKED(ofl, fl, mask); + /* Bits 21-24 are always unmasked, so this retains their values. 
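Editorial aside: the llcp_sock_bind()/llcp_sock_connect() rework above replaces cleanup code duplicated in every failure branch with the usual kernel goto-unwind ladder, where each label undoes exactly one successful step in reverse order. A generic standalone sketch of that idiom follows; the resources are plain allocations standing in for the NFC device/local/service-name objects.

/* goto_unwind.c - reverse-order error unwinding, as in the bind() rework. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int setup(const char *name)
{
	char *service_name;
	void *local, *dev;
	int ret;

	dev = malloc(16);			/* step 1: get the device    */
	if (!dev)
		return -1;

	local = malloc(16);			/* step 2: get the local ctx  */
	if (!local) {
		ret = -1;
		goto put_dev;
	}

	service_name = strdup(name);		/* step 3: copy the name      */
	if (!service_name) {
		ret = -1;
		goto put_local;
	}

	printf("bound to \"%s\"\n", service_name);
	/* a real bind would keep these; freed here only to avoid leaks */
	free(service_name);
	free(local);
	free(dev);
	return 0;

	/* each label releases exactly what the steps above it acquired */
put_local:
	free(local);
put_dev:
	free(dev);
	return ret;
}

int main(void)
{
	return setup("urn:nfc:sn:example") ? EXIT_FAILURE : EXIT_SUCCESS;
}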
*/ - OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); + nh->flow_lbl[0] = (u8)(fl >> 16); + nh->flow_lbl[1] = (u8)(fl >> 8); + nh->flow_lbl[2] = (u8)fl; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)htonl(ofl), (__force __wsum)htonl(fl)); +} + +static void set_ipv6_ttl(struct sk_buff *skb, struct ipv6hdr *nh, u8 new_ttl, u8 mask) +{ + new_ttl = OVS_MASKED(nh->hop_limit, new_ttl, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(nh->hop_limit << 8), + (__force __wsum)(new_ttl << 8)); + nh->hop_limit = new_ttl; } static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, @@ -546,18 +577,17 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, } } if (mask->ipv6_tclass) { - ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass); + set_ipv6_dsfield(skb, nh, key->ipv6_tclass, mask->ipv6_tclass); flow_key->ip.tos = ipv6_get_dsfield(nh); } if (mask->ipv6_label) { - set_ipv6_fl(nh, ntohl(key->ipv6_label), + set_ipv6_fl(skb, nh, ntohl(key->ipv6_label), ntohl(mask->ipv6_label)); flow_key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); } if (mask->ipv6_hlimit) { - OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, - mask->ipv6_hlimit); + set_ipv6_ttl(skb, nh, key->ipv6_hlimit, mask->ipv6_hlimit); flow_key->ip.ttl = nh->hop_limit; } return 0; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index f6cd24fd530c..372bf54a0ca9 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -241,6 +241,144 @@ static bool icmphdr_ok(struct sk_buff *skb) sizeof(struct icmphdr)); } +/** + * get_ipv6_ext_hdrs() - Parses packet and sets IPv6 extension header flags. + * + * @skb: buffer where extension header data starts in packet + * @nh: ipv6 header + * @ext_hdrs: flags are stored here + * + * OFPIEH12_UNREP is set if more than one of a given IPv6 extension header + * is unexpectedly encountered. (Two destination options headers may be + * expected and would not cause this bit to be set.) 
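Editorial aside: the set_ipv6_dsfield()/set_ipv6_fl()/set_ipv6_ttl() helpers above now patch skb->csum with csum_replace() when the packet arrived with CHECKSUM_COMPLETE, so rewriting the traffic class, flow label or hop limit keeps the stored 1's-complement sum consistent. The sketch below shows only the underlying arithmetic (RFC 1624-style incremental update) on a plain buffer and checks it against a full recomputation; it is not the kernel csum API.

/* csum_replace.c - incremental update of a 16-bit one's-complement sum. */
#include <stdint.h>
#include <stdio.h>

/* full 16-bit one's-complement sum over a buffer (even length) */
static uint16_t csum_full(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	for (size_t i = 0; i < len; i += 2)
		sum += (uint32_t)buf[i] << 8 | buf[i + 1];
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* fold a 32-bit accumulator into 16 bits */
static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* replace a 16-bit word and patch the running sum:
 * new_sum = old_sum - old_word + new_word, in one's-complement arithmetic,
 * where subtracting old_word is the same as adding its complement.
 */
static uint16_t csum_replace16(uint16_t sum, uint16_t old, uint16_t new_word)
{
	uint32_t acc = sum;

	acc += (uint16_t)~old;		/* remove the old contribution */
	acc += new_word;		/* add the new contribution */
	return fold(acc);
}

int main(void)
{
	uint8_t pkt[8] = { 0x60, 0x01, 0x23, 0x45, 0x00, 0x10, 0x3a, 0x40 };
	uint16_t sum = csum_full(pkt, sizeof(pkt));

	/* rewrite the last byte of word 3 (think: hop limit) from 0x40 to 0x3f */
	uint16_t old_word = (uint16_t)pkt[6] << 8 | pkt[7];

	pkt[7] = 0x3f;
	uint16_t new_word = (uint16_t)pkt[6] << 8 | pkt[7];

	sum = csum_replace16(sum, old_word, new_word);

	printf("incremental: 0x%04x  recomputed: 0x%04x  %s\n",
	       sum, csum_full(pkt, sizeof(pkt)),
	       sum == csum_full(pkt, sizeof(pkt)) ? "match" : "MISMATCH");
	return 0;
}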
+ * + * OFPIEH12_UNSEQ is set if IPv6 extension headers were not in the order + * preferred (but not required) by RFC 2460: + * + * When more than one extension header is used in the same packet, it is + * recommended that those headers appear in the following order: + * IPv6 header + * Hop-by-Hop Options header + * Destination Options header + * Routing header + * Fragment header + * Authentication header + * Encapsulating Security Payload header + * Destination Options header + * upper-layer header + */ +static void get_ipv6_ext_hdrs(struct sk_buff *skb, struct ipv6hdr *nh, + u16 *ext_hdrs) +{ + u8 next_type = nh->nexthdr; + unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + int dest_options_header_count = 0; + + *ext_hdrs = 0; + + while (ipv6_ext_hdr(next_type)) { + struct ipv6_opt_hdr _hdr, *hp; + + switch (next_type) { + case IPPROTO_NONE: + *ext_hdrs |= OFPIEH12_NONEXT; + /* stop parsing */ + return; + + case IPPROTO_ESP: + if (*ext_hdrs & OFPIEH12_ESP) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | OFPIEH12_DEST | + OFPIEH12_ROUTER | IPPROTO_FRAGMENT | + OFPIEH12_AUTH | OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_ESP; + break; + + case IPPROTO_AH: + if (*ext_hdrs & OFPIEH12_AUTH) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_DEST | OFPIEH12_ROUTER | + IPPROTO_FRAGMENT | OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_AUTH; + break; + + case IPPROTO_DSTOPTS: + if (dest_options_header_count == 0) { + if (*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_UNREP)) + *ext_hdrs |= OFPIEH12_UNSEQ; + *ext_hdrs |= OFPIEH12_DEST; + } else if (dest_options_header_count == 1) { + if (*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_DEST | + OFPIEH12_ROUTER | OFPIEH12_FRAG | + OFPIEH12_AUTH | OFPIEH12_ESP | + OFPIEH12_UNREP)) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + } else { + *ext_hdrs |= OFPIEH12_UNREP; + } + dest_options_header_count++; + break; + + case IPPROTO_FRAGMENT: + if (*ext_hdrs & OFPIEH12_FRAG) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | + OFPIEH12_DEST | + OFPIEH12_ROUTER | + OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_FRAG; + break; + + case IPPROTO_ROUTING: + if (*ext_hdrs & OFPIEH12_ROUTER) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | + OFPIEH12_DEST | + OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_ROUTER; + break; + + case IPPROTO_HOPOPTS: + if (*ext_hdrs & OFPIEH12_HOP) + *ext_hdrs |= OFPIEH12_UNREP; + /* OFPIEH12_HOP is set to 1 if a hop-by-hop IPv6 + * extension header is present as the first + * extension header in the packet. 
+ */ + if (*ext_hdrs == 0) + *ext_hdrs |= OFPIEH12_HOP; + else + *ext_hdrs |= OFPIEH12_UNSEQ; + break; + + default: + return; + } + + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (!hp) + break; + next_type = hp->nexthdr; + start += ipv6_optlen(hp); + } +} + static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { unsigned short frag_off; @@ -256,6 +394,8 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) nh = ipv6_hdr(skb); + get_ipv6_ext_hdrs(skb, nh, &key->ipv6.exthdrs); + key->ip.proto = NEXTHDR_NONE; key->ip.tos = ipv6_get_dsfield(nh); key->ip.ttl = nh->hop_limit; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 758a8c77f736..073ab73ffeaa 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -32,6 +32,19 @@ enum sw_flow_mac_proto { #define SW_FLOW_KEY_INVALID 0x80 #define MPLS_LABEL_DEPTH 3 +/* Bit definitions for IPv6 Extension Header pseudo-field. */ +enum ofp12_ipv6exthdr_flags { + OFPIEH12_NONEXT = 1 << 0, /* "No next header" encountered. */ + OFPIEH12_ESP = 1 << 1, /* Encrypted Sec Payload header present. */ + OFPIEH12_AUTH = 1 << 2, /* Authentication header present. */ + OFPIEH12_DEST = 1 << 3, /* 1 or 2 dest headers present. */ + OFPIEH12_FRAG = 1 << 4, /* Fragment header present. */ + OFPIEH12_ROUTER = 1 << 5, /* Router header present. */ + OFPIEH12_HOP = 1 << 6, /* Hop-by-hop header present. */ + OFPIEH12_UNREP = 1 << 7, /* Unexpected repeats encountered. */ + OFPIEH12_UNSEQ = 1 << 8 /* Unexpected sequencing encountered. */ +}; + /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. @@ -121,6 +134,7 @@ struct sw_flow_key { struct in6_addr dst; /* IPv6 destination address. */ } addr; __be32 label; /* IPv6 flow label. */ + u16 exthdrs; /* IPv6 extension header flags */ union { struct { struct in6_addr src; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index fd1f809e9bc1..5176f6ccac8e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -346,7 +346,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. 
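Editorial aside: get_ipv6_ext_hdrs() above walks the extension-header chain starting from the fixed IPv6 header and accumulates the OFPIEH12_* presence/unexpected-repeat/unexpected-order bits defined in flow.h. Below is a cut-down standalone walker in the same spirit, operating on a hand-built buffer: it only tracks presence and repeats for Hop-by-Hop, Routing, Fragment and Destination Options (AH/ESP and the ordering rules are left out), and the flag values simply mirror the enum above.

/* exthdr_walk.c - walk an IPv6 extension-header chain and set flags. */
#include <stdint.h>
#include <stdio.h>

#define FL_HOP    (1 << 6)
#define FL_DEST   (1 << 3)
#define FL_FRAG   (1 << 4)
#define FL_ROUTER (1 << 5)
#define FL_UNREP  (1 << 7)

static int is_ext_hdr(uint8_t nh)
{
	/* hop-by-hop, routing, fragment, destination options */
	return nh == 0 || nh == 43 || nh == 44 || nh == 60;
}

static uint16_t walk(const uint8_t *pkt, size_t len)
{
	uint16_t flags = 0;
	uint8_t nh = pkt[6];		/* next-header field of the IPv6 header */
	size_t off = 40;		/* fixed IPv6 header size */

	while (is_ext_hdr(nh) && off + 8 <= len) {
		uint16_t bit = 0;

		switch (nh) {
		case 0:  bit = FL_HOP;    break;
		case 43: bit = FL_ROUTER; break;
		case 44: bit = FL_FRAG;   break;
		case 60: bit = FL_DEST;   break;
		}
		if (flags & bit)
			flags |= FL_UNREP;	/* unexpected repeat */
		flags |= bit;

		nh = pkt[off];			/* next header of this ext header */
		off += ((size_t)pkt[off + 1] + 1) * 8;
	}
	return flags;
}

int main(void)
{
	uint8_t pkt[64] = { 0 };

	pkt[0] = 0x60;		/* version 6 */
	pkt[6] = 0;		/* IPv6 header -> hop-by-hop */
	pkt[40] = 60;		/* HBH -> destination options */
	pkt[41] = 0;		/* HBH length: 8 bytes */
	pkt[48] = 44;		/* DEST -> fragment */
	pkt[49] = 0;
	pkt[56] = 6;		/* FRAG -> TCP, upper layer: stop */
	pkt[57] = 0;		/* reserved in the fragment header */

	printf("ext header flags: 0x%03x (expect HOP|DEST|FRAG = 0x%03x)\n",
	       walk(pkt, sizeof(pkt)), FL_HOP | FL_DEST | FL_FRAG);
	return 0;
}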
*/ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29); + BUILD_BUG_ON(OVS_KEY_ATTR_MAX != 32); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -369,7 +369,8 @@ size_t ovs_key_attr_size(void) + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ - + nla_total_size(28); /* OVS_KEY_ATTR_ND */ + + nla_total_size(28) /* OVS_KEY_ATTR_ND */ + + nla_total_size(2); /* OVS_KEY_ATTR_IPV6_EXTHDRS */ } static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { @@ -437,6 +438,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, [OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED, .next = ovs_nsh_key_attr_lens, }, + [OVS_KEY_ATTR_IPV6_EXTHDRS] = { + .len = sizeof(struct ovs_key_ipv6_exthdrs) }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -479,7 +482,14 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, return -EINVAL; } - if (attrs & (1 << type)) { + if (type == OVS_KEY_ATTR_PACKET_TYPE || + type == OVS_KEY_ATTR_ND_EXTENSIONS || + type == OVS_KEY_ATTR_TUNNEL_INFO) { + OVS_NLERR(log, "Key type %d is not supported", type); + return -EINVAL; + } + + if (attrs & (1ULL << type)) { OVS_NLERR(log, "Duplicate key (type %d).", type); return -EINVAL; } @@ -492,7 +502,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) { - attrs |= 1 << type; + attrs |= 1ULL << type; a[type] = nla; } } @@ -1597,6 +1607,17 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, attrs &= ~(1 << OVS_KEY_ATTR_IPV6); } + if (attrs & (1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS)) { + const struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key; + + ipv6_exthdrs_key = nla_data(a[OVS_KEY_ATTR_IPV6_EXTHDRS]); + + SW_FLOW_KEY_PUT(match, ipv6.exthdrs, + ipv6_exthdrs_key->hdrs, is_mask); + + attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS); + } + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { const struct ovs_key_arp *arp_key; @@ -2099,6 +2120,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv4_key->ipv4_frag = output->ip.frag; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { struct ovs_key_ipv6 *ipv6_key; + struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key; nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); if (!nla) @@ -2113,6 +2135,13 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv6_key->ipv6_tclass = output->ip.tos; ipv6_key->ipv6_hlimit = output->ip.ttl; ipv6_key->ipv6_frag = output->ip.frag; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6_EXTHDRS, + sizeof(*ipv6_exthdrs_key)); + if (!nla) + goto nla_put_failure; + ipv6_exthdrs_key = nla_data(nla); + ipv6_exthdrs_key->hdrs = output->ipv6.exthdrs; } else if (swkey->eth.type == htons(ETH_P_NSH)) { if (nsh_key_to_nlattr(&output->nsh, is_mask, skb)) goto nla_put_failure; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index cf2ce5812489..82a74f998966 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -507,7 +507,7 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) } skb->dev = vport->dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); vport->ops->send(skb); return; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ab87f22cc7ec..1b93ce1a5600 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -460,7 +460,7 @@ static __u32 
tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, return TP_STATUS_TS_RAW_HARDWARE; if ((flags & SOF_TIMESTAMPING_SOFTWARE) && - ktime_to_timespec64_cond(skb->tstamp, ts)) + ktime_to_timespec64_cond(skb_tstamp(skb), ts)) return TP_STATUS_TS_SOFTWARE; return 0; @@ -2199,6 +2199,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; sock_skb_set_dropcount(sk, skb); + skb_clear_delivery_time(skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); @@ -2377,6 +2378,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; + skb_clear_delivery_time(copy_skb); __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 65218b7ce9f9..2b582da1e88c 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -146,7 +146,7 @@ EXPORT_SYMBOL(phonet_header_ops); * Prepends an ISI header and sends a datagram. */ static int pn_send(struct sk_buff *skb, struct net_device *dev, - u16 dst, u16 src, u8 res, u8 irq) + u16 dst, u16 src, u8 res) { struct phonethdr *ph; int err; @@ -182,7 +182,7 @@ static int pn_send(struct sk_buff *skb, struct net_device *dev, if (skb->pkt_type == PACKET_LOOPBACK) { skb_reset_mac_header(skb); skb_orphan(skb); - err = (irq ? netif_rx(skb) : netif_rx_ni(skb)) ? -ENOBUFS : 0; + err = netif_rx(skb) ? -ENOBUFS : 0; } else { err = dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len); @@ -214,7 +214,7 @@ static int pn_raw_send(const void *data, int len, struct net_device *dev, skb_reserve(skb, MAX_PHONET_HEADER); __skb_put(skb, len); skb_copy_to_linear_data(skb, data, len); - return pn_send(skb, dev, dst, src, res, 1); + return pn_send(skb, dev, dst, src, res); } /* @@ -269,7 +269,7 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb, if (!pn_addr(src)) src = pn_object(saddr, pn_obj(src)); - err = pn_send(skb, dev, dst, src, res, 0); + err = pn_send(skb, dev, dst, src, res); dev_put(dev); return err; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 32563cef85bf..4f51094da9da 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -274,7 +274,7 @@ static int tcf_action_offload_add_ex(struct tc_action *action, err = tc_setup_action(&fl_action->action, actions); if (err) { NL_SET_ERR_MSG_MOD(extack, - "Failed to setup tc actions for offload\n"); + "Failed to setup tc actions for offload"); goto fl_err; } @@ -1037,6 +1037,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, restart_act_graph: for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; + int repeat_ttl; if (jmp_prgcnt > 0) { jmp_prgcnt -= 1; @@ -1045,11 +1046,17 @@ restart_act_graph: if (tc_act_skip_sw(a->tcfa_flags)) continue; + + repeat_ttl = 32; repeat: ret = a->ops->act(skb, a, res); - if (ret == TC_ACT_REPEAT) - goto repeat; /* we need a ttl - JHS */ - + if (unlikely(ret == TC_ACT_REPEAT)) { + if (--repeat_ttl != 0) + goto repeat; + /* suspicious opcode, stop pipeline */ + net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n"); + return TC_ACT_OK; + } if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) { jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK; if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) { @@ -1439,6 +1446,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, continue; if (skip_sw 
!= tc_act_skip_sw(act->tcfa_flags) || skip_hw != tc_act_skip_hw(act->tcfa_flags)) { + NL_SET_ERR_MSG(extack, + "Mismatch between action and filter offload flags"); err = -EINVAL; goto err; } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index a77d8908e737..fea2d78b9ddc 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -53,6 +53,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(filter, skb); } + if (unlikely(!skb->tstamp && skb->mono_delivery_time)) + skb->mono_delivery_time = 0; if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 7108e71ce4db..89e46f66e3d9 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -355,6 +355,13 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params) } } +static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, + struct nf_conn_act_ct_ext *act_ct_ext, u8 dir) +{ + entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC; + entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir]; +} + static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, bool tcp) @@ -379,10 +386,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, act_ct_ext = nf_conn_act_ct_ext_find(ct); if (act_ct_ext) { - entry->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = - act_ct_ext->ifindex[IP_CT_DIR_ORIGINAL]; - entry->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = - act_ct_ext->ifindex[IP_CT_DIR_REPLY]; + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL); + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY); } err = flow_offload_add(&ct_ft->nf_ft, entry); @@ -527,11 +532,6 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, struct nf_conn *ct; u8 dir; - /* Previously seen or loopback */ - ct = nf_ct_get(skb, &ctinfo); - if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) - return false; - switch (family) { case NFPROTO_IPV4: if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph)) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0923aa2b8f8a..f4d917705263 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -239,6 +239,20 @@ release_idr: return err; } +static bool tcf_police_mtu_check(struct sk_buff *skb, u32 limit) +{ + u32 len; + + if (skb_is_gso(skb)) + return skb_gso_validate_mac_len(skb, limit); + + len = qdisc_pkt_len(skb); + if (skb_at_tc_ingress(skb)) + len += skb->mac_len; + + return len <= limit; +} + static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -261,7 +275,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, goto inc_overlimits; } - if (qdisc_pkt_len(skb) <= p->tcfp_mtu) { + if (tcf_police_mtu_check(skb, p->tcfp_mtu)) { if (!p->rate_present && !p->pps_present) { ret = p->tcfp_result; goto end; @@ -405,20 +419,66 @@ static int tcf_police_search(struct net *net, struct tc_action **a, u32 index) return tcf_idr_search(tn, a, index); } +static int tcf_police_act_to_flow_act(int tc_act, u32 *extval) +{ + int act_id = -EOPNOTSUPP; + + if (!TC_ACT_EXT_OPCODE(tc_act)) { + if (tc_act == TC_ACT_OK) + act_id = FLOW_ACTION_ACCEPT; + else if (tc_act == TC_ACT_SHOT) + act_id = FLOW_ACTION_DROP; + else if (tc_act == TC_ACT_PIPE) + act_id = FLOW_ACTION_PIPE; + } else if (TC_ACT_EXT_CMP(tc_act, TC_ACT_GOTO_CHAIN)) { + act_id = FLOW_ACTION_GOTO; + *extval = 
tc_act & TC_ACT_EXT_VAL_MASK; + } else if (TC_ACT_EXT_CMP(tc_act, TC_ACT_JUMP)) { + act_id = FLOW_ACTION_JUMP; + *extval = tc_act & TC_ACT_EXT_VAL_MASK; + } + + return act_id; +} + static int tcf_police_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind) { if (bind) { struct flow_action_entry *entry = entry_data; + struct tcf_police *police = to_police(act); + struct tcf_police_params *p; + int act_id; + + p = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); entry->id = FLOW_ACTION_POLICE; entry->police.burst = tcf_police_burst(act); entry->police.rate_bytes_ps = tcf_police_rate_bytes_ps(act); + entry->police.peakrate_bytes_ps = tcf_police_peakrate_bytes_ps(act); + entry->police.avrate = tcf_police_tcfp_ewma_rate(act); + entry->police.overhead = tcf_police_rate_overhead(act); entry->police.burst_pkt = tcf_police_burst_pkt(act); entry->police.rate_pkt_ps = tcf_police_rate_pkt_ps(act); entry->police.mtu = tcf_police_tcfp_mtu(act); + + act_id = tcf_police_act_to_flow_act(police->tcf_action, + &entry->police.exceed.extval); + if (act_id < 0) + return act_id; + + entry->police.exceed.act_id = act_id; + + act_id = tcf_police_act_to_flow_act(p->tcfp_result, + &entry->police.notexceed.extval); + if (act_id < 0) + return act_id; + + entry->police.notexceed.act_id = act_id; + *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ff1e6b474fef..2957f8f5cea7 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1061,7 +1061,7 @@ static int __tcf_qdisc_find(struct net *net, struct Qdisc **q, /* Find qdisc */ if (!*parent) { - *q = dev->qdisc; + *q = rcu_dereference(dev->qdisc); *parent = (*q)->handle; } else { *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); @@ -2606,7 +2606,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) parent = tcm->tcm_parent; if (!parent) - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) @@ -2981,7 +2981,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; if (!tcm->tcm_parent) - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index df19a847829e..c85b85a192bf 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -102,6 +102,8 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); } + if (unlikely(!skb->tstamp && skb->mono_delivery_time)) + skb->mono_delivery_time = 0; if (prog->exts_integrated) { res->class = 0; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 179825a3b2fd..e3c0e8ea2dbb 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -301,7 +301,7 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); if (q) goto out; @@ -320,7 +320,7 @@ struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); if (q) goto out; @@ -1082,10 +1082,10 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, skip: if (!ingress) { 
notify_and_destroy(net, skb, n, classid, - dev->qdisc, new); + rtnl_dereference(dev->qdisc), new); if (new && !new->ops->attach) qdisc_refcount_inc(new); - dev->qdisc = new ? : &noop_qdisc; + rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); if (new && new->ops->attach) new->ops->attach(new); @@ -1451,7 +1451,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } if (!q) { NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); @@ -1540,7 +1540,7 @@ replay: q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } /* It may be default qdisc, ignore it */ @@ -1762,7 +1762,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; - if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, + if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), + skb, cb, &q_idx, s_q_idx, true, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; @@ -2033,7 +2034,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, } else if (qid1) { qid = qid1; } else if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; /* Now qid is genuine qdisc handle consistent * both with parent and child. @@ -2044,7 +2045,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; } /* OK. Locate qdisc */ @@ -2205,7 +2206,8 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; t = 0; - if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0) + if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc), + skb, tcm, cb, &t, s_t, true) < 0) goto done; dev_queue = dev_ingress_queue(dev); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f893d9a81b01..5bab9f8b8f45 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1164,30 +1164,33 @@ static void attach_default_qdiscs(struct net_device *dev) if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - dev->qdisc = txq->qdisc_sleeping; - qdisc_refcount_inc(dev->qdisc); + qdisc = txq->qdisc_sleeping; + rcu_assign_pointer(dev->qdisc, qdisc); + qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { - dev->qdisc = qdisc; + rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } + qdisc = rtnl_dereference(dev->qdisc); /* Detect default qdisc setup/init failed and fallback to "noqueue" */ - if (dev->qdisc == &noop_qdisc) { + if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - dev->qdisc = txq->qdisc_sleeping; - qdisc_refcount_inc(dev->qdisc); + qdisc = txq->qdisc_sleeping; + rcu_assign_pointer(dev->qdisc, qdisc); + qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED - if (dev->qdisc != &noop_qdisc) - qdisc_hash_add(dev->qdisc, false); + if (qdisc != &noop_qdisc) + qdisc_hash_add(qdisc, false); #endif } @@ -1217,7 +1220,7 @@ void dev_activate(struct net_device *dev) * and noqueue_qdisc for virtual interfaces */ - if (dev->qdisc == 
&noop_qdisc) + if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) @@ -1383,7 +1386,7 @@ static int qdisc_change_tx_queue_len(struct net_device *dev, void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { - struct Qdisc *qdisc = dev->qdisc; + struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); @@ -1447,7 +1450,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, void dev_init_scheduler(struct net_device *dev) { - dev->qdisc = &noop_qdisc; + rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); @@ -1475,8 +1478,8 @@ void dev_shutdown(struct net_device *dev) netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - qdisc_put(dev->qdisc); - dev->qdisc = &noop_qdisc; + qdisc_put(rtnl_dereference(dev->qdisc)); + rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 034e2c74497d..d9c6d8f30f09 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -61,10 +61,6 @@ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; r->idiag_retrans = asoc->rtx_data_chunks; r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies); - } else { - r->idiag_timer = 0; - r->idiag_retrans = 0; - r->idiag_expires = 0; } } @@ -144,13 +140,14 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, r = nlmsg_data(nlh); BUG_ON(!sk_fullsock(sk)); + r->idiag_timer = 0; + r->idiag_retrans = 0; + r->idiag_expires = 0; if (asoc) { inet_diag_msg_sctpasoc_fill(r, sk, asoc); } else { inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; - r->idiag_timer = 0; - r->idiag_retrans = 0; } if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) diff --git a/net/smc/Makefile b/net/smc/Makefile index 196fb6f01b14..875efcd126a2 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o +smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 246c874de629..f0d118e9f155 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -51,6 +51,7 @@ #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" +#include "smc_sysctl.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -192,12 +193,27 @@ void smc_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(smc_unhash_sk); +/* This will be called before user really release sock_lock. 
So do the + * work which we didn't do because of user hold the sock_lock in the + * BH context + */ +static void smc_release_cb(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + if (smc->conn.tx_in_release_sock) { + smc_tx_pending(&smc->conn); + smc->conn.tx_in_release_sock = false; + } +} + struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -210,6 +226,7 @@ struct proto smc_proto6 = { .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -268,7 +285,7 @@ static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = 0; + int old_state, rc = 0; if (!sk) goto out; @@ -276,8 +293,10 @@ static int smc_release(struct socket *sock) sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); + old_state = sk->sk_state; + /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); if (cancel_work_sync(&smc->connect_work)) @@ -291,6 +310,10 @@ static int smc_release(struct socket *sock) else lock_sock(sk); + if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && + !smc->use_fallback) + smc_close_active_abort(smc); + rc = __smc_release(smc); /* detach socket */ @@ -752,14 +775,17 @@ static void smc_fback_error_report(struct sock *clcsk) static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { struct sock *clcsk; + int rc = 0; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { - mutex_unlock(&smc->clcsock_release_lock); - return -EBADF; + rc = -EBADF; + goto out; } clcsk = smc->clcsock->sk; + if (smc->use_fallback) + goto out; smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -787,8 +813,9 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); } +out: mutex_unlock(&smc->clcsock_release_lock); - return 0; + return rc; } /* fall back during connect */ @@ -1372,8 +1399,14 @@ static int __smc_connect(struct smc_sock *smc) /* perform CLC handshake */ rc = smc_connect_clc(smc, aclc2, ini); - if (rc) + if (rc) { + /* -EAGAIN on timeout, see tcp_recvmsg() */ + if (rc == -EAGAIN) { + rc = -ETIMEDOUT; + smc->sk.sk_err = ETIMEDOUT; + } goto vlan_cleanup; + } /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); @@ -2705,10 +2738,14 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); switch (optname) { case SMC_LIMIT_HS: - if (optlen < sizeof(int)) - return -EINVAL; - if (copy_from_sockptr(&val, optval, sizeof(int))) - return -EFAULT; + if (optlen < sizeof(int)) { + rc = -EINVAL; + break; + } + if (copy_from_sockptr(&val, optval, sizeof(int))) { + rc = -EFAULT; + break; + } smc->limit_smc_hs = !!val; rc = 0; @@ -2781,8 +2818,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_state != SMC_CLOSED) { if (val) { SMC_STAT_INC(smc, ndly_cnt); - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); + smc_tx_pending(&smc->conn); + 
cancel_delayed_work(&smc->conn.tx_work); } } break; @@ -3142,11 +3179,17 @@ unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { + int rc; + + rc = smc_sysctl_net_init(net); + if (rc) + return rc; return smc_pnet_net_init(net); } static void __net_exit smc_net_exit(struct net *net) { + smc_sysctl_net_exit(net); smc_pnet_net_exit(net); } @@ -3256,12 +3299,14 @@ static int __init smc_init(void) rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_sock; + goto out_ib; } static_branch_enable(&tcp_have_smc); return 0; +out_ib: + smc_ib_unregister_client(); out_sock: sock_unregister(PF_SMC); out_proto6: diff --git a/net/smc/smc.h b/net/smc/smc.h index a096d8af21a0..ea0620529ebe 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -29,6 +29,7 @@ #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ +#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ extern struct proto smc_proto; extern struct proto smc_proto6; @@ -192,6 +193,7 @@ struct smc_connection { * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ + atomic_t tx_pushing; /* nr_threads trying tx push */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ @@ -211,6 +213,10 @@ struct smc_connection { * data still pending */ char urg_rx_byte; /* urgent byte */ + bool tx_in_release_sock; + /* flush pending tx data in + * sock release_cb() + */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 9d5a97168969..5c731f27996e 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -48,9 +48,19 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && - unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) - wake_up(&conn->cdc_pend_tx_wq); + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { + /* If user owns the sock_lock, mark the connection need sending. 
+ * User context will later try to send when it release sock_lock + * in smc_release_cb() + */ + if (sock_owned_by_user(&smc->sk)) + conn->tx_in_release_sock = true; + else + smc_tx_pending(conn); + + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); + } WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); smc_tx_sndbuf_nonfull(smc); @@ -350,8 +360,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ if ((diff_cons && smc_tx_prepared_sends(conn)) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || - conn->local_rx_ctrl.prod_flags.urg_data_pending) - smc_tx_sndbuf_nonempty(conn); + conn->local_rx_ctrl.prod_flags.urg_data_pending) { + if (!sock_owned_by_user(&smc->sk)) + smc_tx_pending(conn); + else + conn->tx_in_release_sock = true; + } if (diff_cons && conn->urg_tx_pend && atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) { diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 29525d03b253..f40f6ed0fbdb 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1161,8 +1161,8 @@ void smc_conn_free(struct smc_connection *conn) cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { - smc_lgr_unregister_conn(conn); smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + smc_lgr_unregister_conn(conn); } if (!lgr->conns_num) @@ -1864,7 +1864,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || - lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { + (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && + !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; @@ -1988,7 +1989,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { - return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); + return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } /* map an rmb buf to a link */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index ff61b7b95875..7984f8883472 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -113,7 +113,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) pnettable = &sn->pnettable; /* remove table entry */ - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || @@ -131,7 +131,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) rc = 0; } } - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); /* if this is not the initial namespace, stop here */ if (net != &init_net) @@ -192,7 +192,7 @@ static int smc_pnet_add_by_ndev(struct net_device *ndev) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { @@ -206,7 +206,7 @@ static int smc_pnet_add_by_ndev(struct net_device *ndev) break; } } - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } @@ -224,7 +224,7 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); 
list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { dev_put_track(pnetelem->ndev, &pnetelem->dev_tracker); @@ -237,7 +237,7 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) break; } } - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } @@ -370,7 +370,7 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); rc = -EEXIST; new_netdev = true; - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_ETH && !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { @@ -385,9 +385,9 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, GFP_ATOMIC); } list_add_tail(&new_pe->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); } else { - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); kfree(new_pe); goto out_put; } @@ -448,7 +448,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, new_pe->ib_port = ib_port; new_ibdev = true; - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { @@ -458,9 +458,9 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, } if (new_ibdev) { list_add_tail(&new_pe->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); } else { - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); kfree(new_pe); } return (new_ibdev) ? 
0 : -EEXIST; @@ -605,7 +605,7 @@ static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, pnettable = &sn->pnettable; /* dump pnettable entries */ - read_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; @@ -620,7 +620,7 @@ static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, break; } } - read_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return idx; } @@ -864,7 +864,7 @@ int smc_pnet_net_init(struct net *net) struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev; INIT_LIST_HEAD(&pnettable->pnetlist); - rwlock_init(&pnettable->lock); + mutex_init(&pnettable->lock); INIT_LIST_HEAD(&pnetids_ndev->list); rwlock_init(&pnetids_ndev->lock); @@ -947,7 +947,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - read_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ @@ -956,7 +956,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, break; } } - read_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } @@ -1159,7 +1159,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; - read_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && @@ -1169,7 +1169,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) break; } } - read_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } @@ -1188,7 +1188,7 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; - read_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { @@ -1197,7 +1197,7 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) break; } } - read_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 14039272f7e4..80a88eea4949 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -29,7 +29,7 @@ struct smc_link_group; * @pnetlist: List of PNETIDs */ struct smc_pnettable { - rwlock_t lock; + struct mutex lock; struct list_head pnetlist; }; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c new file mode 100644 index 000000000000..bae19419e755 --- /dev/null +++ b/net/smc/smc_sysctl.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
+ * + * Author: Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#include <linux/init.h> +#include <linux/sysctl.h> +#include <net/net_namespace.h> + +#include "smc.h" +#include "smc_sysctl.h" + +static struct ctl_table smc_table[] = { + { + .procname = "autocorking_size", + .data = &init_net.smc.sysctl_autocorking_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { } +}; + +int __net_init smc_sysctl_net_init(struct net *net) +{ + struct ctl_table *table; + + table = smc_table; + if (!net_eq(net, &init_net)) { + int i; + + table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; + } + + net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); + if (!net->smc.smc_hdr) + goto err_reg; + + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +void __net_exit smc_sysctl_net_exit(struct net *net) +{ + unregister_net_sysctl_table(net->smc.smc_hdr); +} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h new file mode 100644 index 000000000000..0becc11bd2f4 --- /dev/null +++ b/net/smc/smc_sysctl.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. + * + * Author: Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#ifndef _SMC_SYSCTL_H +#define _SMC_SYSCTL_H + +#ifdef CONFIG_SYSCTL + +int __net_init smc_sysctl_net_init(struct net *net); +void __net_exit smc_sysctl_net_exit(struct net *net); + +#else + +static inline int smc_sysctl_net_init(struct net *net) +{ + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + return 0; +} + +static inline void smc_sysctl_net_exit(struct net *net) { } + +#endif /* CONFIG_SYSCTL */ + +#endif /* _SMC_SYSCTL_H */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index a96ce162825e..98ca9229fe87 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -131,6 +131,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc) return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } +/* If we have pending CDC messages, do not send: + * Because CQE of this CDC message will happen shortly, it gives + * a chance to coalesce future sendmsg() payload in to one RDMA Write, + * without need for a timer, and with no latency trade off. + * Algorithm here: + * 1. First message should never cork + * 2. If we have pending Tx CDC messages, wait for the first CDC + * message's completion + * 3. Don't cork to much data in a single RDMA Write to prevent burst + * traffic, total corked message should not exceed sendbuf/2 + */ +static bool smc_should_autocork(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + int corking_size; + + corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1, + sock_net(&smc->sk)->smc.sysctl_autocorking_size); + + if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || + smc_tx_prepared_sends(conn) > corking_size) + return false; + return true; +} + +static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_should_autocork(smc)) + return true; + + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. 
+ */ + if ((msg->msg_flags & MSG_MORE || + smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + atomic_read(&conn->sndbuf_space)) + return true; + + return false; +} + /* sndbuf producer: main API called by socket layer. * called under sock lock. */ @@ -235,15 +280,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || - msg->msg_flags & MSG_SENDPAGE_NOTLAST) && - (atomic_read(&conn->sndbuf_space))) - /* for a corked socket defer the RDMA writes if - * sndbuf_space is still available. The applications - * should known how/when to uncork it. - */ - continue; - smc_tx_sndbuf_nonempty(conn); + /* If we need to cork, do nothing and wait for the next + * sendmsg() call or push on tx completion + */ + if (!smc_tx_should_cork(smc, msg)) + smc_tx_sndbuf_nonempty(conn); trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ @@ -590,13 +631,26 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc; + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + int rc = 0; + + /* No data in the send queue */ + if (unlikely(smc_tx_prepared_sends(conn) <= 0)) + goto out; + + /* Peer don't have RMBE space */ + if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); + goto out; + } if (conn->killed || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) - return -EPIPE; /* connection being aborted */ + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -EPIPE; /* connection being aborted */ + goto out; + } if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -604,13 +658,45 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) if (!rc) { /* trigger socket release if connection is closing */ - struct smc_sock *smc = container_of(conn, struct smc_sock, - conn); smc_close_wake_tx_prepared(smc); } + +out: + return rc; +} + +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + /* This make sure only one can send simultaneously to prevent wasting + * of CPU and CDC slot. + * Record whether someone has tried to push while we are pushing. + */ + if (atomic_inc_return(&conn->tx_pushing) > 1) + return 0; + +again: + atomic_set(&conn->tx_pushing, 1); + smp_wmb(); /* Make sure tx_pushing is 1 before real send */ + rc = __smc_tx_sndbuf_nonempty(conn); + + /* We need to check whether someone else have added some data into + * the send queue and tried to push but failed after the atomic_set() + * when we are pushing. + * If so, we need to push again to prevent those data hang in the send + * queue. + */ + if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) + goto again; + return rc; } +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit. The caller + * must hold sock lock. + */ void smc_tx_pending(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -626,7 +712,8 @@ void smc_tx_pending(struct smc_connection *conn) } /* Wakeup sndbuf consumers from process context - * since there is more data to transmit + * since there is more data to transmit in locked + * sock. 
*/ void smc_tx_work(struct work_struct *work) { diff --git a/net/socket.c b/net/socket.c index 50cf75730fd7..982eecad464c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3448,7 +3448,7 @@ EXPORT_SYMBOL(kernel_connect); * @addr: address holder * * Fills the @addr pointer with the address which the socket is bound. - * Returns 0 or an error code. + * Returns the length of the address in bytes or an error code. */ int kernel_getsockname(struct socket *sock, struct sockaddr *addr) @@ -3463,7 +3463,7 @@ EXPORT_SYMBOL(kernel_getsockname); * @addr: address holder * * Fills the @addr pointer with the address which the socket is connected. - * Returns 0 or an error code. + * Returns the length of the address in bytes or an error code. */ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 12e6b4146bfb..474f76383033 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -409,6 +409,27 @@ static int switchdev_lower_dev_walk(struct net_device *lower_dev, } static struct net_device * +switchdev_lower_dev_find_rcu(struct net_device *dev, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev)) +{ + struct switchdev_nested_priv switchdev_priv = { + .check_cb = check_cb, + .foreign_dev_check_cb = foreign_dev_check_cb, + .dev = dev, + .lower_dev = NULL, + }; + struct netdev_nested_priv priv = { + .data = &switchdev_priv, + }; + + netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv); + + return switchdev_priv.lower_dev; +} + +static struct net_device * switchdev_lower_dev_find(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, @@ -424,7 +445,7 @@ switchdev_lower_dev_find(struct net_device *dev, .data = &switchdev_priv, }; - netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv); + netdev_walk_all_lower_dev(dev, switchdev_lower_dev_walk, &priv); return switchdev_priv.lower_dev; } @@ -437,63 +458,40 @@ static int __switchdev_handle_fdb_event_to_device(struct net_device *dev, const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info), - int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev, - unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info)) + const struct switchdev_notifier_fdb_info *fdb_info)) { const struct switchdev_notifier_info *info = &fdb_info->info; - struct net_device *br, *lower_dev; + struct net_device *br, *lower_dev, *switchdev; struct list_head *iter; int err = -EOPNOTSUPP; if (check_cb(dev)) return mod_cb(dev, orig_dev, event, info->ctx, fdb_info); - if (netif_is_lag_master(dev)) { - if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) - goto maybe_bridged_with_us; - - /* This is a LAG interface that we offload */ - if (!lag_mod_cb) - return -EOPNOTSUPP; - - return lag_mod_cb(dev, orig_dev, event, info->ctx, fdb_info); - } - /* Recurse through lower interfaces in case the FDB entry is pointing - * towards a bridge device. + * towards a bridge or a LAG device. 
*/ - if (netif_is_bridge_master(dev)) { - if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) - return 0; - - /* This is a bridge interface that we offload */ - netdev_for_each_lower_dev(dev, lower_dev, iter) { - /* Do not propagate FDB entries across bridges */ - if (netif_is_bridge_master(lower_dev)) - continue; - - /* Bridge ports might be either us, or LAG interfaces - * that we offload. - */ - if (!check_cb(lower_dev) && - !switchdev_lower_dev_find(lower_dev, check_cb, - foreign_dev_check_cb)) - continue; - - err = __switchdev_handle_fdb_event_to_device(lower_dev, orig_dev, - event, fdb_info, check_cb, - foreign_dev_check_cb, - mod_cb, lag_mod_cb); - if (err && err != -EOPNOTSUPP) - return err; - } + netdev_for_each_lower_dev(dev, lower_dev, iter) { + /* Do not propagate FDB entries across bridges */ + if (netif_is_bridge_master(lower_dev)) + continue; - return 0; + /* Bridge ports might be either us, or LAG interfaces + * that we offload. + */ + if (!check_cb(lower_dev) && + !switchdev_lower_dev_find_rcu(lower_dev, check_cb, + foreign_dev_check_cb)) + continue; + + err = __switchdev_handle_fdb_event_to_device(lower_dev, orig_dev, + event, fdb_info, check_cb, + foreign_dev_check_cb, + mod_cb); + if (err && err != -EOPNOTSUPP) + return err; } -maybe_bridged_with_us: /* Event is neither on a bridge nor a LAG. Check whether it is on an * interface that is in a bridge with us. */ @@ -501,12 +499,16 @@ maybe_bridged_with_us: if (!br || !netif_is_bridge_master(br)) return 0; - if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb)) + switchdev = switchdev_lower_dev_find_rcu(br, check_cb, foreign_dev_check_cb); + if (!switchdev) return 0; + if (!foreign_dev_check_cb(switchdev, dev)) + return err; + return __switchdev_handle_fdb_event_to_device(br, orig_dev, event, fdb_info, check_cb, foreign_dev_check_cb, - mod_cb, lag_mod_cb); + mod_cb); } int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event, @@ -516,16 +518,13 @@ int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long e const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info), - int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev, - unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info)) + const struct switchdev_notifier_fdb_info *fdb_info)) { int err; err = __switchdev_handle_fdb_event_to_device(dev, dev, event, fdb_info, check_cb, foreign_dev_check_cb, - mod_cb, lag_mod_cb); + mod_cb); if (err == -EOPNOTSUPP) err = 0; @@ -536,13 +535,15 @@ EXPORT_SYMBOL_GPL(switchdev_handle_fdb_event_to_device); static int __switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { struct switchdev_notifier_info *info = &port_obj_info->info; + struct net_device *br, *lower_dev, *switchdev; struct netlink_ext_ack *extack; - struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; @@ -566,15 +567,46 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev, if (netif_is_bridge_master(lower_dev)) continue; + /* When searching for 
switchdev interfaces that are neighbors + * of foreign ones, and @dev is a bridge, do not recurse on the + * foreign interface again, it was already visited. + */ + if (foreign_dev_check_cb && !check_cb(lower_dev) && + !switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb)) + continue; + err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info, - check_cb, add_cb); + check_cb, foreign_dev_check_cb, + add_cb); if (err && err != -EOPNOTSUPP) return err; } - return err; + /* Event is neither on a bridge nor a LAG. Check whether it is on an + * interface that is in a bridge with us. + */ + if (!foreign_dev_check_cb) + return err; + + br = netdev_master_upper_dev_get(dev); + if (!br || !netif_is_bridge_master(br)) + return err; + + switchdev = switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb); + if (!switchdev) + return err; + + if (!foreign_dev_check_cb(switchdev, dev)) + return err; + + return __switchdev_handle_port_obj_add(br, port_obj_info, check_cb, + foreign_dev_check_cb, add_cb); } +/* Pass through a port object addition, if @dev passes @check_cb, or replicate + * it towards all lower interfaces of @dev that pass @check_cb, if @dev is a + * bridge or a LAG. + */ int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), @@ -585,21 +617,46 @@ int switchdev_handle_port_obj_add(struct net_device *dev, int err; err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb, - add_cb); + NULL, add_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add); +/* Same as switchdev_handle_port_obj_add(), except if object is notified on a + * @dev that passes @foreign_dev_check_cb, it is replicated towards all devices + * that pass @check_cb and are in the same bridge as @dev. + */ +int switchdev_handle_port_obj_add_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*add_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack)) +{ + int err; + + err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb, + foreign_dev_check_cb, add_cb); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add_foreign); + static int __switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { struct switchdev_notifier_info *info = &port_obj_info->info; - struct net_device *lower_dev; + struct net_device *br, *lower_dev, *switchdev; struct list_head *iter; int err = -EOPNOTSUPP; @@ -621,15 +678,46 @@ static int __switchdev_handle_port_obj_del(struct net_device *dev, if (netif_is_bridge_master(lower_dev)) continue; + /* When searching for switchdev interfaces that are neighbors + * of foreign ones, and @dev is a bridge, do not recurse on the + * foreign interface again, it was already visited. 
+ */ + if (foreign_dev_check_cb && !check_cb(lower_dev) && + !switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb)) + continue; + err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info, - check_cb, del_cb); + check_cb, foreign_dev_check_cb, + del_cb); if (err && err != -EOPNOTSUPP) return err; } - return err; + /* Event is neither on a bridge nor a LAG. Check whether it is on an + * interface that is in a bridge with us. + */ + if (!foreign_dev_check_cb) + return err; + + br = netdev_master_upper_dev_get(dev); + if (!br || !netif_is_bridge_master(br)) + return err; + + switchdev = switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb); + if (!switchdev) + return err; + + if (!foreign_dev_check_cb(switchdev, dev)) + return err; + + return __switchdev_handle_port_obj_del(br, port_obj_info, check_cb, + foreign_dev_check_cb, del_cb); } +/* Pass through a port object deletion, if @dev passes @check_cb, or replicate + * it towards all lower interfaces of @dev that pass @check_cb, if @dev is a + * bridge or a LAG. + */ int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), @@ -639,13 +727,35 @@ int switchdev_handle_port_obj_del(struct net_device *dev, int err; err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb, - del_cb); + NULL, del_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del); +/* Same as switchdev_handle_port_obj_del(), except if object is notified on a + * @dev that passes @foreign_dev_check_cb, it is replicated towards all devices + * that pass @check_cb and are in the same bridge as @dev. + */ +int switchdev_handle_port_obj_del_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*del_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj)) +{ + int err; + + err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb, + foreign_dev_check_cb, del_cb); + if (err == -EOPNOTSUPP) + err = 0; + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del_foreign); + static int __switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 473a790f5894..6d39ca05f249 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -352,16 +352,18 @@ static int tipc_enable_bearer(struct net *net, const char *name, goto rejected; } - test_and_set_bit_lock(0, &b->up); - rcu_assign_pointer(tn->bearer_list[bearer_id], b); - if (skb) - tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); - + /* Create monitoring data before accepting activate messages */ if (tipc_mon_create(net, bearer_id)) { bearer_disable(net, b); + kfree_skb(skb); return -ENOMEM; } + test_and_set_bit_lock(0, &b->up); + rcu_assign_pointer(tn->bearer_list[bearer_id], b); + if (skb) + tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); + pr_info("Enabled bearer <%s>, priority %u\n", name, prio); return res; @@ -768,7 +770,7 @@ void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts) skb->pkt_type = PACKET_HOST; skb->ip_summed = CHECKSUM_UNNECESSARY; skb->protocol = eth_type_trans(skb, dev); - netif_rx_ni(skb); + netif_rx(skb); } } 
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 9325479295b8..f09316a9035f 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -2276,7 +2276,7 @@ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; struct tipc_aead_key *skey = NULL; u16 key_gen = msg_key_gen(hdr); - u16 size = msg_data_sz(hdr); + u32 size = msg_data_sz(hdr); u8 *data = msg_data(hdr); unsigned int keylen; diff --git a/net/tipc/link.c b/net/tipc/link.c index 1e14d7f8f28f..e260c0d557f5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2286,6 +2286,11 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; case STATE_MSG: + /* Validate Gap ACK blocks, drop if invalid */ + glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); + if (glen > dlen) + break; + l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ @@ -2311,10 +2316,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; } - /* Receive Gap ACK blocks from peer if any */ - glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); - if(glen > dlen) - break; tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 01396dd1c899..1d8ba233d047 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -967,7 +967,7 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, list_for_each_entry(p, &sr->all_publ, all_publ) if (p->key == *last_key) break; - if (p->key != *last_key) + if (list_entry_is_head(p, &sr->all_publ, all_publ)) return -EPIPE; } else { p = list_first_entry(&sr->all_publ, diff --git a/net/tipc/node.c b/net/tipc/node.c index 9947b7dfe1d2..6ef95ce565bd 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -403,7 +403,7 @@ static void tipc_node_write_unlock(struct tipc_node *n) u32 flags = n->action_flags; struct list_head *publ_list; struct tipc_uaddr ua; - u32 bearer_id; + u32 bearer_id, node; if (likely(!flags)) { write_unlock_bh(&n->lock); @@ -413,7 +413,8 @@ static void tipc_node_write_unlock(struct tipc_node *n) tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, TIPC_LINK_STATE, n->addr, n->addr); sk.ref = n->link_id; - sk.node = n->addr; + sk.node = tipc_own_addr(net); + node = n->addr; bearer_id = n->link_id & 0xffff; publ_list = &n->publ_list; @@ -423,17 +424,17 @@ static void tipc_node_write_unlock(struct tipc_node *n) write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) - tipc_publ_notify(net, publ_list, sk.node, n->capabilities); + tipc_publ_notify(net, publ_list, node, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) - tipc_named_node_up(net, sk.node, n->capabilities); + tipc_named_node_up(net, node, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { - tipc_mon_peer_up(net, sk.node, bearer_id); + tipc_mon_peer_up(net, node, bearer_id); tipc_nametbl_publish(net, &ua, &sk, sk.ref); } if (flags & TIPC_NOTIFY_LINK_DOWN) { - tipc_mon_peer_down(net, sk.node, bearer_id); + tipc_mon_peer_down(net, node, bearer_id); tipc_nametbl_withdraw(net, &ua, &sk, sk.ref); } } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3e63c83e641c..7545321c3440 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3749,7 +3749,7 @@ static int __tipc_nl_list_sk_publ(struct sk_buff *skb, if (p->key == *last_publ) break; } - if (p->key != *last_publ) { + if (list_entry_is_head(p, &tsk->publications, binding_sock)) { /* We never set seq or call 
nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 3235261f138d..38baeb189d4e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1401,6 +1401,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); + vsock_remove_connected(vsk); goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 1e9be50469ce..527ae669f6f7 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -33,7 +33,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ ) > $@ -$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDI) \ +$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR)/*.x509) @$(kecho) " GEN $@" $(Q)(set -e; \ diff --git a/net/wireless/core.c b/net/wireless/core.c index 3a54c8e6b6c6..f08d4b3bb148 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -332,29 +332,20 @@ static void cfg80211_event_work(struct work_struct *work) void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev, *tmp; - bool found = false; ASSERT_RTNL(); - list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { + list_for_each_entry_safe(wdev, tmp, &rdev->wiphy.wdev_list, list) { if (wdev->nl_owner_dead) { if (wdev->netdev) dev_close(wdev->netdev); - found = true; - } - } - - if (!found) - return; - wiphy_lock(&rdev->wiphy); - list_for_each_entry_safe(wdev, tmp, &rdev->wiphy.wdev_list, list) { - if (wdev->nl_owner_dead) { + wiphy_lock(&rdev->wiphy); cfg80211_leave(rdev, wdev); rdev_del_virtual_intf(rdev, wdev); + wiphy_unlock(&rdev->wiphy); } } - wiphy_unlock(&rdev->wiphy); } static void cfg80211_destroy_iface_wk(struct work_struct *work) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7543c73a3f1d..ee1c2b6b6971 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13504,6 +13504,9 @@ static int handle_nan_filter(struct nlattr *attr_filter, i = 0; nla_for_each_nested(attr, attr_filter, rem) { filter[i].filter = nla_memdup(attr, GFP_KERNEL); + if (!filter[i].filter) + goto err; + filter[i].len = nla_len(attr); i++; } @@ -13516,6 +13519,15 @@ static int handle_nan_filter(struct nlattr *attr_filter, } return 0; + +err: + i = 0; + nla_for_each_nested(attr, attr_filter, rem) { + kfree(filter[i].filter); + i++; + } + kfree(filter); + return -ENOMEM; } static int nl80211_nan_add_func(struct sk_buff *skb, @@ -17909,7 +17921,8 @@ void cfg80211_ch_switch_notify(struct net_device *dev, wdev->chandef = *chandef; wdev->preset_chandef = *chandef; - if (wdev->iftype == NL80211_IFTYPE_STATION && + if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && !WARN_ON(!wdev->current_bss)) cfg80211_update_assoc_bss_entry(wdev, chandef->chan); diff --git a/net/wireless/util.c 
b/net/wireless/util.c index 2eda097aee7f..a60d7d638e72 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2284,7 +2284,7 @@ void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr) skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); - netif_rx_ni(skb); + netif_rx(skb); } EXPORT_SYMBOL(cfg80211_send_layer2_update); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3fa066419d37..39bce5d764de 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -223,6 +223,9 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (x->encap || x->tfcpad) return -EINVAL; + if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) + return -EINVAL; + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -262,7 +265,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); xso->real_dev = dev; xso->num_exthdrs = 1; - xso->flags = xuo->flags; + /* Don't forward bit that is not implemented */ + xso->flags = xuo->flags & ~XFRM_OFFLOAD_IPV6; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 57448fc519fc..5113fa0fbcee 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -190,7 +190,7 @@ static void xfrmi_dev_uninit(struct net_device *dev) static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { - skb->tstamp = 0; + skb_clear_tstamp(skb); skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; @@ -304,7 +304,10 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + if (skb->len > 1280) + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + else + goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; @@ -673,12 +676,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct net *net = xi->net; struct xfrm_if_parms p = {}; + xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } - xfrmi_netlink_parms(data, &p); xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 04d1ce9b510f..882526159d3a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4256,7 +4256,7 @@ static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, } static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, - u8 dir, u8 type, struct net *net) + u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; @@ -4265,7 +4265,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; priority = ret->priority; @@ -4277,7 +4278,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * if ((pol->priority >= priority) && ret) break; - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || 
pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; break; @@ -4393,7 +4395,7 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap) + struct xfrm_encap_tmpl *encap, u32 if_id) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4412,14 +4414,14 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 1 - find policy */ - if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) { + if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) { err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { - if ((x = xfrm_migrate_state_find(mp, net))) { + if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ca6bee18346d..b749935152ba 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1579,9 +1579,6 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, memcpy(&x->mark, &orig->mark, sizeof(x->mark)); memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); - if (xfrm_init_state(x) < 0) - goto error; - x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; @@ -1606,7 +1603,8 @@ out: return NULL; } -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net) +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id) { unsigned int h; struct xfrm_state *x = NULL; @@ -1622,6 +1620,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n continue; if (m->reqid && x->props.reqid != m->reqid) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, @@ -1637,6 +1637,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n if (x->props.mode != m->mode || x->id.proto != m->proto) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, @@ -1663,6 +1665,11 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, if (!xc) return NULL; + xc->props.family = m->new_family; + + if (xfrm_init_state(xc) < 0) + goto error; + memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); @@ -2572,7 +2579,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; @@ -2603,17 +2610,7 @@ u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } -EXPORT_SYMBOL_GPL(__xfrm_state_mtu); - -u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) -{ - mtu = __xfrm_state_mtu(x, mtu); - - if (x->props.family == AF_INET6 && mtu < IPV6_MIN_MTU) - return IPV6_MIN_MTU; - - return mtu; -} 
+EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8cd6c8129004..72b2f173aac8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -630,13 +630,8 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_smark_init(attrs, &x->props.smark); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!x->if_id) { - err = -EINVAL; - goto error; - } - } err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) @@ -1432,13 +1427,8 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, mark = xfrm_mark_get(attrs, &m); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!if_id) { - err = -EINVAL; - goto out_noput; - } - } if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); @@ -1751,13 +1741,8 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us xfrm_mark_get(attrs, &xp->mark); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!xp->if_id) { - err = -EINVAL; - goto error; - } - } return xp; error: @@ -2608,6 +2593,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); struct xfrm_encap_tmpl *encap = NULL; + u32 if_id = 0; if (attrs[XFRMA_MIGRATE] == NULL) return -EINVAL; @@ -2632,7 +2618,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOMEM; } - err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap); + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + + err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, if_id); kfree(encap); |
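The xfrm changes above thread a new if_id argument, taken from the optional XFRMA_IF_ID netlink attribute, through xfrm_migrate(), xfrm_migrate_policy_find() and xfrm_migrate_state_find(), so that a migration request can be restricted to a single XFRM interface. The following helper is illustrative only (not from the patch) and just captures the matching rule these lookups now apply:

/* Illustrative only: the if_id matching rule used by the migrate lookups.
 * An if_id of 0 (attribute absent) means "do not filter on interface ID";
 * a non-zero if_id must match the policy's or state's if_id exactly.
 */
static bool xfrm_migrate_if_id_match(u32 entry_if_id, u32 requested_if_id)
{
	return requested_if_id == 0 || entry_if_id == requested_if_id;
}

The companion xfrm_user.c hunks drop the earlier "if_id must be non-zero" validation when parsing XFRMA_IF_ID, so a value of 0 keeps its meaning of "no interface ID set" rather than being rejected with -EINVAL.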