Diffstat (limited to 'net')
292 files changed, 8646 insertions, 3681 deletions
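A large share of the hunks below replace direct writes to dev->dev_addr (via memcpy() or ether_addr_copy()) with the dev_addr_set()/eth_hw_addr_set() helpers, so that the core tracks every address update and dev->dev_addr can eventually be treated as const. As a minimal sketch of the resulting pattern, assuming a hypothetical driver's .ndo_set_mac_address callback (the function name is illustrative, not part of this series):

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

/* Hypothetical example of the helper pattern used throughout the hunks
 * below: eth_hw_addr_set() copies the new address into dev->dev_addr
 * instead of the driver writing the field directly.
 */
static int example_set_mac_address(struct net_device *dev, void *p)
{
        struct sockaddr *addr = p;

        if (!is_valid_ether_addr(addr->sa_data))
                return -EADDRNOTAVAIL;

        eth_hw_addr_set(dev, addr->sa_data);    /* was: memcpy(dev->dev_addr, ...) */
        return 0;
}
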
diff --git a/net/802/hippi.c b/net/802/hippi.c index f80b33a8f7e0..887e73d520e4 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -121,7 +121,7 @@ int hippi_mac_addr(struct net_device *dev, void *p) struct sockaddr *addr = p; if (netif_running(dev)) return -EBUSY; - memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + dev_addr_set(dev, addr->sa_data); return 0; } EXPORT_SYMBOL(hippi_mac_addr); diff --git a/net/802/p8022.c b/net/802/p8022.c index a6585627051d..79c23173116c 100644 --- a/net/802/p8022.c +++ b/net/802/p8022.c @@ -23,7 +23,7 @@ #include <net/p8022.h> static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, - unsigned char *dest) + const unsigned char *dest) { llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap); return 0; diff --git a/net/802/psnap.c b/net/802/psnap.c index 4492e8d7ad20..1406bfdbda13 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -79,7 +79,7 @@ drop: * Put a SNAP header on a frame and pass to 802.2 */ static int snap_request(struct datalink_proto *dl, - struct sk_buff *skb, u8 *dest) + struct sk_buff *skb, const u8 *dest) { memcpy(skb_push(skb, 5), dl->type, 5); llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 0c21d1fec852..90330b893134 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -250,7 +250,7 @@ bool vlan_dev_inherit_address(struct net_device *dev, if (dev->addr_assign_type != NET_ADDR_STOLEN) return false; - ether_addr_copy(dev->dev_addr, real_dev->dev_addr); + eth_hw_addr_set(dev, real_dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return true; } @@ -349,7 +349,7 @@ static int vlan_dev_set_mac_address(struct net_device *dev, void *p) dev_uc_del(real_dev, dev->dev_addr); out: - ether_addr_copy(dev->dev_addr, addr->sa_data); + eth_hw_addr_set(dev, addr->sa_data); return 0; } @@ -586,7 +586,7 @@ static int vlan_dev_init(struct net_device *dev) dev->dev_id = real_dev->dev_id; if (is_zero_ether_addr(dev->dev_addr)) { - ether_addr_copy(dev->dev_addr, real_dev->dev_addr); + eth_hw_addr_set(dev, real_dev->dev_addr); dev->addr_assign_type = NET_ADDR_STOLEN; } if (is_zero_ether_addr(dev->broadcast)) diff --git a/net/Kconfig b/net/Kconfig index fb13460c6dab..074472dfa94a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -294,7 +294,7 @@ config CGROUP_NET_CLASSID config NET_RX_BUSY_POLL bool - default y + default y if !PREEMPT_RT config BQL bool diff --git a/net/atm/br2684.c b/net/atm/br2684.c index dd2a8dabed84..11854fde52db 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -578,7 +578,7 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) { unsigned char *esi = atmvcc->dev->esi; if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5]) - memcpy(net_dev->dev_addr, esi, net_dev->addr_len); + dev_addr_set(net_dev, esi); else net_dev->dev_addr[2] = 1; } diff --git a/net/atm/lec.c b/net/atm/lec.c index 7226c784dbe0..8eaea4a4bbd6 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -355,8 +355,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("%s: msg from zeppelin:%d\n", dev->name, mesg->type); switch (mesg->type) { case l_set_mac_addr: - for (i = 0; i < 6; i++) - dev->dev_addr[i] = mesg->content.normal.mac_addr[i]; + eth_hw_addr_set(dev, mesg->content.normal.mac_addr); break; case l_del_mac_addr: for (i = 0; i < 6; i++) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 2631efc6e359..2f34bbdde0e8 
100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -202,7 +202,7 @@ struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr, * Find an AX.25 control block given both ends. It will only pick up * floating AX.25 control blocks or non Raw socket bound control blocks. */ -ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr, +ax25_cb *ax25_find_cb(const ax25_address *src_addr, ax25_address *dest_addr, ax25_digi *digi, struct net_device *dev) { ax25_cb *s; diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 4ac2e0847652..d0a043a51848 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -35,7 +35,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) spin_lock_bh(&ax25_dev_lock); for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) - if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) { + if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; } spin_unlock_bh(&ax25_dev_lock); diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c index b4083f30af0d..979bc4b828a0 100644 --- a/net/ax25/ax25_iface.c +++ b/net/ax25/ax25_iface.c @@ -98,7 +98,7 @@ void ax25_linkfail_release(struct ax25_linkfail *lf) EXPORT_SYMBOL(ax25_linkfail_release); -int ax25_listen_register(ax25_address *callsign, struct net_device *dev) +int ax25_listen_register(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; @@ -121,7 +121,7 @@ int ax25_listen_register(ax25_address *callsign, struct net_device *dev) EXPORT_SYMBOL(ax25_listen_register); -void ax25_listen_release(ax25_address *callsign, struct net_device *dev) +void ax25_listen_release(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *s, *listen; @@ -171,7 +171,7 @@ int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *) return res; } -int ax25_listen_mine(ax25_address *callsign, struct net_device *dev) +int ax25_listen_mine(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index cd6afe895db9..1cac25aca637 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -181,7 +181,7 @@ static int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type, i } static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, - ax25_address *dev_addr, struct packet_type *ptype) + const ax25_address *dev_addr, struct packet_type *ptype) { ax25_address src, dest, *next_digi = NULL; int type = 0, mine = 0, dama; @@ -447,5 +447,5 @@ int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, skb_pull(skb, AX25_KISS_HEADER_LEN); /* Remove the KISS byte */ - return ax25_rcv(skb, dev, (ax25_address *)dev->dev_addr, ptype); + return ax25_rcv(skb, dev, (const ax25_address *)dev->dev_addr, ptype); } diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 22f2f66c6e0a..3db76d2470e9 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -29,7 +29,7 @@ static DEFINE_SPINLOCK(ax25_frag_lock); -ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev) +ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev) { ax25_dev *ax25_dev; ax25_cb *ax25; diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 1669744304c5..7242b32fff80 100644 --- 
a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -254,7 +254,7 @@ batadv_claim_hash_find(struct batadv_priv *bat_priv, * Return: backbone gateway if found or NULL otherwise */ static struct batadv_bla_backbone_gw * -batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, +batadv_backbone_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; @@ -336,7 +336,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) * @vid: the VLAN ID * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...) */ -static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, +static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, unsigned short vid, int claimtype) { struct sk_buff *skb; @@ -488,7 +488,7 @@ static void batadv_bla_loopdetect_report(struct work_struct *work) * Return: the (possibly created) backbone gateway or NULL on error */ static struct batadv_bla_backbone_gw * -batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, +batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, unsigned short vid, bool own_backbone) { struct batadv_bla_backbone_gw *entry; @@ -926,7 +926,7 @@ static bool batadv_handle_request(struct batadv_priv *bat_priv, */ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - u8 *backbone_addr, u8 *claim_addr, + const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -964,7 +964,7 @@ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, */ static bool batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - u8 *backbone_addr, u8 *claim_addr, + const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -2126,7 +2126,7 @@ batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, struct batadv_hard_iface *primary_if, struct batadv_bla_claim *claim) { - u8 *primary_addr = primary_if->net_dev->dev_addr; + const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; void *hdr; @@ -2294,7 +2294,7 @@ batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, struct batadv_hard_iface *primary_if, struct batadv_bla_backbone_gw *backbone_gw) { - u8 *primary_addr = primary_if->net_dev->dev_addr; + const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; int msecs; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index a3b6658ed789..433901dcf0c3 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -89,7 +89,7 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); - } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); + } while (upper && !netif_is_bridge_master(upper)); dev_hold(upper); rcu_read_unlock(); diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 970d0d7ccc98..83f31494ea4d 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -747,7 +747,8 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node *orig_node = NULL; struct batadv_hard_iface *primary_if = NULL; bool ret = false; - u8 *orig_addr, orig_ttvn; + const u8 *orig_addr; + u8 orig_ttvn; if 
(batadv_is_my_client(bat_priv, dst_addr, vid)) { primary_if = batadv_primary_if_get_selected(bat_priv); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 0604b0279573..7ee09337fc40 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -134,7 +134,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) return -EADDRNOTAVAIL; ether_addr_copy(old_addr, dev->dev_addr); - ether_addr_copy(dev->dev_addr, addr->sa_data); + eth_hw_addr_set(dev, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 56b9fe97b3b4..fbcb15c7c29b 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -631,9 +631,9 @@ static void batadv_tp_recv_ack(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node = NULL; const struct batadv_icmp_tp_packet *icmp; struct batadv_tp_vars *tp_vars; + const unsigned char *dev_addr; size_t packet_len, mss; u32 rtt, recv_ack, cwnd; - unsigned char *dev_addr; packet_len = BATADV_TP_PLEN; mss = BATADV_TP_PLEN; diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 992773376e51..0cb58eb04093 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -587,8 +587,8 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length */ -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, - u8 *dst, u8 type, u8 version, +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src, + const u8 *dst, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len) { struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 54f2a35653d0..4cf8af00fc11 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -42,8 +42,8 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, u8 *src, u8 *dst, void *tvlv_buff, u16 tvlv_buff_len); -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, - u8 *dst, u8 type, u8 version, +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src, + const u8 *dst, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len); #endif /* _NET_BATMAN_ADV_TVLV_H_ */ diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index cc0995301f93..291770fc9551 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -14,7 +14,8 @@ bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ - ecdh_helper.o hci_request.o mgmt_util.o mgmt_config.o + ecdh_helper.o hci_request.o mgmt_util.o mgmt_config.o hci_codec.o \ + eir.o bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c new file mode 100644 index 000000000000..7e930f77ecab --- /dev/null +++ b/net/bluetooth/eir.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BlueZ - Bluetooth protocol stack for Linux + * + * Copyright (C) 2021 Intel Corporation + */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> +#include <net/bluetooth/mgmt.h> + +#include "eir.h" + +#define PNP_INFO_SVCLASS_ID 0x1200 + +u8 eir_append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) +{ + 
size_t short_len; + size_t complete_len; + + /* no space left for name (+ NULL + type + len) */ + if ((HCI_MAX_AD_LENGTH - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 3) + return ad_len; + + /* use complete name if present and fits */ + complete_len = strlen(hdev->dev_name); + if (complete_len && complete_len <= HCI_MAX_SHORT_NAME_LENGTH) + return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE, + hdev->dev_name, complete_len + 1); + + /* use short name if present */ + short_len = strlen(hdev->short_name); + if (short_len) + return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, + hdev->short_name, short_len + 1); + + /* use shortened full name if present, we already know that name + * is longer then HCI_MAX_SHORT_NAME_LENGTH + */ + if (complete_len) { + u8 name[HCI_MAX_SHORT_NAME_LENGTH + 1]; + + memcpy(name, hdev->dev_name, HCI_MAX_SHORT_NAME_LENGTH); + name[HCI_MAX_SHORT_NAME_LENGTH] = '\0'; + + return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, name, + sizeof(name)); + } + + return ad_len; +} + +u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len) +{ + return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance); +} + +static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 4) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + u16 uuid16; + + if (uuid->size != 16) + continue; + + uuid16 = get_unaligned_le16(&uuid->uuid[12]); + if (uuid16 < 0x1100) + continue; + + if (uuid16 == PNP_INFO_SVCLASS_ID) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID16_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + sizeof(u16) > len) { + uuids_start[1] = EIR_UUID16_SOME; + break; + } + + *ptr++ = (uuid16 & 0x00ff); + *ptr++ = (uuid16 & 0xff00) >> 8; + uuids_start[0] += sizeof(uuid16); + } + + return ptr; +} + +static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 6) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + if (uuid->size != 32) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID32_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + sizeof(u32) > len) { + uuids_start[1] = EIR_UUID32_SOME; + break; + } + + memcpy(ptr, &uuid->uuid[12], sizeof(u32)); + ptr += sizeof(u32); + uuids_start[0] += sizeof(u32); + } + + return ptr; +} + +static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 18) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + if (uuid->size != 128) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID128_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + 16 > len) { + uuids_start[1] = EIR_UUID128_SOME; + break; + } + + memcpy(ptr, uuid->uuid, 16); + ptr += 16; + uuids_start[0] += 16; + } + + return ptr; +} + +void eir_create(struct hci_dev *hdev, u8 *data) +{ + u8 *ptr = data; + size_t name_len; + + name_len = strlen(hdev->dev_name); + + if (name_len > 0) { + /* EIR Data type */ + if (name_len > 48) { + name_len = 48; + ptr[1] = EIR_NAME_SHORT; + } else { + ptr[1] = EIR_NAME_COMPLETE; + } + + /* EIR Data length */ + ptr[0] = 
name_len + 1; + + memcpy(ptr + 2, hdev->dev_name, name_len); + + ptr += (name_len + 2); + } + + if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) { + ptr[0] = 2; + ptr[1] = EIR_TX_POWER; + ptr[2] = (u8)hdev->inq_tx_power; + + ptr += 3; + } + + if (hdev->devid_source > 0) { + ptr[0] = 9; + ptr[1] = EIR_DEVICE_ID; + + put_unaligned_le16(hdev->devid_source, ptr + 2); + put_unaligned_le16(hdev->devid_vendor, ptr + 4); + put_unaligned_le16(hdev->devid_product, ptr + 6); + put_unaligned_le16(hdev->devid_version, ptr + 8); + + ptr += 10; + } + + ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); + ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); + ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); +} + +u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) +{ + struct adv_info *adv = NULL; + u8 ad_len = 0, flags = 0; + u32 instance_flags; + + /* Return 0 when the current instance identifier is invalid. */ + if (instance) { + adv = hci_find_adv_instance(hdev, instance); + if (!adv) + return 0; + } + + instance_flags = hci_adv_instance_flags(hdev, instance); + + /* If instance already has the flags set skip adding it once + * again. + */ + if (adv && eir_get_data(adv->adv_data, adv->adv_data_len, EIR_FLAGS, + NULL)) + goto skip_flags; + + /* The Add Advertising command allows userspace to set both the general + * and limited discoverable flags. + */ + if (instance_flags & MGMT_ADV_FLAG_DISCOV) + flags |= LE_AD_GENERAL; + + if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV) + flags |= LE_AD_LIMITED; + + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + flags |= LE_AD_NO_BREDR; + + if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) { + /* If a discovery flag wasn't provided, simply use the global + * settings. + */ + if (!flags) + flags |= mgmt_get_adv_discov_flags(hdev); + + /* If flags would still be empty, then there is no need to + * include the "Flags" AD field". 
+ */ + if (flags) { + ptr[0] = 0x02; + ptr[1] = EIR_FLAGS; + ptr[2] = flags; + + ad_len += 3; + ptr += 3; + } + } + +skip_flags: + if (adv) { + memcpy(ptr, adv->adv_data, adv->adv_data_len); + ad_len += adv->adv_data_len; + ptr += adv->adv_data_len; + } + + if (instance_flags & MGMT_ADV_FLAG_TX_POWER) { + s8 adv_tx_power; + + if (ext_adv_capable(hdev)) { + if (adv) + adv_tx_power = adv->tx_power; + else + adv_tx_power = hdev->adv_tx_power; + } else { + adv_tx_power = hdev->adv_tx_power; + } + + /* Provide Tx Power only if we can provide a valid value for it */ + if (adv_tx_power != HCI_TX_POWER_INVALID) { + ptr[0] = 0x02; + ptr[1] = EIR_TX_POWER; + ptr[2] = (u8)adv_tx_power; + + ad_len += 3; + ptr += 3; + } + } + + return ad_len; +} + +static u8 create_default_scan_rsp(struct hci_dev *hdev, u8 *ptr) +{ + u8 scan_rsp_len = 0; + + if (hdev->appearance) + scan_rsp_len = eir_append_appearance(hdev, ptr, scan_rsp_len); + + return eir_append_local_name(hdev, ptr, scan_rsp_len); +} + +u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr) +{ + struct adv_info *adv; + u8 scan_rsp_len = 0; + + if (!instance) + return create_default_scan_rsp(hdev, ptr); + + adv = hci_find_adv_instance(hdev, instance); + if (!adv) + return 0; + + if ((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) + scan_rsp_len = eir_append_appearance(hdev, ptr, scan_rsp_len); + + memcpy(&ptr[scan_rsp_len], adv->scan_rsp_data, adv->scan_rsp_len); + + scan_rsp_len += adv->scan_rsp_len; + + if (adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) + scan_rsp_len = eir_append_local_name(hdev, ptr, scan_rsp_len); + + return scan_rsp_len; +} diff --git a/net/bluetooth/eir.h b/net/bluetooth/eir.h new file mode 100644 index 000000000000..724662f8f8b1 --- /dev/null +++ b/net/bluetooth/eir.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BlueZ - Bluetooth protocol stack for Linux + * + * Copyright (C) 2021 Intel Corporation + */ + +void eir_create(struct hci_dev *hdev, u8 *data); + +u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr); +u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr); + +u8 eir_append_local_name(struct hci_dev *hdev, u8 *eir, u8 ad_len); +u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len); + +static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, + u8 *data, u8 data_len) +{ + eir[eir_len++] = sizeof(type) + data_len; + eir[eir_len++] = type; + memcpy(&eir[eir_len], data, data_len); + eir_len += data_len; + + return eir_len; +} + +static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) +{ + eir[eir_len++] = sizeof(type) + sizeof(data); + eir[eir_len++] = type; + put_unaligned_le16(data, &eir[eir_len]); + eir_len += sizeof(data); + + return eir_len; +} + +static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type, + size_t *data_len) +{ + size_t parsed = 0; + + if (eir_len < 2) + return NULL; + + while (parsed < eir_len - 1) { + u8 field_len = eir[0]; + + if (field_len == 0) + break; + + parsed += field_len + 1; + + if (parsed > eir_len) + break; + + if (eir[1] != type) { + eir += field_len + 1; + continue; + } + + /* Zero length data */ + if (field_len == 1) + return NULL; + + if (data_len) + *data_len = field_len - 1; + + return &eir[2]; + } + + return NULL; +} diff --git a/net/bluetooth/hci_codec.c b/net/bluetooth/hci_codec.c new file mode 100644 index 000000000000..f0421d0edaa3 --- /dev/null +++ b/net/bluetooth/hci_codec.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Copyright (C) 2021 Intel 
Corporation */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> +#include "hci_codec.h" + +static int hci_codec_list_add(struct list_head *list, + struct hci_op_read_local_codec_caps *sent, + struct hci_rp_read_local_codec_caps *rp, + void *caps, + __u32 len) +{ + struct codec_list *entry; + + entry = kzalloc(sizeof(*entry) + len, GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->id = sent->id; + if (sent->id == 0xFF) { + entry->cid = __le16_to_cpu(sent->cid); + entry->vid = __le16_to_cpu(sent->vid); + } + entry->transport = sent->transport; + entry->len = len; + entry->num_caps = rp->num_caps; + if (rp->num_caps) + memcpy(entry->caps, caps, len); + list_add(&entry->list, list); + + return 0; +} + +void hci_codec_list_clear(struct list_head *codec_list) +{ + struct codec_list *c, *n; + + list_for_each_entry_safe(c, n, codec_list, list) { + list_del(&c->list); + kfree(c); + } +} + +static void hci_read_codec_capabilities(struct hci_dev *hdev, __u8 transport, + struct hci_op_read_local_codec_caps + *cmd) +{ + __u8 i; + + for (i = 0; i < TRANSPORT_TYPE_MAX; i++) { + if (transport & BIT(i)) { + struct hci_rp_read_local_codec_caps *rp; + struct hci_codec_caps *caps; + struct sk_buff *skb; + __u8 j; + __u32 len; + + cmd->transport = i; + skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS, + sizeof(*cmd), cmd, + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read codec capabilities (%ld)", + PTR_ERR(skb)); + continue; + } + + if (skb->len < sizeof(*rp)) + goto error; + + rp = (void *)skb->data; + + if (rp->status) + goto error; + + if (!rp->num_caps) { + len = 0; + /* this codec doesn't have capabilities */ + goto skip_caps_parse; + } + + skb_pull(skb, sizeof(*rp)); + + for (j = 0, len = 0; j < rp->num_caps; j++) { + caps = (void *)skb->data; + if (skb->len < sizeof(*caps)) + goto error; + if (skb->len < caps->len) + goto error; + len += sizeof(caps->len) + caps->len; + skb_pull(skb, sizeof(caps->len) + caps->len); + } + +skip_caps_parse: + hci_dev_lock(hdev); + hci_codec_list_add(&hdev->local_codecs, cmd, rp, + (__u8 *)rp + sizeof(*rp), len); + hci_dev_unlock(hdev); +error: + kfree_skb(skb); + } + } +} + +void hci_read_supported_codecs(struct hci_dev *hdev) +{ + struct sk_buff *skb; + struct hci_rp_read_local_supported_codecs *rp; + struct hci_std_codecs *std_codecs; + struct hci_vnd_codecs *vnd_codecs; + struct hci_op_read_local_codec_caps caps; + __u8 i; + + skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL, + HCI_CMD_TIMEOUT); + + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", + PTR_ERR(skb)); + return; + } + + if (skb->len < sizeof(*rp)) + goto error; + + rp = (void *)skb->data; + + if (rp->status) + goto error; + + skb_pull(skb, sizeof(rp->status)); + + std_codecs = (void *)skb->data; + + /* validate codecs length before accessing */ + if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + + sizeof(std_codecs->num)) + goto error; + + /* enumerate codec capabilities of standard codecs */ + memset(&caps, 0, sizeof(caps)); + for (i = 0; i < std_codecs->num; i++) { + caps.id = std_codecs->codec[i]; + caps.direction = 0x00; + hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK, &caps); + } + + skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + + sizeof(std_codecs->num)); + + vnd_codecs = (void *)skb->data; + + /* validate vendor codecs length before accessing */ + if (skb->len < + flex_array_size(vnd_codecs, codec, vnd_codecs->num) + + 
sizeof(vnd_codecs->num)) + goto error; + + /* enumerate vendor codec capabilities */ + for (i = 0; i < vnd_codecs->num; i++) { + caps.id = 0xFF; + caps.cid = vnd_codecs->codec[i].cid; + caps.vid = vnd_codecs->codec[i].vid; + caps.direction = 0x00; + hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK, &caps); + } + +error: + kfree_skb(skb); +} + +void hci_read_supported_codecs_v2(struct hci_dev *hdev) +{ + struct sk_buff *skb; + struct hci_rp_read_local_supported_codecs_v2 *rp; + struct hci_std_codecs_v2 *std_codecs; + struct hci_vnd_codecs_v2 *vnd_codecs; + struct hci_op_read_local_codec_caps caps; + __u8 i; + + skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL, + HCI_CMD_TIMEOUT); + + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", + PTR_ERR(skb)); + return; + } + + if (skb->len < sizeof(*rp)) + goto error; + + rp = (void *)skb->data; + + if (rp->status) + goto error; + + skb_pull(skb, sizeof(rp->status)); + + std_codecs = (void *)skb->data; + + /* check for payload data length before accessing */ + if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + + sizeof(std_codecs->num)) + goto error; + + memset(&caps, 0, sizeof(caps)); + + for (i = 0; i < std_codecs->num; i++) { + caps.id = std_codecs->codec[i].id; + hci_read_codec_capabilities(hdev, std_codecs->codec[i].transport, + &caps); + } + + skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + + sizeof(std_codecs->num)); + + vnd_codecs = (void *)skb->data; + + /* check for payload data length before accessing */ + if (skb->len < + flex_array_size(vnd_codecs, codec, vnd_codecs->num) + + sizeof(vnd_codecs->num)) + goto error; + + for (i = 0; i < vnd_codecs->num; i++) { + caps.id = 0xFF; + caps.cid = vnd_codecs->codec[i].cid; + caps.vid = vnd_codecs->codec[i].vid; + hci_read_codec_capabilities(hdev, vnd_codecs->codec[i].transport, + &caps); + } + +error: + kfree_skb(skb); +} diff --git a/net/bluetooth/hci_codec.h b/net/bluetooth/hci_codec.h new file mode 100644 index 000000000000..a2751930f123 --- /dev/null +++ b/net/bluetooth/hci_codec.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Copyright (C) 2014 Intel Corporation */ + +void hci_read_supported_codecs(struct hci_dev *hdev); +void hci_read_supported_codecs_v2(struct hci_dev *hdev); +void hci_codec_list_clear(struct list_head *codec_list); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 2b5059a56cda..bd669c95b9a7 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -307,13 +307,133 @@ static bool find_next_esco_param(struct hci_conn *conn, return conn->attempt <= size; } -bool hci_setup_sync(struct hci_conn *conn, __u16 handle) +static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_enhanced_setup_sync_conn cp; + const struct sco_param *param; + + bt_dev_dbg(hdev, "hcon %p", conn); + + /* for offload use case, codec needs to configured before opening SCO */ + if (conn->codec.data_path) + hci_req_configure_datapath(hdev, &conn->codec); + + conn->state = BT_CONNECT; + conn->out = true; + + conn->attempt++; + + memset(&cp, 0x00, sizeof(cp)); + + cp.handle = cpu_to_le16(handle); + + cp.tx_bandwidth = cpu_to_le32(0x00001f40); + cp.rx_bandwidth = cpu_to_le32(0x00001f40); + + switch (conn->codec.id) { + case BT_CODEC_MSBC: + if (!find_next_esco_param(conn, esco_param_msbc, + ARRAY_SIZE(esco_param_msbc))) + return false; + + param = &esco_param_msbc[conn->attempt - 1]; + 
cp.tx_coding_format.id = 0x05; + cp.rx_coding_format.id = 0x05; + cp.tx_codec_frame_size = __cpu_to_le16(60); + cp.rx_codec_frame_size = __cpu_to_le16(60); + cp.in_bandwidth = __cpu_to_le32(32000); + cp.out_bandwidth = __cpu_to_le32(32000); + cp.in_coding_format.id = 0x04; + cp.out_coding_format.id = 0x04; + cp.in_coded_data_size = __cpu_to_le16(16); + cp.out_coded_data_size = __cpu_to_le16(16); + cp.in_pcm_data_format = 2; + cp.out_pcm_data_format = 2; + cp.in_pcm_sample_payload_msb_pos = 0; + cp.out_pcm_sample_payload_msb_pos = 0; + cp.in_data_path = conn->codec.data_path; + cp.out_data_path = conn->codec.data_path; + cp.in_transport_unit_size = 1; + cp.out_transport_unit_size = 1; + break; + + case BT_CODEC_TRANSPARENT: + if (!find_next_esco_param(conn, esco_param_msbc, + ARRAY_SIZE(esco_param_msbc))) + return false; + param = &esco_param_msbc[conn->attempt - 1]; + cp.tx_coding_format.id = 0x03; + cp.rx_coding_format.id = 0x03; + cp.tx_codec_frame_size = __cpu_to_le16(60); + cp.rx_codec_frame_size = __cpu_to_le16(60); + cp.in_bandwidth = __cpu_to_le32(0x1f40); + cp.out_bandwidth = __cpu_to_le32(0x1f40); + cp.in_coding_format.id = 0x03; + cp.out_coding_format.id = 0x03; + cp.in_coded_data_size = __cpu_to_le16(16); + cp.out_coded_data_size = __cpu_to_le16(16); + cp.in_pcm_data_format = 2; + cp.out_pcm_data_format = 2; + cp.in_pcm_sample_payload_msb_pos = 0; + cp.out_pcm_sample_payload_msb_pos = 0; + cp.in_data_path = conn->codec.data_path; + cp.out_data_path = conn->codec.data_path; + cp.in_transport_unit_size = 1; + cp.out_transport_unit_size = 1; + break; + + case BT_CODEC_CVSD: + if (lmp_esco_capable(conn->link)) { + if (!find_next_esco_param(conn, esco_param_cvsd, + ARRAY_SIZE(esco_param_cvsd))) + return false; + param = &esco_param_cvsd[conn->attempt - 1]; + } else { + if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) + return false; + param = &sco_param_cvsd[conn->attempt - 1]; + } + cp.tx_coding_format.id = 2; + cp.rx_coding_format.id = 2; + cp.tx_codec_frame_size = __cpu_to_le16(60); + cp.rx_codec_frame_size = __cpu_to_le16(60); + cp.in_bandwidth = __cpu_to_le32(16000); + cp.out_bandwidth = __cpu_to_le32(16000); + cp.in_coding_format.id = 4; + cp.out_coding_format.id = 4; + cp.in_coded_data_size = __cpu_to_le16(16); + cp.out_coded_data_size = __cpu_to_le16(16); + cp.in_pcm_data_format = 2; + cp.out_pcm_data_format = 2; + cp.in_pcm_sample_payload_msb_pos = 0; + cp.out_pcm_sample_payload_msb_pos = 0; + cp.in_data_path = conn->codec.data_path; + cp.out_data_path = conn->codec.data_path; + cp.in_transport_unit_size = 16; + cp.out_transport_unit_size = 16; + break; + default: + return false; + } + + cp.retrans_effort = param->retrans_effort; + cp.pkt_type = __cpu_to_le16(param->pkt_type); + cp.max_latency = __cpu_to_le16(param->max_latency); + + if (hci_send_cmd(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0) + return false; + + return true; +} + +static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; struct hci_cp_setup_sync_conn cp; const struct sco_param *param; - BT_DBG("hcon %p", conn); + bt_dev_dbg(hdev, "hcon %p", conn); conn->state = BT_CONNECT; conn->out = true; @@ -359,6 +479,14 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) return true; } +bool hci_setup_sync(struct hci_conn *conn, __u16 handle) +{ + if (enhanced_sco_capable(conn->hdev)) + return hci_enhanced_setup_sync_conn(conn, handle); + + return hci_setup_sync_conn(conn, handle); +} + u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, 
u16 latency, u16 to_multiplier) { @@ -1040,8 +1168,8 @@ static void hci_req_directed_advertising(struct hci_request *req, } struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, - u8 dst_type, u8 sec_level, u16 conn_timeout, - u8 role, bdaddr_t *direct_rpa) + u8 dst_type, bool dst_resolved, u8 sec_level, + u16 conn_timeout, u8 role, bdaddr_t *direct_rpa) { struct hci_conn_params *params; struct hci_conn *conn; @@ -1078,19 +1206,24 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-EBUSY); } - /* When given an identity address with existing identity - * resolving key, the connection needs to be established - * to a resolvable random address. - * - * Storing the resolvable random address is required here - * to handle connection failures. The address will later - * be resolved back into the original identity address - * from the connect request. + /* Check if the destination address has been resolved by the controller + * since if it did then the identity address shall be used. */ - irk = hci_find_irk_by_addr(hdev, dst, dst_type); - if (irk && bacmp(&irk->rpa, BDADDR_ANY)) { - dst = &irk->rpa; - dst_type = ADDR_LE_DEV_RANDOM; + if (!dst_resolved) { + /* When given an identity address with existing identity + * resolving key, the connection needs to be established + * to a resolvable random address. + * + * Storing the resolvable random address is required here + * to handle connection failures. The address will later + * be resolved back into the original identity address + * from the connect request. + */ + irk = hci_find_irk_by_addr(hdev, dst, dst_type); + if (irk && bacmp(&irk->rpa, BDADDR_ANY)) { + dst = &irk->rpa; + dst_type = ADDR_LE_DEV_RANDOM; + } } if (conn) { @@ -1319,7 +1452,7 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, } struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, - __u16 setting) + __u16 setting, struct bt_codec *codec) { struct hci_conn *acl; struct hci_conn *sco; @@ -1344,6 +1477,7 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, hci_conn_hold(sco); sco->setting = setting; + sco->codec = *codec; if (acl->state == BT_CONNECTED && (sco->state == BT_OPEN || sco->state == BT_CLOSED)) { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 8a47a3017d61..8d33aa64846b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -45,6 +45,7 @@ #include "leds.h" #include "msft.h" #include "aosp.h" +#include "hci_codec.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -61,130 +62,6 @@ DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); -/* ---- HCI debugfs entries ---- */ - -static ssize_t dut_mode_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[3]; - - buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 
'Y' : 'N'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - struct sk_buff *skb; - bool enable; - int err; - - if (!test_bit(HCI_UP, &hdev->flags)) - return -ENETDOWN; - - err = kstrtobool_from_user(user_buf, count, &enable); - if (err) - return err; - - if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE)) - return -EALREADY; - - hci_req_sync_lock(hdev); - if (enable) - skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL, - HCI_CMD_TIMEOUT); - else - skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, - HCI_CMD_TIMEOUT); - hci_req_sync_unlock(hdev); - - if (IS_ERR(skb)) - return PTR_ERR(skb); - - kfree_skb(skb); - - hci_dev_change_flag(hdev, HCI_DUT_MODE); - - return count; -} - -static const struct file_operations dut_mode_fops = { - .open = simple_open, - .read = dut_mode_read, - .write = dut_mode_write, - .llseek = default_llseek, -}; - -static ssize_t vendor_diag_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[3]; - - buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y' : 'N'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - bool enable; - int err; - - err = kstrtobool_from_user(user_buf, count, &enable); - if (err) - return err; - - /* When the diagnostic flags are not persistent and the transport - * is not active or in user channel operation, then there is no need - * for the vendor callback. Instead just store the desired value and - * the setting will be programmed when the controller gets powered on. 
- */ - if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && - (!test_bit(HCI_RUNNING, &hdev->flags) || - hci_dev_test_flag(hdev, HCI_USER_CHANNEL))) - goto done; - - hci_req_sync_lock(hdev); - err = hdev->set_diag(hdev, enable); - hci_req_sync_unlock(hdev); - - if (err < 0) - return err; - -done: - if (enable) - hci_dev_set_flag(hdev, HCI_VENDOR_DIAG); - else - hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG); - - return count; -} - -static const struct file_operations vendor_diag_fops = { - .open = simple_open, - .read = vendor_diag_read, - .write = vendor_diag_write, - .llseek = default_llseek, -}; - -static void hci_debugfs_create_basic(struct hci_dev *hdev) -{ - debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, - &dut_mode_fops); - - if (hdev->set_diag) - debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev, - &vendor_diag_fops); -} - static int hci_reset_req(struct hci_request *req, unsigned long opt) { BT_DBG("%s %ld", req->hdev->name, opt); @@ -838,10 +715,6 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->commands[22] & 0x04) hci_set_event_mask_page_2(req); - /* Read local codec list if the HCI command is supported */ - if (hdev->commands[29] & 0x20) - hci_req_add(req, HCI_OP_READ_LOCAL_CODECS, 0, NULL); - /* Read local pairing options if the HCI command is supported */ if (hdev->commands[41] & 0x08) hci_req_add(req, HCI_OP_READ_LOCAL_PAIRING_OPTS, 0, NULL); @@ -937,6 +810,12 @@ static int __hci_init(struct hci_dev *hdev) if (err < 0) return err; + /* Read local codec list if the HCI command is supported */ + if (hdev->commands[45] & 0x04) + hci_read_supported_codecs_v2(hdev); + else if (hdev->commands[29] & 0x20) + hci_read_supported_codecs(hdev); + /* This function is only called when the controller is actually in * configured state. When the controller is marked as unconfigured, * this initialization procedure is not run. @@ -1848,6 +1727,7 @@ int hci_dev_do_close(struct hci_dev *hdev) memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); bacpy(&hdev->random_addr, BDADDR_ANY); + hci_codec_list_clear(&hdev->local_codecs); hci_req_sync_unlock(hdev); @@ -3081,6 +2961,60 @@ int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, } /* This function requires the caller holds hdev->lock */ +u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance) +{ + u32 flags; + struct adv_info *adv; + + if (instance == 0x00) { + /* Instance 0 always manages the "Tx Power" and "Flags" + * fields + */ + flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; + + /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting + * corresponds to the "connectable" instance flag. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) + flags |= MGMT_ADV_FLAG_CONNECTABLE; + + if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) + flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; + else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) + flags |= MGMT_ADV_FLAG_DISCOV; + + return flags; + } + + adv = hci_find_adv_instance(hdev, instance); + + /* Return 0 when we got an invalid instance identifier. 
*/ + if (!adv) + return 0; + + return adv->flags; +} + +bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv; + + /* Instance 0x00 always set local name */ + if (instance == 0x00) + return true; + + adv = hci_find_adv_instance(hdev, instance); + if (!adv) + return false; + + if (adv->flags & MGMT_ADV_FLAG_APPEARANCE || + adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) + return true; + + return adv->scan_rsp_len ? true : false; +} + +/* This function requires the caller holds hdev->lock */ void hci_adv_monitors_clear(struct hci_dev *hdev) { struct adv_monitor *monitor; @@ -3487,15 +3421,6 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, { struct hci_conn_params *param; - switch (addr_type) { - case ADDR_LE_DEV_PUBLIC_RESOLVED: - addr_type = ADDR_LE_DEV_PUBLIC; - break; - case ADDR_LE_DEV_RANDOM_RESOLVED: - addr_type = ADDR_LE_DEV_RANDOM; - break; - } - list_for_each_entry(param, list, action) { if (bacmp(¶m->addr, addr) == 0 && param->addr_type == addr_type) @@ -3701,55 +3626,12 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, struct hci_dev *hdev = container_of(nb, struct hci_dev, suspend_notifier); int ret = 0; - u8 state = BT_RUNNING; - /* If powering down, wait for completion. */ - if (mgmt_powering_down(hdev)) { - set_bit(SUSPEND_POWERING_DOWN, hdev->suspend_tasks); - ret = hci_suspend_wait_event(hdev); - if (ret) - goto done; - } - - /* Suspend notifier should only act on events when powered. */ - if (!hdev_is_powered(hdev) || - hci_dev_test_flag(hdev, HCI_UNREGISTER)) - goto done; + if (action == PM_SUSPEND_PREPARE) + ret = hci_suspend_dev(hdev); + else if (action == PM_POST_SUSPEND) + ret = hci_resume_dev(hdev); - if (action == PM_SUSPEND_PREPARE) { - /* Suspend consists of two actions: - * - First, disconnect everything and make the controller not - * connectable (disabling scanning) - * - Second, program event filter/accept list and enable scan - */ - ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); - if (!ret) - state = BT_SUSPEND_DISCONNECT; - - /* Only configure accept list if disconnect succeeded and wake - * isn't being prevented. - */ - if (!ret && !(hdev->prevent_wake && hdev->prevent_wake(hdev))) { - ret = hci_change_suspend_state(hdev, - BT_SUSPEND_CONFIGURE_WAKE); - if (!ret) - state = BT_SUSPEND_CONFIGURE_WAKE; - } - - hci_clear_wake_reason(hdev); - mgmt_suspending(hdev, state); - - } else if (action == PM_POST_SUSPEND) { - ret = hci_change_suspend_state(hdev, BT_RUNNING); - - mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr, - hdev->wake_addr_type); - } - -done: - /* We always allow suspend even if suspend preparation failed and - * attempt to recover in resume. 
- */ if (ret) bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", action, ret); @@ -3857,6 +3739,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) INIT_LIST_HEAD(&hdev->adv_instances); INIT_LIST_HEAD(&hdev->blocked_keys); + INIT_LIST_HEAD(&hdev->local_codecs); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); INIT_WORK(&hdev->tx_work, hci_tx_work); @@ -3994,6 +3877,7 @@ int hci_register_dev(struct hci_dev *hdev) queue_work(hdev->req_workqueue, &hdev->power_on); idr_init(&hdev->adv_monitors_idr); + msft_register(hdev); return id; @@ -4026,6 +3910,8 @@ void hci_unregister_dev(struct hci_dev *hdev) cancel_work_sync(&hdev->suspend_prepare); } + msft_unregister(hdev); + hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && @@ -4088,16 +3974,78 @@ EXPORT_SYMBOL(hci_release_dev); /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { + int ret; + u8 state = BT_RUNNING; + + bt_dev_dbg(hdev, ""); + + /* Suspend should only act on when powered. */ + if (!hdev_is_powered(hdev) || + hci_dev_test_flag(hdev, HCI_UNREGISTER)) + return 0; + + /* If powering down, wait for completion. */ + if (mgmt_powering_down(hdev)) { + set_bit(SUSPEND_POWERING_DOWN, hdev->suspend_tasks); + ret = hci_suspend_wait_event(hdev); + if (ret) + goto done; + } + + /* Suspend consists of two actions: + * - First, disconnect everything and make the controller not + * connectable (disabling scanning) + * - Second, program event filter/accept list and enable scan + */ + ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); + if (ret) + goto clear; + + state = BT_SUSPEND_DISCONNECT; + + /* Only configure accept list if device may wakeup. */ + if (hdev->wakeup && hdev->wakeup(hdev)) { + ret = hci_change_suspend_state(hdev, BT_SUSPEND_CONFIGURE_WAKE); + if (!ret) + state = BT_SUSPEND_CONFIGURE_WAKE; + } + +clear: + hci_clear_wake_reason(hdev); + mgmt_suspending(hdev, state); + +done: + /* We always allow suspend even if suspend preparation failed and + * attempt to recover in resume. + */ hci_sock_dev_event(hdev, HCI_DEV_SUSPEND); - return 0; + return ret; } EXPORT_SYMBOL(hci_suspend_dev); /* Resume HCI device */ int hci_resume_dev(struct hci_dev *hdev) { + int ret; + + bt_dev_dbg(hdev, ""); + + /* Resume should only act on when powered. */ + if (!hdev_is_powered(hdev) || + hci_dev_test_flag(hdev, HCI_UNREGISTER)) + return 0; + + /* If powering down don't attempt to resume */ + if (mgmt_powering_down(hdev)) + return 0; + + ret = hci_change_suspend_state(hdev, BT_RUNNING); + + mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr, + hdev->wake_addr_type); + hci_sock_dev_event(hdev, HCI_DEV_RESUME); - return 0; + return ret; } EXPORT_SYMBOL(hci_resume_dev); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 841393389f7b..902b40a90b91 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -27,6 +27,7 @@ #include <net/bluetooth/hci_core.h> #include "smp.h" +#include "hci_request.h" #include "hci_debugfs.h" #define DEFINE_QUIRK_ATTRIBUTE(__name, __quirk) \ @@ -1250,3 +1251,125 @@ void hci_debugfs_create_conn(struct hci_conn *conn) snprintf(name, sizeof(name), "%u", conn->handle); conn->debugfs = debugfs_create_dir(name, hdev->debugfs); } + +static ssize_t dut_mode_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 
'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + struct sk_buff *skb; + bool enable; + int err; + + if (!test_bit(HCI_UP, &hdev->flags)) + return -ENETDOWN; + + err = kstrtobool_from_user(user_buf, count, &enable); + if (err) + return err; + + if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE)) + return -EALREADY; + + hci_req_sync_lock(hdev); + if (enable) + skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL, + HCI_CMD_TIMEOUT); + else + skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, + HCI_CMD_TIMEOUT); + hci_req_sync_unlock(hdev); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + + hci_dev_change_flag(hdev, HCI_DUT_MODE); + + return count; +} + +static const struct file_operations dut_mode_fops = { + .open = simple_open, + .read = dut_mode_read, + .write = dut_mode_write, + .llseek = default_llseek, +}; + +static ssize_t vendor_diag_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + bool enable; + int err; + + err = kstrtobool_from_user(user_buf, count, &enable); + if (err) + return err; + + /* When the diagnostic flags are not persistent and the transport + * is not active or in user channel operation, then there is no need + * for the vendor callback. Instead just store the desired value and + * the setting will be programmed when the controller gets powered on. 
+ */ + if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && + (!test_bit(HCI_RUNNING, &hdev->flags) || + hci_dev_test_flag(hdev, HCI_USER_CHANNEL))) + goto done; + + hci_req_sync_lock(hdev); + err = hdev->set_diag(hdev, enable); + hci_req_sync_unlock(hdev); + + if (err < 0) + return err; + +done: + if (enable) + hci_dev_set_flag(hdev, HCI_VENDOR_DIAG); + else + hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG); + + return count; +} + +static const struct file_operations vendor_diag_fops = { + .open = simple_open, + .read = vendor_diag_read, + .write = vendor_diag_write, + .llseek = default_llseek, +}; + +void hci_debugfs_create_basic(struct hci_dev *hdev) +{ + debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, + &dut_mode_fops); + + if (hdev->set_diag) + debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev, + &vendor_diag_fops); +} diff --git a/net/bluetooth/hci_debugfs.h b/net/bluetooth/hci_debugfs.h index 4444dc8cedc2..9a8a7c93bb12 100644 --- a/net/bluetooth/hci_debugfs.h +++ b/net/bluetooth/hci_debugfs.h @@ -26,6 +26,7 @@ void hci_debugfs_create_common(struct hci_dev *hdev); void hci_debugfs_create_bredr(struct hci_dev *hdev); void hci_debugfs_create_le(struct hci_dev *hdev); void hci_debugfs_create_conn(struct hci_conn *conn); +void hci_debugfs_create_basic(struct hci_dev *hdev); #else @@ -45,4 +46,8 @@ static inline void hci_debugfs_create_conn(struct hci_conn *conn) { } +static inline void hci_debugfs_create_basic(struct hci_dev *hdev) +{ +} + #endif diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0bca035bf2dc..7d0db1ca1248 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -36,6 +36,7 @@ #include "amp.h" #include "smp.h" #include "msft.h" +#include "eir.h" #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" @@ -2278,6 +2279,41 @@ static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status) hci_dev_unlock(hdev); } +static void hci_cs_enhanced_setup_sync_conn(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_enhanced_setup_sync_conn *cp; + struct hci_conn *acl, *sco; + __u16 handle; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN); + if (!cp) + return; + + handle = __le16_to_cpu(cp->handle); + + bt_dev_dbg(hdev, "handle 0x%4.4x", handle); + + hci_dev_lock(hdev); + + acl = hci_conn_hash_lookup_handle(hdev, handle); + if (acl) { + sco = acl->link; + if (sco) { + sco->state = BT_CLOSED; + + hci_connect_cfm(sco, status); + hci_conn_del(sco); + } + } + + hci_dev_unlock(hdev); +} + static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status) { struct hci_cp_sniff_mode *cp; @@ -2351,7 +2387,7 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) mgmt_disconnect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); - if (conn->type == LE_LINK) { + if (conn->type == LE_LINK && conn->role == HCI_ROLE_SLAVE) { hdev->cur_adv_instance = conn->adv_instance; hci_req_reenable_advertising(hdev); } @@ -2367,6 +2403,28 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) hci_dev_unlock(hdev); } +static u8 ev_bdaddr_type(struct hci_dev *hdev, u8 type, bool *resolved) +{ + /* When using controller based address resolution, then the new + * address types 0x02 and 0x03 are used. 
These types need to be + * converted back into either public address or random address type + */ + switch (type) { + case ADDR_LE_DEV_PUBLIC_RESOLVED: + if (resolved) + *resolved = true; + return ADDR_LE_DEV_PUBLIC; + case ADDR_LE_DEV_RANDOM_RESOLVED: + if (resolved) + *resolved = true; + return ADDR_LE_DEV_RANDOM; + } + + if (resolved) + *resolved = false; + return type; +} + static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, u8 peer_addr_type, u8 own_address_type, u8 filter_policy) @@ -2378,21 +2436,7 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, if (!conn) return; - /* When using controller based address resolution, then the new - * address types 0x02 and 0x03 are used. These types need to be - * converted back into either public address or random address type - */ - if (use_ll_privacy(hdev) && - hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { - switch (own_address_type) { - case ADDR_LE_DEV_PUBLIC_RESOLVED: - own_address_type = ADDR_LE_DEV_PUBLIC; - break; - case ADDR_LE_DEV_RANDOM_RESOLVED: - own_address_type = ADDR_LE_DEV_RANDOM; - break; - } - } + own_address_type = ev_bdaddr_type(hdev, own_address_type, NULL); /* Store the initiator and responder address information which * is needed for SMP. These values will not change during the @@ -2961,7 +3005,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) * or until a connection is created or until the Advertising * is timed out due to Directed Advertising." */ - if (conn->type == LE_LINK) { + if (conn->type == LE_LINK && conn->role == HCI_ROLE_SLAVE) { hdev->cur_adv_instance = conn->adv_instance; hci_req_reenable_advertising(hdev); } @@ -3756,6 +3800,10 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cs_setup_sync_conn(hdev, ev->status); break; + case HCI_OP_ENHANCED_SETUP_SYNC_CONN: + hci_cs_enhanced_setup_sync_conn(hdev, ev->status); + break; + case HCI_OP_SNIFF_MODE: hci_cs_sniff_mode(hdev, ev->status); break; @@ -4397,6 +4445,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, { struct hci_ev_sync_conn_complete *ev = (void *) skb->data; struct hci_conn *conn; + unsigned int notify_evt; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -4471,15 +4520,21 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, switch (ev->air_mode) { case 0x02: - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + notify_evt = HCI_NOTIFY_ENABLE_SCO_CVSD; break; case 0x03: - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP); + notify_evt = HCI_NOTIFY_ENABLE_SCO_TRANSP; break; } + /* Notify only in case of SCO over HCI transport data path which + * is zero and non-zero value shall be non-HCI transport data path + */ + if (conn->codec.data_path == 0) { + if (hdev->notify) + hdev->notify(hdev, notify_evt); + } + hci_connect_cfm(conn, ev->status); if (ev->status) hci_conn_del(conn); @@ -5282,22 +5337,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->dst_type = irk->addr_type; } - /* When using controller based address resolution, then the new - * address types 0x02 and 0x03 are used. 
These types need to be - * converted back into either public address or random address type - */ - if (use_ll_privacy(hdev) && - hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && - hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { - switch (conn->dst_type) { - case ADDR_LE_DEV_PUBLIC_RESOLVED: - conn->dst_type = ADDR_LE_DEV_PUBLIC; - break; - case ADDR_LE_DEV_RANDOM_RESOLVED: - conn->dst_type = ADDR_LE_DEV_RANDOM; - break; - } - } + conn->dst_type = ev_bdaddr_type(hdev, conn->dst_type, NULL); if (status) { hci_le_conn_failed(conn, status); @@ -5479,8 +5519,8 @@ static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, /* This function requires the caller holds hdev->lock */ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, bdaddr_t *addr, - u8 addr_type, u8 adv_type, - bdaddr_t *direct_rpa) + u8 addr_type, bool addr_resolved, + u8 adv_type, bdaddr_t *direct_rpa) { struct hci_conn *conn; struct hci_conn_params *params; @@ -5532,9 +5572,9 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, } } - conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, - hdev->def_le_autoconnect_timeout, HCI_ROLE_MASTER, - direct_rpa); + conn = hci_connect_le(hdev, addr, addr_type, addr_resolved, + BT_SECURITY_LOW, hdev->def_le_autoconnect_timeout, + HCI_ROLE_MASTER, direct_rpa); if (!IS_ERR(conn)) { /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned * by higher layer that tried to connect, if no then @@ -5575,7 +5615,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, struct discovery_state *d = &hdev->discovery; struct smp_irk *irk; struct hci_conn *conn; - bool match; + bool match, bdaddr_resolved; u32 flags; u8 *ptr; @@ -5619,6 +5659,9 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, * controller address. */ if (direct_addr) { + direct_addr_type = ev_bdaddr_type(hdev, direct_addr_type, + &bdaddr_resolved); + /* Only resolvable random addresses are valid for these * kind of reports and others can be ignored. */ @@ -5646,13 +5689,15 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, bdaddr_type = irk->addr_type; } + bdaddr_type = ev_bdaddr_type(hdev, bdaddr_type, &bdaddr_resolved); + /* Check if we have been requested to connect to this device. * * direct_addr is set only for directed advertising reports (it is NULL * for advertising reports) and is already verified to be RPA above. 
*/ - conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type, - direct_addr); + conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, bdaddr_resolved, + type, direct_addr); if (!ext_adv && conn && type == LE_ADV_IND && len <= HCI_MAX_AD_LENGTH) { /* Store report for later inclusion by * mgmt_device_connected diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index f15626607b2d..92611bfc0b9e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -30,6 +30,7 @@ #include "smp.h" #include "hci_request.h" #include "msft.h" +#include "eir.h" #define HCI_REQ_DONE 0 #define HCI_REQ_PEND 1 @@ -521,164 +522,6 @@ void __hci_req_update_name(struct hci_request *req) hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp); } -#define PNP_INFO_SVCLASS_ID 0x1200 - -static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) -{ - u8 *ptr = data, *uuids_start = NULL; - struct bt_uuid *uuid; - - if (len < 4) - return ptr; - - list_for_each_entry(uuid, &hdev->uuids, list) { - u16 uuid16; - - if (uuid->size != 16) - continue; - - uuid16 = get_unaligned_le16(&uuid->uuid[12]); - if (uuid16 < 0x1100) - continue; - - if (uuid16 == PNP_INFO_SVCLASS_ID) - continue; - - if (!uuids_start) { - uuids_start = ptr; - uuids_start[0] = 1; - uuids_start[1] = EIR_UUID16_ALL; - ptr += 2; - } - - /* Stop if not enough space to put next UUID */ - if ((ptr - data) + sizeof(u16) > len) { - uuids_start[1] = EIR_UUID16_SOME; - break; - } - - *ptr++ = (uuid16 & 0x00ff); - *ptr++ = (uuid16 & 0xff00) >> 8; - uuids_start[0] += sizeof(uuid16); - } - - return ptr; -} - -static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) -{ - u8 *ptr = data, *uuids_start = NULL; - struct bt_uuid *uuid; - - if (len < 6) - return ptr; - - list_for_each_entry(uuid, &hdev->uuids, list) { - if (uuid->size != 32) - continue; - - if (!uuids_start) { - uuids_start = ptr; - uuids_start[0] = 1; - uuids_start[1] = EIR_UUID32_ALL; - ptr += 2; - } - - /* Stop if not enough space to put next UUID */ - if ((ptr - data) + sizeof(u32) > len) { - uuids_start[1] = EIR_UUID32_SOME; - break; - } - - memcpy(ptr, &uuid->uuid[12], sizeof(u32)); - ptr += sizeof(u32); - uuids_start[0] += sizeof(u32); - } - - return ptr; -} - -static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) -{ - u8 *ptr = data, *uuids_start = NULL; - struct bt_uuid *uuid; - - if (len < 18) - return ptr; - - list_for_each_entry(uuid, &hdev->uuids, list) { - if (uuid->size != 128) - continue; - - if (!uuids_start) { - uuids_start = ptr; - uuids_start[0] = 1; - uuids_start[1] = EIR_UUID128_ALL; - ptr += 2; - } - - /* Stop if not enough space to put next UUID */ - if ((ptr - data) + 16 > len) { - uuids_start[1] = EIR_UUID128_SOME; - break; - } - - memcpy(ptr, uuid->uuid, 16); - ptr += 16; - uuids_start[0] += 16; - } - - return ptr; -} - -static void create_eir(struct hci_dev *hdev, u8 *data) -{ - u8 *ptr = data; - size_t name_len; - - name_len = strlen(hdev->dev_name); - - if (name_len > 0) { - /* EIR Data type */ - if (name_len > 48) { - name_len = 48; - ptr[1] = EIR_NAME_SHORT; - } else - ptr[1] = EIR_NAME_COMPLETE; - - /* EIR Data length */ - ptr[0] = name_len + 1; - - memcpy(ptr + 2, hdev->dev_name, name_len); - - ptr += (name_len + 2); - } - - if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) { - ptr[0] = 2; - ptr[1] = EIR_TX_POWER; - ptr[2] = (u8) hdev->inq_tx_power; - - ptr += 3; - } - - if (hdev->devid_source > 0) { - ptr[0] = 9; - ptr[1] = EIR_DEVICE_ID; - - 
put_unaligned_le16(hdev->devid_source, ptr + 2); - put_unaligned_le16(hdev->devid_vendor, ptr + 4); - put_unaligned_le16(hdev->devid_product, ptr + 6); - put_unaligned_le16(hdev->devid_version, ptr + 8); - - ptr += 10; - } - - ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); - ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); - ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); -} - void __hci_req_update_eir(struct hci_request *req) { struct hci_dev *hdev = req->hdev; @@ -698,7 +541,7 @@ void __hci_req_update_eir(struct hci_request *req) memset(&cp, 0, sizeof(cp)); - create_eir(hdev, cp.data); + eir_create(hdev, cp.data); if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) return; @@ -1134,25 +977,6 @@ void hci_req_add_le_passive_scan(struct hci_request *req) addr_resolv); } -static bool adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) -{ - struct adv_info *adv_instance; - - /* Instance 0x00 always set local name */ - if (instance == 0x00) - return true; - - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return false; - - if (adv_instance->flags & MGMT_ADV_FLAG_APPEARANCE || - adv_instance->flags & MGMT_ADV_FLAG_LOCAL_NAME) - return true; - - return adv_instance->scan_rsp_len ? true : false; -} - static void hci_req_clear_event_filter(struct hci_request *req) { struct hci_cp_set_event_filter f; @@ -1281,21 +1105,24 @@ static void suspend_req_complete(struct hci_dev *hdev, u8 status, u16 opcode) } } -static void hci_req_add_set_adv_filter_enable(struct hci_request *req, - bool enable) +static void hci_req_prepare_adv_monitor_suspend(struct hci_request *req, + bool suspending) { struct hci_dev *hdev = req->hdev; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_MSFT: - msft_req_add_set_filter_enable(req, enable); + if (suspending) + msft_suspend(hdev); + else + msft_resume(hdev); break; default: return; } /* No need to block when enabling since it's on resume path */ - if (hdev->suspended && !enable) + if (hdev->suspended && suspending) set_bit(SUSPEND_SET_ADV_FILTER, hdev->suspend_tasks); } @@ -1362,7 +1189,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) } /* Disable advertisement filters */ - hci_req_add_set_adv_filter_enable(&req, false); + hci_req_prepare_adv_monitor_suspend(&req, true); /* Prevent disconnects from causing scanning to be re-enabled */ hdev->scanning_paused = true; @@ -1404,7 +1231,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) /* Reset passive/background scanning to normal */ __hci_update_background_scan(&req); /* Enable all of the advertisement filters */ - hci_req_add_set_adv_filter_enable(&req, true); + hci_req_prepare_adv_monitor_suspend(&req, false); /* Unpause directed advertising */ hdev->advertising_paused = false; @@ -1442,7 +1269,7 @@ done: static bool adv_cur_instance_is_scannable(struct hci_dev *hdev) { - return adv_instance_is_scannable(hdev, hdev->cur_adv_instance); + return hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance); } void __hci_req_disable_advertising(struct hci_request *req) @@ -1457,40 +1284,6 @@ void __hci_req_disable_advertising(struct hci_request *req) } } -static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) -{ - u32 flags; - struct adv_info *adv_instance; - - if (instance == 0x00) { - /* Instance 0 always manages the "Tx Power" and "Flags" - * fields - */ - flags = MGMT_ADV_FLAG_TX_POWER | 
MGMT_ADV_FLAG_MANAGED_FLAGS; - - /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting - * corresponds to the "connectable" instance flag. - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) - flags |= MGMT_ADV_FLAG_CONNECTABLE; - - if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) - flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; - else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) - flags |= MGMT_ADV_FLAG_DISCOV; - - return flags; - } - - adv_instance = hci_find_adv_instance(hdev, instance); - - /* Return 0 when we got an invalid instance identifier. */ - if (!adv_instance) - return 0; - - return adv_instance->flags; -} - static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) { /* If privacy is not enabled don't use RPA */ @@ -1555,15 +1348,15 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) void __hci_req_enable_advertising(struct hci_request *req) { struct hci_dev *hdev = req->hdev; - struct adv_info *adv_instance; + struct adv_info *adv; struct hci_cp_le_set_adv_param cp; u8 own_addr_type, enable = 0x01; bool connectable; u16 adv_min_interval, adv_max_interval; u32 flags; - flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance); - adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); + flags = hci_adv_instance_flags(hdev, hdev->cur_adv_instance); + adv = hci_find_adv_instance(hdev, hdev->cur_adv_instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. @@ -1595,9 +1388,9 @@ void __hci_req_enable_advertising(struct hci_request *req) memset(&cp, 0, sizeof(cp)); - if (adv_instance) { - adv_min_interval = adv_instance->min_interval; - adv_max_interval = adv_instance->max_interval; + if (adv) { + adv_min_interval = adv->min_interval; + adv_max_interval = adv->max_interval; } else { adv_min_interval = hdev->le_adv_min_interval; adv_max_interval = hdev->le_adv_max_interval; @@ -1628,85 +1421,6 @@ void __hci_req_enable_advertising(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } -u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) -{ - size_t short_len; - size_t complete_len; - - /* no space left for name (+ NULL + type + len) */ - if ((HCI_MAX_AD_LENGTH - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 3) - return ad_len; - - /* use complete name if present and fits */ - complete_len = strlen(hdev->dev_name); - if (complete_len && complete_len <= HCI_MAX_SHORT_NAME_LENGTH) - return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE, - hdev->dev_name, complete_len + 1); - - /* use short name if present */ - short_len = strlen(hdev->short_name); - if (short_len) - return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, - hdev->short_name, short_len + 1); - - /* use shortened full name if present, we already know that name - * is longer then HCI_MAX_SHORT_NAME_LENGTH - */ - if (complete_len) { - u8 name[HCI_MAX_SHORT_NAME_LENGTH + 1]; - - memcpy(name, hdev->dev_name, HCI_MAX_SHORT_NAME_LENGTH); - name[HCI_MAX_SHORT_NAME_LENGTH] = '\0'; - - return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, name, - sizeof(name)); - } - - return ad_len; -} - -static u8 append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len) -{ - return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance); -} - -static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) -{ - u8 scan_rsp_len = 0; - - if (hdev->appearance) - scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len); - - return 
append_local_name(hdev, ptr, scan_rsp_len); -} - -static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, - u8 *ptr) -{ - struct adv_info *adv_instance; - u32 instance_flags; - u8 scan_rsp_len = 0; - - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return 0; - - instance_flags = adv_instance->flags; - - if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) - scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len); - - memcpy(&ptr[scan_rsp_len], adv_instance->scan_rsp_data, - adv_instance->scan_rsp_len); - - scan_rsp_len += adv_instance->scan_rsp_len; - - if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME) - scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len); - - return scan_rsp_len; -} - void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; @@ -1723,11 +1437,7 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) memset(&pdu, 0, sizeof(pdu)); - if (instance) - len = create_instance_scan_rsp_data(hdev, instance, - pdu.data); - else - len = create_default_scan_rsp_data(hdev, pdu.data); + len = eir_create_scan_rsp(hdev, instance, pdu.data); if (hdev->scan_rsp_data_len == len && !memcmp(pdu.data, hdev->scan_rsp_data, len)) @@ -1748,11 +1458,7 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - if (instance) - len = create_instance_scan_rsp_data(hdev, instance, - cp.data); - else - len = create_default_scan_rsp_data(hdev, cp.data); + len = eir_create_scan_rsp(hdev, instance, cp.data); if (hdev->scan_rsp_data_len == len && !memcmp(cp.data, hdev->scan_rsp_data, len)) @@ -1767,95 +1473,6 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) } } -static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) -{ - struct adv_info *adv_instance = NULL; - u8 ad_len = 0, flags = 0; - u32 instance_flags; - - /* Return 0 when the current instance identifier is invalid. */ - if (instance) { - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return 0; - } - - instance_flags = get_adv_instance_flags(hdev, instance); - - /* If instance already has the flags set skip adding it once - * again. - */ - if (adv_instance && eir_get_data(adv_instance->adv_data, - adv_instance->adv_data_len, EIR_FLAGS, - NULL)) - goto skip_flags; - - /* The Add Advertising command allows userspace to set both the general - * and limited discoverable flags. - */ - if (instance_flags & MGMT_ADV_FLAG_DISCOV) - flags |= LE_AD_GENERAL; - - if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV) - flags |= LE_AD_LIMITED; - - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - flags |= LE_AD_NO_BREDR; - - if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) { - /* If a discovery flag wasn't provided, simply use the global - * settings. - */ - if (!flags) - flags |= mgmt_get_adv_discov_flags(hdev); - - /* If flags would still be empty, then there is no need to - * include the "Flags" AD field". 
- */ - if (flags) { - ptr[0] = 0x02; - ptr[1] = EIR_FLAGS; - ptr[2] = flags; - - ad_len += 3; - ptr += 3; - } - } - -skip_flags: - if (adv_instance) { - memcpy(ptr, adv_instance->adv_data, - adv_instance->adv_data_len); - ad_len += adv_instance->adv_data_len; - ptr += adv_instance->adv_data_len; - } - - if (instance_flags & MGMT_ADV_FLAG_TX_POWER) { - s8 adv_tx_power; - - if (ext_adv_capable(hdev)) { - if (adv_instance) - adv_tx_power = adv_instance->tx_power; - else - adv_tx_power = hdev->adv_tx_power; - } else { - adv_tx_power = hdev->adv_tx_power; - } - - /* Provide Tx Power only if we can provide a valid value for it */ - if (adv_tx_power != HCI_TX_POWER_INVALID) { - ptr[0] = 0x02; - ptr[1] = EIR_TX_POWER; - ptr[2] = (u8)adv_tx_power; - - ad_len += 3; - ptr += 3; - } - } - - return ad_len; -} - void __hci_req_update_adv_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; @@ -1872,7 +1489,7 @@ void __hci_req_update_adv_data(struct hci_request *req, u8 instance) memset(&pdu, 0, sizeof(pdu)); - len = create_instance_adv_data(hdev, instance, pdu.data); + len = eir_create_adv_data(hdev, instance, pdu.data); /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && @@ -1894,7 +1511,7 @@ void __hci_req_update_adv_data(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - len = create_instance_adv_data(hdev, instance, cp.data); + len = eir_create_adv_data(hdev, instance, cp.data); /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && @@ -2183,7 +1800,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) adv_instance = NULL; } - flags = get_adv_instance_flags(hdev, instance); + flags = hci_adv_instance_flags(hdev, instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. 
@@ -2223,7 +1840,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); - } else if (adv_instance_is_scannable(hdev, instance) || + } else if (hci_adv_instance_is_scannable(hdev, instance) || (flags & MGMT_ADV_PARAM_SCAN_RSP)) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); @@ -3327,6 +2944,53 @@ bool hci_req_stop_discovery(struct hci_request *req) return ret; } +static void config_data_path_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + bt_dev_dbg(hdev, "status %u", status); +} + +int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec) +{ + struct hci_request req; + int err; + __u8 vnd_len, *vnd_data = NULL; + struct hci_op_configure_data_path *cmd = NULL; + + hci_req_init(&req, hdev); + + err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len, + &vnd_data); + if (err < 0) + goto error; + + cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL); + if (!cmd) { + err = -ENOMEM; + goto error; + } + + err = hdev->get_data_path_id(hdev, &cmd->data_path_id); + if (err < 0) + goto error; + + cmd->vnd_len = vnd_len; + memcpy(cmd->vnd_data, vnd_data, vnd_len); + + cmd->direction = 0x00; + hci_req_add(&req, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd); + + cmd->direction = 0x01; + hci_req_add(&req, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd); + + err = hci_req_run(&req, config_data_path_complete); +error: + + kfree(cmd); + kfree(vnd_data); + return err; +} + static int stop_discovery(struct hci_request *req, unsigned long opt) { hci_dev_lock(req->hdev); diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 39ee8a18087a..f31420f58525 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -101,6 +101,8 @@ void __hci_req_update_class(struct hci_request *req); /* Returns true if HCI commands were queued */ bool hci_req_stop_discovery(struct hci_request *req); +int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec); + static inline void hci_req_update_scan(struct hci_dev *hdev) { queue_work(hdev->req_workqueue, &hdev->scan_update); @@ -122,26 +124,3 @@ static inline void hci_update_background_scan(struct hci_dev *hdev) void hci_request_setup(struct hci_dev *hdev); void hci_request_cancel_all(struct hci_dev *hdev); - -u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len); - -static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, - u8 *data, u8 data_len) -{ - eir[eir_len++] = sizeof(type) + data_len; - eir[eir_len++] = type; - memcpy(&eir[eir_len], data, data_len); - eir_len += data_len; - - return eir_len; -} - -static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) -{ - eir[eir_len++] = sizeof(type) + sizeof(data); - eir[eir_len++] = type; - put_unaligned_le16(data, &eir[eir_len]); - eir_len += sizeof(data); - - return eir_len; -} diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index f1128c2134f0..d0dad1fafe07 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -57,6 +57,7 @@ struct hci_pinfo { unsigned long flags; __u32 cookie; char comm[TASK_COMM_LEN]; + __u16 mtu; }; static struct hci_dev *hci_hdev_from_sock(struct sock *sk) @@ -1374,6 +1375,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, break; } + /* Default MTU to HCI_MAX_FRAME_SIZE if not set */ + if (!hci_pi(sk)->mtu) + hci_pi(sk)->mtu = HCI_MAX_FRAME_SIZE; 
+ sk->sk_state = BT_BOUND; done: @@ -1506,9 +1511,8 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, } static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, - struct msghdr *msg, size_t msglen) + struct sk_buff *skb) { - void *buf; u8 *cp; struct mgmt_hdr *hdr; u16 opcode, index, len; @@ -1517,40 +1521,31 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, bool var_len, no_hdev; int err; - BT_DBG("got %zu bytes", msglen); + BT_DBG("got %d bytes", skb->len); - if (msglen < sizeof(*hdr)) + if (skb->len < sizeof(*hdr)) return -EINVAL; - buf = kmalloc(msglen, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (memcpy_from_msg(buf, msg, msglen)) { - err = -EFAULT; - goto done; - } - - hdr = buf; + hdr = (void *)skb->data; opcode = __le16_to_cpu(hdr->opcode); index = __le16_to_cpu(hdr->index); len = __le16_to_cpu(hdr->len); - if (len != msglen - sizeof(*hdr)) { + if (len != skb->len - sizeof(*hdr)) { err = -EINVAL; goto done; } if (chan->channel == HCI_CHANNEL_CONTROL) { - struct sk_buff *skb; + struct sk_buff *cmd; /* Send event to monitor */ - skb = create_monitor_ctrl_command(sk, index, opcode, len, - buf + sizeof(*hdr)); - if (skb) { - hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + cmd = create_monitor_ctrl_command(sk, index, opcode, len, + skb->data + sizeof(*hdr)); + if (cmd) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, cmd, HCI_SOCK_TRUSTED, NULL); - kfree_skb(skb); + kfree_skb(cmd); } } @@ -1615,26 +1610,25 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, if (hdev && chan->hdev_init) chan->hdev_init(sk, hdev); - cp = buf + sizeof(*hdr); + cp = skb->data + sizeof(*hdr); err = handler->func(sk, hdev, cp, len); if (err < 0) goto done; - err = msglen; + err = skb->len; done: if (hdev) hci_dev_put(hdev); - kfree(buf); return err; } -static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len) +static int hci_logging_frame(struct sock *sk, struct sk_buff *skb, + unsigned int flags) { struct hci_mon_hdr *hdr; - struct sk_buff *skb; struct hci_dev *hdev; u16 index; int err; @@ -1643,24 +1637,13 @@ static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len) * the priority byte, the ident length byte and at least one string * terminator NUL byte. Anything shorter are invalid packets. */ - if (len < sizeof(*hdr) + 3) + if (skb->len < sizeof(*hdr) + 3) return -EINVAL; - skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) - return err; - - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - err = -EFAULT; - goto drop; - } - hdr = (void *)skb->data; - if (__le16_to_cpu(hdr->len) != len - sizeof(*hdr)) { - err = -EINVAL; - goto drop; - } + if (__le16_to_cpu(hdr->len) != skb->len - sizeof(*hdr)) + return -EINVAL; if (__le16_to_cpu(hdr->opcode) == 0x0000) { __u8 priority = skb->data[sizeof(*hdr)]; @@ -1679,25 +1662,20 @@ static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len) * The message follows the ident string (if present) and * must be NUL terminated. Otherwise it is not a valid packet. 
*/ - if (priority > 7 || skb->data[len - 1] != 0x00 || - ident_len > len - sizeof(*hdr) - 3 || - skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) { - err = -EINVAL; - goto drop; - } + if (priority > 7 || skb->data[skb->len - 1] != 0x00 || + ident_len > skb->len - sizeof(*hdr) - 3 || + skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) + return -EINVAL; } else { - err = -EINVAL; - goto drop; + return -EINVAL; } index = __le16_to_cpu(hdr->index); if (index != MGMT_INDEX_NONE) { hdev = hci_dev_get(index); - if (!hdev) { - err = -ENODEV; - goto drop; - } + if (!hdev) + return -ENODEV; } else { hdev = NULL; } @@ -1705,13 +1683,11 @@ static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len) hdr->opcode = cpu_to_le16(HCI_MON_USER_LOGGING); hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); - err = len; + err = skb->len; if (hdev) hci_dev_put(hdev); -drop: - kfree_skb(skb); return err; } @@ -1723,19 +1699,23 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, struct hci_dev *hdev; struct sk_buff *skb; int err; + const unsigned int flags = msg->msg_flags; BT_DBG("sock %p sk %p", sock, sk); - if (msg->msg_flags & MSG_OOB) + if (flags & MSG_OOB) return -EOPNOTSUPP; - if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE| - MSG_CMSG_COMPAT)) + if (flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; - if (len < 4 || len > HCI_MAX_FRAME_SIZE) + if (len < 4 || len > hci_pi(sk)->mtu) return -EINVAL; + skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); + if (IS_ERR(skb)) + return PTR_ERR(skb); + lock_sock(sk); switch (hci_pi(sk)->channel) { @@ -1744,39 +1724,30 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, break; case HCI_CHANNEL_MONITOR: err = -EOPNOTSUPP; - goto done; + goto drop; case HCI_CHANNEL_LOGGING: - err = hci_logging_frame(sk, msg, len); - goto done; + err = hci_logging_frame(sk, skb, flags); + goto drop; default: mutex_lock(&mgmt_chan_list_lock); chan = __hci_mgmt_chan_find(hci_pi(sk)->channel); if (chan) - err = hci_mgmt_cmd(chan, sk, msg, len); + err = hci_mgmt_cmd(chan, sk, skb); else err = -EINVAL; mutex_unlock(&mgmt_chan_list_lock); - goto done; + goto drop; } hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) { err = PTR_ERR(hdev); - goto done; + goto drop; } if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; - goto done; - } - - skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) - goto done; - - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - err = -EFAULT; goto drop; } @@ -1857,8 +1828,8 @@ drop: goto done; } -static int hci_sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int len) +static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int len) { struct hci_ufilter uf = { .opcode = 0 }; struct sock *sk = sock->sk; @@ -1866,9 +1837,6 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, BT_DBG("sk %p, opt %d", sk, optname); - if (level != SOL_HCI) - return -ENOPROTOOPT; - lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { @@ -1943,18 +1911,63 @@ done: return err; } -static int hci_sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +static int hci_sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int len) { - struct hci_ufilter uf; struct sock *sk = sock->sk; - int len, opt, err = 0; + int err = 0, opt = 0; 
BT_DBG("sk %p, opt %d", sk, optname); - if (level != SOL_HCI) + if (level == SOL_HCI) + return hci_sock_setsockopt_old(sock, level, optname, optval, + len); + + if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; + lock_sock(sk); + + switch (optname) { + case BT_SNDMTU: + case BT_RCVMTU: + switch (hci_pi(sk)->channel) { + /* Don't allow changing MTU for channels that are meant for HCI + * traffic only. + */ + case HCI_CHANNEL_RAW: + case HCI_CHANNEL_USER: + err = -ENOPROTOOPT; + goto done; + } + + if (copy_from_sockptr(&opt, optval, sizeof(u16))) { + err = -EFAULT; + break; + } + + hci_pi(sk)->mtu = opt; + break; + + default: + err = -ENOPROTOOPT; + break; + } + +done: + release_sock(sk); + return err; +} + +static int hci_sock_getsockopt_old(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct hci_ufilter uf; + struct sock *sk = sock->sk; + int len, opt, err = 0; + + BT_DBG("sk %p, opt %d", sk, optname); + if (get_user(len, optlen)) return -EFAULT; @@ -2012,6 +2025,39 @@ done: return err; } +static int hci_sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p, opt %d", sk, optname); + + if (level == SOL_HCI) + return hci_sock_getsockopt_old(sock, level, optname, optval, + optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case BT_SNDMTU: + case BT_RCVMTU: + if (put_user(hci_pi(sk)->mtu, (u16 __user *)optval)) + err = -EFAULT; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 77ba68209dbd..4f8f37599962 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7902,7 +7902,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, dst_type = ADDR_LE_DEV_RANDOM; if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) - hcon = hci_connect_le(hdev, dst, dst_type, + hcon = hci_connect_le(hdev, dst, dst_type, false, chan->sec_level, HCI_LE_CONN_TIMEOUT, HCI_ROLE_SLAVE, NULL); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index c99d65ef13b1..160c016a5dfb 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1508,6 +1508,9 @@ static void l2cap_sock_close_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (!sk) + return; + l2cap_sock_kill(sk); } @@ -1516,6 +1519,9 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err) struct sock *sk = chan->data; struct sock *parent; + if (!sk) + return; + BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); /* This callback can be called both for server (BT_LISTEN) @@ -1707,8 +1713,10 @@ static void l2cap_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); - if (l2cap_pi(sk)->chan) + if (l2cap_pi(sk)->chan) { + l2cap_pi(sk)->chan->data = NULL; l2cap_chan_put(l2cap_pi(sk)->chan); + } if (l2cap_pi(sk)->rx_busy_skb) { kfree_skb(l2cap_pi(sk)->rx_busy_skb); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index cea01e275f1e..3e5283607b97 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,6 +38,7 @@ #include "mgmt_util.h" #include "mgmt_config.h" #include "msft.h" +#include "eir.h" #define MGMT_VERSION 1 #define MGMT_REVISION 21 @@ -3791,6 +3792,18 @@ static const u8 debug_uuid[16] = { }; #endif +/* 
330859bc-7506-492d-9370-9a6f0614037f */ +static const u8 quality_report_uuid[16] = { + 0x7f, 0x03, 0x14, 0x06, 0x6f, 0x9a, 0x70, 0x93, + 0x2d, 0x49, 0x06, 0x75, 0xbc, 0x59, 0x08, 0x33, +}; + +/* a6695ace-ee7f-4fb9-881a-5fac66c629af */ +static const u8 offload_codecs_uuid[16] = { + 0xaf, 0x29, 0xc6, 0x66, 0xac, 0x5f, 0x1a, 0x88, + 0xb9, 0x4f, 0x7f, 0xee, 0xce, 0x5a, 0x69, 0xa6, +}; + /* 671b10b5-42c0-4696-9227-eb28d1b049d6 */ static const u8 simult_central_periph_uuid[16] = { 0xd6, 0x49, 0xb0, 0xd1, 0x28, 0xeb, 0x27, 0x92, @@ -3806,7 +3819,7 @@ static const u8 rpa_resolution_uuid[16] = { static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - char buf[62]; /* Enough space for 3 features */ + char buf[102]; /* Enough space for 5 features: 2 + 20 * 5 */ struct mgmt_rp_read_exp_features_info *rp = (void *)buf; u16 idx = 0; u32 flags; @@ -3850,6 +3863,28 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, idx++; } + if (hdev && hdev->set_quality_report) { + if (hci_dev_test_flag(hdev, HCI_QUALITY_REPORT)) + flags = BIT(0); + else + flags = 0; + + memcpy(rp->features[idx].uuid, quality_report_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } + + if (hdev && hdev->get_data_path_id) { + if (hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) + flags = BIT(0); + else + flags = 0; + + memcpy(rp->features[idx].uuid, offload_codecs_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } + rp->feature_count = cpu_to_le16(idx); /* After reading the experimental features information, enable @@ -3892,150 +3927,341 @@ static int exp_debug_feature_changed(bool enabled, struct sock *skip) } #endif -static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, - void *data, u16 data_len) +static int exp_quality_report_feature_changed(bool enabled, struct sock *skip) { - struct mgmt_cp_set_exp_feature *cp = data; - struct mgmt_rp_set_exp_feature rp; + struct mgmt_ev_exp_feature_changed ev; - bt_dev_dbg(hdev, "sock %p", sk); + memset(&ev, 0, sizeof(ev)); + memcpy(ev.uuid, quality_report_uuid, 16); + ev.flags = cpu_to_le32(enabled ? BIT(0) : 0); + + return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, NULL, + &ev, sizeof(ev), + HCI_MGMT_EXP_FEATURE_EVENTS, skip); +} + +#define EXP_FEAT(_uuid, _set_func) \ +{ \ + .uuid = _uuid, \ + .set_func = _set_func, \ +} + +/* The zero key uuid is special. Multiple exp features are set through it. 
*/ +static int set_zero_key_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, u16 data_len) +{ + struct mgmt_rp_set_exp_feature rp; - if (!memcmp(cp->uuid, ZERO_KEY, 16)) { - memset(rp.uuid, 0, 16); - rp.flags = cpu_to_le32(0); + memset(rp.uuid, 0, 16); + rp.flags = cpu_to_le32(0); #ifdef CONFIG_BT_FEATURE_DEBUG - if (!hdev) { - bool changed = bt_dbg_get(); + if (!hdev) { + bool changed = bt_dbg_get(); - bt_dbg_set(false); + bt_dbg_set(false); - if (changed) - exp_debug_feature_changed(false, sk); - } + if (changed) + exp_debug_feature_changed(false, sk); + } #endif - if (hdev && use_ll_privacy(hdev) && !hdev_is_powered(hdev)) { - bool changed = hci_dev_test_flag(hdev, - HCI_ENABLE_LL_PRIVACY); + if (hdev && use_ll_privacy(hdev) && !hdev_is_powered(hdev)) { + bool changed = hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY); - hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); - if (changed) - exp_ll_privacy_feature_changed(false, hdev, sk); - } + if (changed) + exp_ll_privacy_feature_changed(false, hdev, sk); + } - hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); - return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, 0, - &rp, sizeof(rp)); - } + return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); +} #ifdef CONFIG_BT_FEATURE_DEBUG - if (!memcmp(cp->uuid, debug_uuid, 16)) { - bool val, changed; - int err; +static int set_debug_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, u16 data_len) +{ + struct mgmt_rp_set_exp_feature rp; - /* Command requires to use the non-controller index */ - if (hdev) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_INDEX); + bool val, changed; + int err; - /* Parameters are limited to a single octet */ - if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) - return mgmt_cmd_status(sk, MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); + /* Command requires to use the non-controller index */ + if (hdev) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); - /* Only boolean on/off is supported */ - if (cp->param[0] != 0x00 && cp->param[0] != 0x01) - return mgmt_cmd_status(sk, MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); - val = !!cp->param[0]; - changed = val ? !bt_dbg_get() : bt_dbg_get(); - bt_dbg_set(val); + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); - memcpy(rp.uuid, debug_uuid, 16); - rp.flags = cpu_to_le32(val ? BIT(0) : 0); + val = !!cp->param[0]; + changed = val ? !bt_dbg_get() : bt_dbg_get(); + bt_dbg_set(val); - hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + memcpy(rp.uuid, debug_uuid, 16); + rp.flags = cpu_to_le32(val ? 
BIT(0) : 0); - err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, 0, - &rp, sizeof(rp)); + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); - if (changed) - exp_debug_feature_changed(val, sk); + err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); - return err; - } + if (changed) + exp_debug_feature_changed(val, sk); + + return err; +} #endif - if (!memcmp(cp->uuid, rpa_resolution_uuid, 16)) { - bool val, changed; - int err; - u32 flags; +static int set_rpa_resolution_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, + u16 data_len) +{ + struct mgmt_rp_set_exp_feature rp; + bool val, changed; + int err; + u32 flags; + + /* Command requires to use the controller index */ + if (!hdev) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); - /* Command requires to use the controller index */ - if (!hdev) - return mgmt_cmd_status(sk, MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_INDEX); + /* Changes can only be made when controller is powered down */ + if (hdev_is_powered(hdev)) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_REJECTED); - /* Changes can only be made when controller is powered down */ - if (hdev_is_powered(hdev)) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_REJECTED); + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); - /* Parameters are limited to a single octet */ - if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); - /* Only boolean on/off is supported */ - if (cp->param[0] != 0x00 && cp->param[0] != 0x01) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); + val = !!cp->param[0]; - val = !!cp->param[0]; + if (val) { + changed = !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_set_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ADVERTISING); - if (val) { - changed = !hci_dev_test_flag(hdev, - HCI_ENABLE_LL_PRIVACY); - hci_dev_set_flag(hdev, HCI_ENABLE_LL_PRIVACY); - hci_dev_clear_flag(hdev, HCI_ADVERTISING); + /* Enable LL privacy + supported settings changed */ + flags = BIT(0) | BIT(1); + } else { + changed = hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); - /* Enable LL privacy + supported settings changed */ - flags = BIT(0) | BIT(1); - } else { - changed = hci_dev_test_flag(hdev, - HCI_ENABLE_LL_PRIVACY); - hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); + /* Disable LL privacy + supported settings changed */ + flags = BIT(1); + } - /* Disable LL privacy + supported settings changed */ - flags = BIT(1); + memcpy(rp.uuid, rpa_resolution_uuid, 16); + rp.flags = cpu_to_le32(flags); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_ll_privacy_feature_changed(val, hdev, sk); + + return err; +} + +static int set_quality_report_func(struct sock *sk, struct hci_dev *hdev, + struct 
mgmt_cp_set_exp_feature *cp, + u16 data_len) +{ + struct mgmt_rp_set_exp_feature rp; + bool val, changed; + int err; + + /* Command requires to use a valid controller index */ + if (!hdev) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + hci_req_sync_lock(hdev); + + val = !!cp->param[0]; + changed = (val != hci_dev_test_flag(hdev, HCI_QUALITY_REPORT)); + + if (!hdev->set_quality_report) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_NOT_SUPPORTED); + goto unlock_quality_report; + } + + if (changed) { + err = hdev->set_quality_report(hdev, val); + if (err) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_FAILED); + goto unlock_quality_report; } + if (val) + hci_dev_set_flag(hdev, HCI_QUALITY_REPORT); + else + hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); + } - memcpy(rp.uuid, rpa_resolution_uuid, 16); - rp.flags = cpu_to_le32(flags); + bt_dev_dbg(hdev, "quality report enable %d changed %d", val, changed); - hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + memcpy(rp.uuid, quality_report_uuid, 16); + rp.flags = cpu_to_le32(val ? BIT(0) : 0); + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); - err = mgmt_cmd_complete(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, 0, - &rp, sizeof(rp)); + if (changed) + exp_quality_report_feature_changed(val, sk); - if (changed) - exp_ll_privacy_feature_changed(val, hdev, sk); +unlock_quality_report: + hci_req_sync_unlock(hdev); + return err; +} - return err; +static int exp_offload_codec_feature_changed(bool enabled, struct sock *skip) +{ + struct mgmt_ev_exp_feature_changed ev; + + memset(&ev, 0, sizeof(ev)); + memcpy(ev.uuid, offload_codecs_uuid, 16); + ev.flags = cpu_to_le32(enabled ? 
BIT(0) : 0); + + return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, NULL, + &ev, sizeof(ev), + HCI_MGMT_EXP_FEATURE_EVENTS, skip); +} + +static int set_offload_codec_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, + u16 data_len) +{ + bool val, changed; + int err; + struct mgmt_rp_set_exp_feature rp; + + /* Command requires to use a valid controller index */ + if (!hdev) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + val = !!cp->param[0]; + changed = (val != hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)); + + if (!hdev->get_data_path_id) { + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_NOT_SUPPORTED); + } + + if (changed) { + if (val) + hci_dev_set_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED); + else + hci_dev_clear_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED); + } + + bt_dev_info(hdev, "offload codecs enable %d changed %d", + val, changed); + + memcpy(rp.uuid, offload_codecs_uuid, 16); + rp.flags = cpu_to_le32(val ? BIT(0) : 0); + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_offload_codec_feature_changed(val, sk); + + return err; +} + +static const struct mgmt_exp_feature { + const u8 *uuid; + int (*set_func)(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, u16 data_len); +} exp_features[] = { + EXP_FEAT(ZERO_KEY, set_zero_key_func), +#ifdef CONFIG_BT_FEATURE_DEBUG + EXP_FEAT(debug_uuid, set_debug_func), +#endif + EXP_FEAT(rpa_resolution_uuid, set_rpa_resolution_func), + EXP_FEAT(quality_report_uuid, set_quality_report_func), + EXP_FEAT(offload_codecs_uuid, set_offload_codec_func), + + /* end with a null feature */ + EXP_FEAT(NULL, NULL) +}; + +static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_set_exp_feature *cp = data; + size_t i = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + for (i = 0; exp_features[i].uuid; i++) { + if (!memcmp(cp->uuid, exp_features[i].uuid, 16)) + return exp_features[i].set_func(sk, hdev, cp, data_len); } return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE, @@ -7315,6 +7541,11 @@ static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev, if (!rp) return -ENOMEM; + if (!status && !lmp_ssp_capable(hdev)) { + status = MGMT_STATUS_NOT_SUPPORTED; + eir_len = 0; + } + if (status) goto complete; @@ -7526,7 +7757,7 @@ static u8 calculate_name_len(struct hci_dev *hdev) { u8 buf[HCI_MAX_SHORT_NAME_LENGTH + 3]; - return append_local_name(hdev, buf, 0); + return eir_append_local_name(hdev, buf, 0); } static u8 tlv_data_max_len(struct hci_dev *hdev, u32 adv_flags, @@ -8222,7 +8453,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, * advertising. 
*/ if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_NOT_SUPPORTED); hci_dev_lock(hdev); diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index b4bfae41e8a5..255cffa554ee 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -94,11 +94,14 @@ struct msft_data { __u16 pending_add_handle; __u16 pending_remove_handle; __u8 reregistering; + __u8 suspending; __u8 filter_enabled; }; static int __msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor); +static int __msft_remove_monitor(struct hci_dev *hdev, + struct adv_monitor *monitor, u16 handle); bool msft_monitor_supported(struct hci_dev *hdev) { @@ -154,7 +157,7 @@ failed: } /* This function requires the caller holds hdev->lock */ -static void reregister_monitor_on_restart(struct hci_dev *hdev, int handle) +static void reregister_monitor(struct hci_dev *hdev, int handle) { struct adv_monitor *monitor; struct msft_data *msft = hdev->msft_data; @@ -182,31 +185,102 @@ static void reregister_monitor_on_restart(struct hci_dev *hdev, int handle) } } +/* This function requires the caller holds hdev->lock */ +static void remove_monitor_on_suspend(struct hci_dev *hdev, int handle) +{ + struct adv_monitor *monitor; + struct msft_data *msft = hdev->msft_data; + int err; + + while (1) { + monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); + if (!monitor) { + /* All monitors have been removed */ + msft->suspending = false; + hci_update_background_scan(hdev); + return; + } + + msft->pending_remove_handle = (u16)handle; + err = __msft_remove_monitor(hdev, monitor, handle); + + /* If success, return and wait for monitor removed callback */ + if (!err) + return; + + /* Otherwise free the monitor and keep removing */ + hci_free_adv_monitor(hdev, monitor); + handle++; + } +} + +/* This function requires the caller holds hdev->lock */ +void msft_suspend(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + if (msft_monitor_supported(hdev)) { + msft->suspending = true; + /* Quietly remove all monitors on suspend to avoid waking up + * the system. + */ + remove_monitor_on_suspend(hdev, 0); + } +} + +/* This function requires the caller holds hdev->lock */ +void msft_resume(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + if (msft_monitor_supported(hdev)) { + msft->reregistering = true; + /* Monitors are removed on suspend, so we need to add all + * monitors on resume.
+ */ + reregister_monitor(hdev, 0); + } +} + void msft_do_open(struct hci_dev *hdev) { - struct msft_data *msft; + struct msft_data *msft = hdev->msft_data; if (hdev->msft_opcode == HCI_OP_NOP) return; + if (!msft) { + bt_dev_err(hdev, "MSFT extension not registered"); + return; + } + bt_dev_dbg(hdev, "Initialize MSFT extension"); - msft = kzalloc(sizeof(*msft), GFP_KERNEL); - if (!msft) - return; + /* Reset existing MSFT data before re-reading */ + kfree(msft->evt_prefix); + msft->evt_prefix = NULL; + msft->evt_prefix_len = 0; + msft->features = 0; if (!read_supported_features(hdev, msft)) { + hdev->msft_data = NULL; kfree(msft); return; } - INIT_LIST_HEAD(&msft->handle_map); - hdev->msft_data = msft; - if (msft_monitor_supported(hdev)) { msft->reregistering = true; msft_set_filter_enable(hdev, true); - reregister_monitor_on_restart(hdev, 0); + /* Monitors get removed on power off, so we need to explicitly + * tell the controller to re-monitor. + */ + reregister_monitor(hdev, 0); } } @@ -221,8 +295,9 @@ void msft_do_close(struct hci_dev *hdev) bt_dev_dbg(hdev, "Cleanup of MSFT extension"); - hdev->msft_data = NULL; - + /* The controller will silently remove all monitors on power off. + * Therefore, remove handle_data mapping and reset monitor state. + */ list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) { monitor = idr_find(&hdev->adv_monitors_idr, handle_data->mgmt_handle); @@ -233,6 +308,34 @@ void msft_do_close(struct hci_dev *hdev) list_del(&handle_data->list); kfree(handle_data); } +} + +void msft_register(struct hci_dev *hdev) +{ + struct msft_data *msft = NULL; + + bt_dev_dbg(hdev, "Register MSFT extension"); + + msft = kzalloc(sizeof(*msft), GFP_KERNEL); + if (!msft) { + bt_dev_err(hdev, "Failed to register MSFT extension"); + return; + } + + INIT_LIST_HEAD(&msft->handle_map); + hdev->msft_data = msft; +} + +void msft_unregister(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + bt_dev_dbg(hdev, "Unregister MSFT extension"); + + hdev->msft_data = NULL; kfree(msft->evt_prefix); kfree(msft); @@ -345,8 +448,7 @@ unlock: /* If in restart/reregister sequence, keep registering. */ if (msft->reregistering) - reregister_monitor_on_restart(hdev, - msft->pending_add_handle + 1); + reregister_monitor(hdev, msft->pending_add_handle + 1); hci_dev_unlock(hdev); @@ -383,13 +485,25 @@ static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, if (handle_data) { monitor = idr_find(&hdev->adv_monitors_idr, handle_data->mgmt_handle); - if (monitor) + + if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED) + monitor->state = ADV_MONITOR_STATE_REGISTERED; + + /* Do not free the monitor if it is being removed due to + * suspend. It will be re-monitored on resume. + */ + if (monitor && !msft->suspending) hci_free_adv_monitor(hdev, monitor); list_del(&handle_data->list); kfree(handle_data); } + /* If in suspend/remove sequence, keep removing. */ + if (msft->suspending) + remove_monitor_on_suspend(hdev, + msft->pending_remove_handle + 1); + /* If remove all monitors is required, we need to continue the process * here because the earlier it was paused when waiting for the * response from controller. 
@@ -408,7 +522,8 @@ static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, hci_dev_unlock(hdev); done: - hci_remove_adv_monitor_complete(hdev, status); + if (!msft->suspending) + hci_remove_adv_monitor_complete(hdev, status); } static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev, @@ -541,15 +656,15 @@ int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor) if (!msft) return -EOPNOTSUPP; - if (msft->reregistering) + if (msft->reregistering || msft->suspending) return -EBUSY; return __msft_add_monitor_pattern(hdev, monitor); } /* This function requires the caller holds hdev->lock */ -int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, - u16 handle) +static int __msft_remove_monitor(struct hci_dev *hdev, + struct adv_monitor *monitor, u16 handle) { struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_monitor_advertisement_handle_data *handle_data; @@ -557,12 +672,6 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, struct msft_data *msft = hdev->msft_data; int err = 0; - if (!msft) - return -EOPNOTSUPP; - - if (msft->reregistering) - return -EBUSY; - handle_data = msft_find_handle_data(hdev, monitor->handle, true); /* If no matched handle, just remove without telling controller */ @@ -582,6 +691,21 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, return err; } +/* This function requires the caller holds hdev->lock */ +int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, + u16 handle) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return -EOPNOTSUPP; + + if (msft->reregistering || msft->suspending) + return -EBUSY; + + return __msft_remove_monitor(hdev, monitor, handle); +} + void msft_req_add_set_filter_enable(struct hci_request *req, bool enable) { struct hci_dev *hdev = req->hdev; diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index 6e56d94b88d8..59c6e081c789 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -13,6 +13,8 @@ #if IS_ENABLED(CONFIG_BT_MSFTEXT) bool msft_monitor_supported(struct hci_dev *hdev); +void msft_register(struct hci_dev *hdev); +void msft_unregister(struct hci_dev *hdev); void msft_do_open(struct hci_dev *hdev); void msft_do_close(struct hci_dev *hdev); void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); @@ -22,6 +24,8 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, u16 handle); void msft_req_add_set_filter_enable(struct hci_request *req, bool enable); int msft_set_filter_enable(struct hci_dev *hdev, bool enable); +void msft_suspend(struct hci_dev *hdev); +void msft_resume(struct hci_dev *hdev); bool msft_curve_validity(struct hci_dev *hdev); #else @@ -31,6 +35,8 @@ static inline bool msft_monitor_supported(struct hci_dev *hdev) return false; } +static inline void msft_register(struct hci_dev *hdev) {} +static inline void msft_unregister(struct hci_dev *hdev) {} static inline void msft_do_open(struct hci_dev *hdev) {} static inline void msft_do_close(struct hci_dev *hdev) {} static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} @@ -55,6 +61,9 @@ static inline int msft_set_filter_enable(struct hci_dev *hdev, bool enable) return -EOPNOTSUPP; } +static inline void msft_suspend(struct hci_dev *hdev) {} +static inline void msft_resume(struct hci_dev *hdev) {} + static inline bool msft_curve_validity(struct hci_dev *hdev) { return false; diff --git a/net/bluetooth/rfcomm/core.c 
b/net/bluetooth/rfcomm/core.c index f2bacb464ccf..7324764384b6 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -549,22 +549,58 @@ struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel) return dlc; } +static int rfcomm_dlc_send_frag(struct rfcomm_dlc *d, struct sk_buff *frag) +{ + int len = frag->len; + + BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len); + + if (len > d->mtu) + return -EINVAL; + + rfcomm_make_uih(frag, d->addr); + __skb_queue_tail(&d->tx_queue, frag); + + return len; +} + int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) { - int len = skb->len; + unsigned long flags; + struct sk_buff *frag, *next; + int len; if (d->state != BT_CONNECTED) return -ENOTCONN; - BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len); + frag = skb_shinfo(skb)->frag_list; + skb_shinfo(skb)->frag_list = NULL; - if (len > d->mtu) - return -EINVAL; + /* Queue all fragments atomically. */ + spin_lock_irqsave(&d->tx_queue.lock, flags); - rfcomm_make_uih(skb, d->addr); - skb_queue_tail(&d->tx_queue, skb); + len = rfcomm_dlc_send_frag(d, skb); + if (len < 0 || !frag) + goto unlock; + + for (; frag; frag = next) { + int ret; + + next = frag->next; + + ret = rfcomm_dlc_send_frag(d, frag); + if (ret < 0) { + kfree_skb(frag); + goto unlock; + } + + len += ret; + } + +unlock: + spin_unlock_irqrestore(&d->tx_queue.lock, flags); - if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags)) + if (len > 0 && !test_bit(RFCOMM_TX_THROTTLED, &d->flags)) rfcomm_schedule(); return len; } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 2c95bb58f901..4bf4ea6cbb5e 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -575,46 +575,20 @@ static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); sent = bt_sock_wait_ready(sk, msg->msg_flags); - if (sent) - goto done; - - while (len) { - size_t size = min_t(size_t, len, d->mtu); - int err; - - skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE, - msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) { - if (sent == 0) - sent = err; - break; - } - skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); - - err = memcpy_from_msg(skb_put(skb, size), msg, size); - if (err) { - kfree_skb(skb); - if (sent == 0) - sent = err; - break; - } - skb->priority = sk->sk_priority; + release_sock(sk); - err = rfcomm_dlc_send(d, skb); - if (err < 0) { - kfree_skb(skb); - if (sent == 0) - sent = err; - break; - } + if (sent) + return sent; - sent += size; - len -= size; - } + skb = bt_skb_sendmmsg(sk, msg, len, d->mtu, RFCOMM_SKB_HEAD_RESERVE, + RFCOMM_SKB_TAIL_RESERVE); + if (IS_ERR(skb)) + return PTR_ERR(skb); -done: - release_sock(sk); + sent = rfcomm_dlc_send(d, skb); + if (sent < 0) + kfree_skb(skb); return sent; } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 98a881586512..8eabf41b2993 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -69,6 +69,7 @@ struct sco_pinfo { __u32 flags; __u16 setting; __u8 cmsg_mask; + struct bt_codec codec; struct sco_conn *conn; }; @@ -133,6 +134,7 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) return NULL; spin_lock_init(&conn->lock); + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); hcon->sco_data = conn; conn->hcon = hcon; @@ -187,20 +189,21 @@ static void sco_conn_del(struct hci_conn *hcon, int err) /* Kill socket */ sco_conn_lock(conn); sk = conn->sk; + if (sk) + sock_hold(sk); sco_conn_unlock(conn); if (sk) { - sock_hold(sk); lock_sock(sk); sco_sock_clear_timer(sk); sco_chan_del(sk, err); 
release_sock(sk); sock_put(sk); - - /* Ensure no more work items will run before freeing conn. */ - cancel_delayed_work_sync(&conn->timeout_work); } + /* Ensure no more work items will run before freeing conn. */ + cancel_delayed_work_sync(&conn->timeout_work); + hcon->sco_data = NULL; kfree(conn); } @@ -213,8 +216,6 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, sco_pi(sk)->conn = conn; conn->sk = sk; - INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); - if (parent) bt_accept_enqueue(parent, sk, true); } @@ -252,7 +253,7 @@ static int sco_connect(struct hci_dev *hdev, struct sock *sk) return -EOPNOTSUPP; hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, - sco_pi(sk)->setting); + sco_pi(sk)->setting, &sco_pi(sk)->codec); if (IS_ERR(hcon)) return PTR_ERR(hcon); @@ -280,11 +281,10 @@ static int sco_connect(struct hci_dev *hdev, struct sock *sk) return err; } -static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) +static int sco_send_frame(struct sock *sk, struct sk_buff *skb) { struct sco_conn *conn = sco_pi(sk)->conn; - struct sk_buff *skb; - int err; + int len = skb->len; /* Check outgoing MTU */ if (len > conn->mtu) @@ -292,15 +292,6 @@ static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) BT_DBG("sk %p len %d", sk, len); - skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) - return err; - - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - kfree_skb(skb); - return -EFAULT; - } - hci_send_sco(conn->hcon, skb); return len; @@ -444,6 +435,7 @@ static void __sco_sock_close(struct sock *sk) sock_set_flag(sk, SOCK_ZAPPED); break; } + } /* Must be called on unlocked socket. */ @@ -504,6 +496,10 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = BT_OPEN; sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; + sco_pi(sk)->codec.id = BT_CODEC_CVSD; + sco_pi(sk)->codec.cid = 0xffff; + sco_pi(sk)->codec.vid = 0xffff; + sco_pi(sk)->codec.data_path = 0x00; bt_sock_link(&sco_sk_list, sk); return sk; @@ -725,6 +721,7 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; + struct sk_buff *skb; int err; BT_DBG("sock %p, sk %p", sock, sk); @@ -736,14 +733,21 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); + if (IS_ERR(skb)) + return PTR_ERR(skb); + lock_sock(sk); if (sk->sk_state == BT_CONNECTED) - err = sco_send_frame(sk, msg, len); + err = sco_send_frame(sk, skb); else err = -ENOTCONN; release_sock(sk); + + if (err < 0) + kfree_skb(skb); return err; } @@ -825,6 +829,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, int len, err = 0; struct bt_voice voice; u32 opt; + struct bt_codecs *codecs; + struct hci_dev *hdev; + __u8 buffer[255]; BT_DBG("sk %p", sk); @@ -872,6 +879,16 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, } sco_pi(sk)->setting = voice.setting; + hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, + BDADDR_BREDR); + if (!hdev) { + err = -EBADFD; + break; + } + if (enhanced_sco_capable(hdev) && + voice.setting == BT_VOICE_TRANSPARENT) + sco_pi(sk)->codec.id = BT_CODEC_TRANSPARENT; + hci_dev_put(hdev); break; case BT_PKT_STATUS: @@ -886,6 +903,57 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sco_pi(sk)->cmsg_mask &= SCO_CMSG_PKT_STATUS; break; + case BT_CODEC: + if (sk->sk_state != 
BT_OPEN && sk->sk_state != BT_BOUND && + sk->sk_state != BT_CONNECT2) { + err = -EINVAL; + break; + } + + hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, + BDADDR_BREDR); + if (!hdev) { + err = -EBADFD; + break; + } + + if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { + hci_dev_put(hdev); + err = -EOPNOTSUPP; + break; + } + + if (!hdev->get_data_path_id) { + hci_dev_put(hdev); + err = -EOPNOTSUPP; + break; + } + + if (optlen < sizeof(struct bt_codecs) || + optlen > sizeof(buffer)) { + hci_dev_put(hdev); + err = -EINVAL; + break; + } + + if (copy_from_sockptr(buffer, optval, optlen)) { + hci_dev_put(hdev); + err = -EFAULT; + break; + } + + codecs = (void *)buffer; + + if (codecs->num_codecs > 1) { + hci_dev_put(hdev); + err = -EINVAL; + break; + } + + sco_pi(sk)->codec = codecs->codecs[0]; + hci_dev_put(hdev); + break; + default: err = -ENOPROTOOPT; break; @@ -964,6 +1032,12 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, struct bt_voice voice; u32 phys; int pkt_status; + int buf_len; + struct codec_list *c; + u8 num_codecs, i, __user *ptr; + struct hci_dev *hdev; + struct hci_codec_caps *caps; + struct bt_codec codec; BT_DBG("sk %p", sk); @@ -1028,6 +1102,101 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_CODEC: + num_codecs = 0; + buf_len = 0; + + hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); + if (!hdev) { + err = -EBADFD; + break; + } + + if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { + hci_dev_put(hdev); + err = -EOPNOTSUPP; + break; + } + + if (!hdev->get_data_path_id) { + hci_dev_put(hdev); + err = -EOPNOTSUPP; + break; + } + + /* find total buffer size required to copy codec + caps */ + hci_dev_lock(hdev); + list_for_each_entry(c, &hdev->local_codecs, list) { + if (c->transport != HCI_TRANSPORT_SCO_ESCO) + continue; + num_codecs++; + for (i = 0, caps = c->caps; i < c->num_caps; i++) { + buf_len += 1 + caps->len; + caps = (void *)&caps->data[caps->len]; + } + buf_len += sizeof(struct bt_codec); + } + hci_dev_unlock(hdev); + + buf_len += sizeof(struct bt_codecs); + if (buf_len > len) { + hci_dev_put(hdev); + err = -ENOBUFS; + break; + } + ptr = optval; + + if (put_user(num_codecs, ptr)) { + hci_dev_put(hdev); + err = -EFAULT; + break; + } + ptr += sizeof(num_codecs); + + /* Iterate all the codecs supported over SCO and populate + * codec data + */ + hci_dev_lock(hdev); + list_for_each_entry(c, &hdev->local_codecs, list) { + if (c->transport != HCI_TRANSPORT_SCO_ESCO) + continue; + + codec.id = c->id; + codec.cid = c->cid; + codec.vid = c->vid; + err = hdev->get_data_path_id(hdev, &codec.data_path); + if (err < 0) + break; + codec.num_caps = c->num_caps; + if (copy_to_user(ptr, &codec, sizeof(codec))) { + err = -EFAULT; + break; + } + ptr += sizeof(codec); + + /* find codec capabilities data length */ + len = 0; + for (i = 0, caps = c->caps; i < c->num_caps; i++) { + len += 1 + caps->len; + caps = (void *)&caps->data[caps->len]; + } + + /* copy codec capabilities data */ + if (len && copy_to_user(ptr, c->caps, len)) { + err = -EFAULT; + break; + } + ptr += len; + } + + if (!err && put_user(buf_len, optlen)) + err = -EFAULT; + + hci_dev_unlock(hdev); + hci_dev_put(hdev); + + break; + default: err = -ENOPROTOOPT; break; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fcb2f493f710..072f0c16c779 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -558,6 +558,12 @@ static void convert_skb_to___skb(struct sk_buff *skb, 
struct __sk_buff *__skb) __skb->hwtstamp = skb_shinfo(skb)->hwtstamps.hwtstamp; } +static struct proto bpf_dummy_proto = { + .name = "bpf_dummy", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), +}; + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { @@ -602,20 +608,19 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } - sk = kzalloc(sizeof(struct sock), GFP_USER); + sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { kfree(data); kfree(ctx); return -ENOMEM; } - sock_net_set(sk, net); sock_init_data(NULL, sk); skb = build_skb(data, 0); if (!skb) { kfree(data); kfree(ctx); - kfree(sk); + sk_free(sk); return -ENOMEM; } skb->sk = sk; @@ -688,8 +693,7 @@ out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); - bpf_sk_storage_free(sk); - kfree(sk); + sk_free(sk); kfree(ctx); return ret; } @@ -803,7 +807,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (ret) goto free_data; - bpf_prog_change_xdp(NULL, prog); + if (repeat > 1) + bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); /* We convert the xdp_buff back to an xdp_md before checking the return * code so the reference count of any held netdevice will be decremented @@ -824,7 +829,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, sizeof(struct xdp_md)); out: - bpf_prog_change_xdp(prog, NULL); + if (repeat > 1) + bpf_prog_change_xdp(prog, NULL); free_data: kfree(data); free_ctx: diff --git a/net/bridge/br.c b/net/bridge/br.c index d3a32c6813e0..1fac72cc617f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -36,7 +36,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v bool changed_addr; int err; - if (dev->priv_flags & IFF_EBRIDGE) { + if (netif_is_bridge_master(dev)) { err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); @@ -349,7 +349,7 @@ static void __net_exit br_net_exit(struct net *net) rtnl_lock(); for_each_netdev(net, dev) - if (dev->priv_flags & IFF_EBRIDGE) + if (netif_is_bridge_master(dev)) br_dev_delete(dev, &list); unregister_netdevice_many(&list); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 46812b659710..a6a68e18c70a 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -825,7 +825,7 @@ int br_fdb_dump(struct sk_buff *skb, struct net_bridge_fdb_entry *f; int err = 0; - if (!(dev->priv_flags & IFF_EBRIDGE)) + if (!netif_is_bridge_master(dev)) return err; if (!filter_dev) { @@ -1076,7 +1076,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - if (dev->priv_flags & IFF_EBRIDGE) { + if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { @@ -1173,7 +1173,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_bridge *br; int err; - if (dev->priv_flags & IFF_EBRIDGE) { + if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4a02f8bb278a..c11bba3e7ec0 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -471,7 +471,7 @@ int br_del_bridge(struct net *net, const char *name) if (dev == NULL) ret = -ENXIO; /* Could not find device */ - else if (!(dev->priv_flags & IFF_EBRIDGE)) { + else if (!netif_is_bridge_master(dev)) { /* Attempt to delete non bridge device! 
*/ ret = -EPERM; } diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 793b0db9d9a3..db4ab2c2ce18 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -26,7 +26,7 @@ static int get_bridge_ifindices(struct net *net, int *indices, int num) for_each_netdev_rcu(net, dev) { if (i >= num) break; - if (dev->priv_flags & IFF_EBRIDGE) + if (netif_is_bridge_master(dev)) indices[i++] = dev->ifindex; } rcu_read_unlock(); @@ -71,7 +71,8 @@ static int get_fdb_entries(struct net_bridge *br, void __user *userbuf, num = br_fdb_fillbuf(br, buf, maxnum, offset); if (num > 0) { - if (copy_to_user(userbuf, buf, num*sizeof(struct __fdb_entry))) + if (copy_to_user(userbuf, buf, + array_size(num, sizeof(struct __fdb_entry)))) num = -EFAULT; } kfree(buf); @@ -188,7 +189,7 @@ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user return -ENOMEM; get_port_ifindices(br, indices, num); - if (copy_to_user(argp, indices, num * sizeof(int))) + if (copy_to_user(argp, indices, array_size(num, sizeof(int)))) num = -EFAULT; kfree(indices); return num; @@ -336,7 +337,8 @@ static int old_deviceless(struct net *net, void __user *uarg) args[2] = get_bridge_ifindices(net, indices, args[2]); - ret = copy_to_user(uarg, indices, args[2]*sizeof(int)) + ret = copy_to_user(uarg, indices, + array_size(args[2], sizeof(int))) ? -EFAULT : args[2]; kfree(indices); diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 0281453f7766..61ccf46fcc21 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -422,7 +422,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = net->dev_base_seq; for_each_netdev_rcu(net, dev) { - if (dev->priv_flags & IFF_EBRIDGE) { + if (netif_is_bridge_master(dev)) { struct net_bridge *br = netdev_priv(dev); struct br_port_msg *bpm; @@ -1016,7 +1016,7 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENODEV; } - if (!(dev->priv_flags & IFF_EBRIDGE)) { + if (!netif_is_bridge_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge"); return -EOPNOTSUPP; } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 3523c8c7068f..f3d751105343 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1677,8 +1677,6 @@ static void br_multicast_update_querier(struct net_bridge_mcast *brmctx, int ifindex, struct br_ip *saddr) { - lockdep_assert_held_once(&brmctx->br->multicast_lock); - write_seqcount_begin(&querier->seq); querier->port_ifidx = ifindex; memcpy(&querier->addr, saddr, sizeof(*saddr)); @@ -3867,13 +3865,13 @@ void br_multicast_ctx_init(struct net_bridge *br, brmctx->ip4_other_query.delay_time = 0; brmctx->ip4_querier.port_ifidx = 0; - seqcount_init(&brmctx->ip4_querier.seq); + seqcount_spinlock_init(&brmctx->ip4_querier.seq, &br->multicast_lock); brmctx->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) brmctx->multicast_mld_version = 1; brmctx->ip6_other_query.delay_time = 0; brmctx->ip6_querier.port_ifidx = 0; - seqcount_init(&brmctx->ip6_querier.seq); + seqcount_spinlock_init(&brmctx->ip6_querier.seq, &br->multicast_lock); #endif timer_setup(&brmctx->ip4_mc_router_timer, diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 8edfb98ae1d5..b5af68c105a8 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -968,7 +968,7 @@ static int brnf_device_event(struct notifier_block *unused, unsigned long event, struct net *net; int ret; - if (event != NETDEV_REGISTER || !(dev->priv_flags & 
IFF_EBRIDGE)) + if (event != NETDEV_REGISTER || !netif_is_bridge_master(dev)) return NOTIFY_DONE; ASSERT_RTNL(); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6c58fc14d2cb..0c8b5f1a15bc 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -106,7 +106,7 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); - } else if (dev->priv_flags & IFF_EBRIDGE) { + } else if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group_rcu(br); } @@ -1050,7 +1050,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) p = br_port_get_rtnl(dev); /* We want to accept dev as bridge itself as well */ - if (!p && !(dev->priv_flags & IFF_EBRIDGE)) + if (!p && !netif_is_bridge_master(dev)) return -EINVAL; err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL); @@ -1666,7 +1666,8 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) } return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + - nla_total_size(sizeof(struct br_mcast_stats)) + + nla_total_size_64bit(sizeof(struct br_mcast_stats)) + + (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) + nla_total_size(0); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b4cef3a97f12..37ca76406f1e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -82,7 +82,7 @@ struct bridge_mcast_other_query { struct bridge_mcast_querier { struct br_ip addr; int port_ifidx; - seqcount_t seq; + seqcount_spinlock_t seq; }; /* IGMP/MLD statistics */ @@ -1125,9 +1125,7 @@ static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brm static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx) { - /* use the RFC default of 2 for QRV */ - return 2 * brmctx->multicast_query_interval + - brmctx->multicast_query_response_interval; + return brmctx->multicast_membership_interval; } static inline bool diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index ba55851fe132..75204d36d7f9 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -233,7 +233,7 @@ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN); memcpy(br->bridge_id.addr, addr, ETH_ALEN); - memcpy(br->dev->dev_addr, addr, ETH_ALEN); + eth_hw_addr_set(br->dev, addr); list_for_each_entry(p, &br->port_list, list) { if (ether_addr_equal(p->designated_bridge.addr, oldaddr)) diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index a7af4eaff17d..1a11064f9990 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -66,7 +66,7 @@ static unsigned int ebt_broute(void *priv, struct sk_buff *skb, NFPROTO_BRIDGE, s->in, NULL, NULL, s->net, NULL); - ret = ebt_do_table(skb, &state, priv); + ret = ebt_do_table(priv, skb, &state); if (ret != NF_DROP) return ret; diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index c0b121df4a9a..cb949436bc0e 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -58,28 +58,21 @@ static const struct ebt_table frame_filter = { .me = THIS_MODULE, }; -static unsigned int -ebt_filter_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ebt_do_table(skb, state, priv); -} - static const struct nf_hook_ops ebt_ops_filter[] = { { - .hook = 
ebt_filter_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { - .hook = ebt_filter_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { - .hook = ebt_filter_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_FILTER_OTHER, diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 4078151c224f..5ee0531ae506 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -58,27 +58,21 @@ static const struct ebt_table frame_nat = { .me = THIS_MODULE, }; -static unsigned int ebt_nat_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ebt_do_table(skb, state, priv); -} - static const struct nf_hook_ops ebt_ops_nat[] = { { - .hook = ebt_nat_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_NAT_DST_OTHER, }, { - .hook = ebt_nat_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_NAT_SRC, }, { - .hook = ebt_nat_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_NAT_DST_BRIDGED, diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 83d1798dfbb4..460d3064dc15 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -189,10 +189,10 @@ ebt_get_target_c(const struct ebt_entry *e) } /* Do some firewalling */ -unsigned int ebt_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct ebt_table *table) +unsigned int ebt_do_table(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { + struct ebt_table *table = priv; unsigned int hook = state->hook; int i, nentries; struct ebt_entry *point; @@ -926,7 +926,9 @@ static int translate_table(struct net *net, const char *name, return -ENOMEM; for_each_possible_cpu(i) { newinfo->chainstack[i] = - vmalloc(array_size(udc_cnt, sizeof(*(newinfo->chainstack[0])))); + vmalloc_node(array_size(udc_cnt, + sizeof(*(newinfo->chainstack[0]))), + cpu_to_node(i)); if (!newinfo->chainstack[i]) { while (i) vfree(newinfo->chainstack[--i]); diff --git a/net/can/isotp.c b/net/can/isotp.c index caaa532ece94..df6968b28bf4 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -121,7 +121,7 @@ enum { struct tpcon { int idx; int len; - u8 state; + u32 state; u8 bs; u8 sn; u8 ll_dl; @@ -848,6 +848,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); + u32 old_state = so->tx.state; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; @@ -860,45 +861,55 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return -EADDRNOTAVAIL; /* we do not support multiple buffers - for now */ - if (so->tx.state != ISOTP_IDLE || wq_has_sleeper(&so->wait)) { - if (msg->msg_flags & MSG_DONTWAIT) - return -EAGAIN; + if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE || + wq_has_sleeper(&so->wait)) { + if (msg->msg_flags & MSG_DONTWAIT) { + err = -EAGAIN; + goto err_out; + } /* wait for complete transmission of current pdu */ - wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + if (err) + goto err_out; } - if (!size || size > 
MAX_MSG_LENGTH) - return -EINVAL; + if (!size || size > MAX_MSG_LENGTH) { + err = -EINVAL; + goto err_out; + } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? */ if ((so->opt.flags & CAN_ISOTP_SF_BROADCAST) && - (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) - return -EINVAL; + (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { + err = -EINVAL; + goto err_out; + } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) - return err; + goto err_out; dev = dev_get_by_index(sock_net(sk), so->ifindex); - if (!dev) - return -ENXIO; + if (!dev) { + err = -ENXIO; + goto err_out; + } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); - return err; + goto err_out; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; - so->tx.state = ISOTP_SENDING; so->tx.len = size; so->tx.idx = 0; @@ -954,15 +965,25 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); - return err; + goto err_out; } if (wait_tx_done) { /* wait for complete transmission of current pdu */ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + + if (sk->sk_err) + return -sk->sk_err; } return size; + +err_out: + so->tx.state = old_state; + if (so->tx.state == ISOTP_IDLE) + wake_up_interruptible(&so->wait); + + return err; } static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index f6df20808f5e..16af1a7f80f6 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -330,6 +330,7 @@ int j1939_session_activate(struct j1939_session *session); void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec); void j1939_session_timers_cancel(struct j1939_session *session); +#define J1939_MIN_TP_PACKET_SIZE 9 #define J1939_MAX_TP_PACKET_SIZE (7 * 0xff) #define J1939_MAX_ETP_PACKET_SIZE (7 * 0x00ffffff) diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 08c8606cfd9c..9bc55ecb37f9 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -249,11 +249,14 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) struct j1939_priv *priv, *priv_new; int ret; - priv = j1939_priv_get_by_ndev(ndev); + spin_lock(&j1939_netdev_lock); + priv = j1939_priv_get_by_ndev_locked(ndev); if (priv) { kref_get(&priv->rx_kref); + spin_unlock(&j1939_netdev_lock); return priv; } + spin_unlock(&j1939_netdev_lock); priv = j1939_priv_create(ndev); if (!priv) @@ -269,10 +272,10 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) /* Someone was faster than us, use their priv and roll * back our's. 
*/ + kref_get(&priv_new->rx_kref); spin_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); - kref_get(&priv_new->rx_kref); return priv_new; } j1939_priv_set(ndev, priv); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index bb5c4b8979be..6c0a0ebdd024 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1237,12 +1237,11 @@ static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) session->err = -ETIME; j1939_session_deactivate(session); } else { - netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", - __func__, session); - j1939_session_list_lock(session->priv); if (session->state >= J1939_SESSION_ACTIVE && session->state < J1939_SESSION_ACTIVE_MAX) { + netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", + __func__, session); j1939_session_get(session); hrtimer_start(&session->rxtimer, ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS), @@ -1609,6 +1608,8 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, abort = J1939_XTP_ABORT_FAULT; else if (len > priv->tp_max_packet_size) abort = J1939_XTP_ABORT_RESOURCE; + else if (len < J1939_MIN_TP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; } if (abort != J1939_XTP_NO_ABORT) { @@ -1789,6 +1790,7 @@ static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, static void j1939_xtp_rx_dat_one(struct j1939_session *session, struct sk_buff *skb) { + enum j1939_xtp_abort abort = J1939_XTP_ABORT_FAULT; struct j1939_priv *priv = session->priv; struct j1939_sk_buff_cb *skcb, *se_skcb; struct sk_buff *se_skb = NULL; @@ -1803,9 +1805,11 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, skcb = j1939_skb_to_cb(skb); dat = skb->data; - if (skb->len <= 1) + if (skb->len != 8) { /* makes no sense */ + abort = J1939_XTP_ABORT_UNEXPECTED_DATA; goto out_session_cancel; + } switch (session->last_cmd) { case 0xff: @@ -1904,7 +1908,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, out_session_cancel: kfree_skb(se_skb); j1939_session_timers_cancel(session); - j1939_session_cancel(session, J1939_XTP_ABORT_FAULT); + j1939_session_cancel(session, abort); j1939_session_put(session); } diff --git a/net/core/Makefile b/net/core/Makefile index 35ced6201814..4268846f2f47 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_FAILOVER) += failover.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o +obj-$(CONFIG_OF) += of_net.o diff --git a/net/core/dev.c b/net/core/dev.c index f930329f0dc2..4e3d19a06de4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -140,7 +140,7 @@ #include <linux/if_macvlan.h> #include <linux/errqueue.h> #include <linux/hrtimer.h> -#include <linux/netfilter_ingress.h> +#include <linux/netfilter_netdev.h> #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> @@ -303,6 +303,12 @@ static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, return NULL; } +bool netdev_name_in_use(struct net *net, const char *name) +{ + return netdev_name_node_lookup(net, name); +} +EXPORT_SYMBOL(netdev_name_in_use); + int netdev_name_node_alt_create(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; @@ -1133,7 +1139,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) } snprintf(buf, IFNAMSIZ, name, i); - if (!__dev_get_by_name(net, buf)) + if (!netdev_name_in_use(net, buf)) return i; /* It is possible to run out of possible slots @@ 
-1187,7 +1193,7 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev, if (strchr(name, '%')) return dev_alloc_name_ns(net, dev, name); - else if (__dev_get_by_name(net, name)) + else if (netdev_name_in_use(net, name)) return -EEXIST; else if (dev->name != name) strlcpy(dev->name, name, IFNAMSIZ); @@ -1290,8 +1296,8 @@ rollback: old_assign_type = NET_NAME_RENAMED; goto rollback; } else { - pr_err("%s: name change rollback failed: %d\n", - dev->name, ret); + netdev_err(dev, "name change rollback failed: %d\n", + ret); } } @@ -2345,7 +2351,7 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { - pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); + netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } @@ -2356,8 +2362,8 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { - pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", - i, q); + netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", + i, q); netdev_set_prio_tc_map(dev, i, 0); } } @@ -3410,7 +3416,7 @@ EXPORT_SYMBOL(__skb_gso_segment); #ifdef CONFIG_BUG static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { - pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); + netdev_err(dev, "hw csum failure\n"); skb_dump(KERN_ERR, skb, true); dump_stack(); } @@ -3920,6 +3926,7 @@ EXPORT_SYMBOL(dev_loopback_xmit); static struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { +#ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct tcf_result cl_res; @@ -3955,6 +3962,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) default: break; } +#endif /* CONFIG_NET_CLS_ACT */ return skb; } @@ -4148,13 +4156,20 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT skb->tc_at_ingress = 0; -# ifdef CONFIG_NET_EGRESS +#endif +#ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { + if (nf_hook_egress_active()) { + skb = nf_hook_egress(skb, &rc, dev); + if (!skb) + goto out; + } + nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; + nf_skip_egress(skb, false); } -# endif #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. 
@@ -5296,6 +5311,7 @@ skip_taps: if (static_branch_unlikely(&ingress_needed_key)) { bool another = false; + nf_skip_egress(skb, true); skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another); if (another) @@ -5303,6 +5319,7 @@ skip_taps: if (!skb) goto out; + nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } @@ -5839,7 +5856,7 @@ static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int se gro_normal_list(napi); } -static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; __be16 type = skb->protocol; @@ -5868,12 +5885,11 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) if (err) { WARN_ON(&ptype->list == head); kfree_skb(skb); - return NET_RX_SUCCESS; + return; } out: gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); - return NET_RX_SUCCESS; } static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, @@ -6900,19 +6916,25 @@ EXPORT_SYMBOL(netif_napi_add); void napi_disable(struct napi_struct *n) { + unsigned long val, new; + might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); - while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) - msleep(1); - while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) - msleep(1); + do { + val = READ_ONCE(n->state); + if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { + usleep_range(20, 200); + continue; + } + + new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; + new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); + } while (cmpxchg(&n->state, val, new) != val); hrtimer_cancel(&n->timer); - clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); - clear_bit(NAPI_STATE_THREADED, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6925,12 +6947,16 @@ EXPORT_SYMBOL(napi_disable); */ void napi_enable(struct napi_struct *n) { - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - smp_mb__before_atomic(); - clear_bit(NAPI_STATE_SCHED, &n->state); - clear_bit(NAPI_STATE_NPSVC, &n->state); - if (n->dev->threaded && n->thread) - set_bit(NAPI_STATE_THREADED, &n->state); + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); + + new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); + if (n->dev->threaded && n->thread) + new |= NAPIF_STATE_THREADED; + } while (cmpxchg(&n->state, val, new) != val); } EXPORT_SYMBOL(napi_enable); @@ -6986,8 +7012,8 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) } if (unlikely(work > weight)) - pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n", - n->poll, work, weight); + netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", + n->poll, work, weight); if (likely(work < weight)) return work; @@ -8541,8 +8567,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev->flags &= ~IFF_PROMISC; else { dev->promiscuity -= inc; - pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", - dev->name); + netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. 
promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } } @@ -8612,8 +8637,7 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) dev->flags &= ~IFF_ALLMULTI; else { dev->allmulti -= inc; - pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", - dev->name); + netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } } @@ -9150,14 +9174,11 @@ int dev_get_port_parent_id(struct net_device *dev, } err = devlink_compat_switch_id_get(dev, ppid); - if (!err || err != -EOPNOTSUPP) + if (!recurse || err != -EOPNOTSUPP) return err; - if (!recurse) - return -EOPNOTSUPP; - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = dev_get_port_parent_id(lower_dev, ppid, recurse); + err = dev_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) @@ -10858,7 +10879,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; - nf_hook_ingress_init(dev); + nf_hook_netdev_init(dev); return dev; @@ -11144,7 +11165,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, * we can use it in the destination network namespace. */ err = -EEXIST; - if (__dev_get_by_name(net, dev->name)) { + if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; @@ -11497,7 +11518,7 @@ static void __net_exit default_device_exit(struct net *net) /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); - if (__dev_get_by_name(&init_net, fb_name)) + if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 8c39283c26ae..f0cb38344126 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -50,6 +50,11 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, if (addr_len > MAX_ADDR_LEN) return -EINVAL; + ha = list_first_entry(&list->list, struct netdev_hw_addr, list); + if (ha && !memcmp(addr, ha->addr, addr_len) && + (!addr_type || addr_type == ha->type)) + goto found_it; + while (*ins_point) { int diff; @@ -64,6 +69,7 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, } else if (diff > 0) { ins_point = &parent->rb_right; } else { +found_it: if (exclusive) return -EEXIST; if (global) { diff --git a/net/core/devlink.c b/net/core/devlink.c index 7d975057c2a9..3464854015a2 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -30,6 +30,63 @@ #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> +#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ + (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) + +struct devlink_dev_stats { + u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; +}; + +struct devlink { + u32 index; + struct list_head port_list; + struct list_head rate_list; + struct list_head sb_list; + struct list_head dpipe_table_list; + struct list_head resource_list; + struct list_head param_list; + struct list_head region_list; + struct list_head reporter_list; + struct mutex reporters_lock; /* protects reporter_list */ + struct devlink_dpipe_headers *dpipe_headers; + struct list_head trap_list; + struct list_head trap_group_list; + struct list_head 
trap_policer_list; + const struct devlink_ops *ops; + u64 features; + struct xarray snapshot_ids; + struct devlink_dev_stats stats; + struct device *dev; + possible_net_t _net; + /* Serializes access to devlink instance specific objects such as + * port, sb, dpipe, resource, params, region, traps and more. + */ + struct mutex lock; + u8 reload_failed:1; + refcount_t refcount; + struct completion comp; + char priv[0] __aligned(NETDEV_ALIGN); +}; + +void *devlink_priv(struct devlink *devlink) +{ + return &devlink->priv; +} +EXPORT_SYMBOL_GPL(devlink_priv); + +struct devlink *priv_to_devlink(void *priv) +{ + return container_of(priv, struct devlink, priv); +} +EXPORT_SYMBOL_GPL(priv_to_devlink); + +struct device *devlink_to_dev(const struct devlink *devlink) +{ + return devlink->dev; +} +EXPORT_SYMBOL_GPL(devlink_to_dev); + static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { { .name = "destination mac", @@ -95,6 +152,22 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); #define DEVLINK_REGISTERED XA_MARK_1 +/* devlink instances are open to the access from the user space after + * devlink_register() call. Such logical barrier allows us to have certain + * expectations related to locking. + * + * Before *_register() - we are in initialization stage and no parallel + * access possible to the devlink instance. All drivers perform that phase + * by implicitly holding device_lock. + * + * After *_register() - users and driver can access devlink instance at + * the same time. + */ +#define ASSERT_DEVLINK_REGISTERED(d) \ + WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) +#define ASSERT_DEVLINK_NOT_REGISTERED(d) \ + WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) + /* devlink_mutex * * An overall lock guarding every operation coming from userspace. 
@@ -742,6 +815,7 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) int err; WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); + WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -1040,11 +1114,15 @@ nla_put_failure: static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { + struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; @@ -1055,19 +1133,22 @@ static void devlink_port_notify(struct devlink_port *devlink_port, return; } - genlmsg_multicast_netns(&devlink_nl_family, - devlink_net(devlink_port->devlink), msg, 0, - DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { + struct devlink *devlink = devlink_rate->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; @@ -1078,9 +1159,8 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, return; } - genlmsg_multicast_netns(&devlink_nl_family, - devlink_net(devlink_rate->devlink), msg, 0, - DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, @@ -3952,9 +4032,6 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, struct net *curr_net; int err; - if (!devlink->reload_enabled) - return -EOPNOTSUPP; - memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, sizeof(remote_reload_stats)); @@ -4022,7 +4099,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) u32 actions_performed; int err; - if (!devlink_reload_supported(devlink->ops)) + if (!(devlink->features & DEVLINK_F_RELOAD)) return -EOPNOTSUPP; err = devlink_resources_validate(devlink, NULL, info); @@ -4150,6 +4227,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink, WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && cmd != DEVLINK_CMD_FLASH_UPDATE_END && cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); + WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -5070,6 +5148,11 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, + region->max_snapshots); + if (err) + goto nla_put_failure; + err = devlink_nl_region_snapshots_id_put(msg, devlink, region); if (err) goto nla_put_failure; @@ -5145,17 +5228,19 @@ static void devlink_nl_region_notify(struct devlink_region *region, struct devlink_snapshot *snapshot, enum devlink_command cmd) { + struct devlink *devlink = region->devlink; struct sk_buff *msg; WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = 
devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); if (IS_ERR(msg)) return; - genlmsg_multicast_netns(&devlink_nl_family, - devlink_net(region->devlink), msg, 0, - DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } /** @@ -6920,10 +7005,12 @@ genlmsg_cancel: static void devlink_recover_notify(struct devlink_health_reporter *reporter, enum devlink_command cmd) { + struct devlink *devlink = reporter->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -6935,9 +7022,8 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, return; } - genlmsg_multicast_netns(&devlink_nl_family, - devlink_net(reporter->devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } void @@ -8897,6 +8983,25 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops) } /** + * devlink_set_features - Set devlink supported features + * + * @devlink: devlink + * @features: devlink support features + * + * This interface allows us to set reload ops separatelly from + * the devlink_alloc. + */ +void devlink_set_features(struct devlink *devlink, u64 features) +{ + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + + WARN_ON(features & DEVLINK_F_RELOAD && + !devlink_reload_supported(devlink->ops)); + devlink->features = features; +} +EXPORT_SYMBOL_GPL(devlink_set_features); + +/** * devlink_alloc_ns - Allocate new devlink instance resources * in specific namespace * @@ -8955,6 +9060,84 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, } EXPORT_SYMBOL_GPL(devlink_alloc_ns); +static void +devlink_trap_policer_notify(struct devlink *devlink, + const struct devlink_trap_policer_item *policer_item, + enum devlink_command cmd); +static void +devlink_trap_group_notify(struct devlink *devlink, + const struct devlink_trap_group_item *group_item, + enum devlink_command cmd); +static void devlink_trap_notify(struct devlink *devlink, + const struct devlink_trap_item *trap_item, + enum devlink_command cmd); + +static void devlink_notify_register(struct devlink *devlink) +{ + struct devlink_trap_policer_item *policer_item; + struct devlink_trap_group_item *group_item; + struct devlink_trap_item *trap_item; + struct devlink_port *devlink_port; + struct devlink_rate *rate_node; + struct devlink_region *region; + + devlink_notify(devlink, DEVLINK_CMD_NEW); + list_for_each_entry(devlink_port, &devlink->port_list, list) + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW); + + list_for_each_entry(group_item, &devlink->trap_group_list, list) + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW); + + list_for_each_entry(trap_item, &devlink->trap_list, list) + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); + + list_for_each_entry(rate_node, &devlink->rate_list, list) + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + + list_for_each_entry(region, &devlink->region_list, list) + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); + + devlink_params_publish(devlink); +} + +static void 
devlink_notify_unregister(struct devlink *devlink) +{ + struct devlink_trap_policer_item *policer_item; + struct devlink_trap_group_item *group_item; + struct devlink_trap_item *trap_item; + struct devlink_port *devlink_port; + struct devlink_rate *rate_node; + struct devlink_region *region; + + devlink_params_unpublish(devlink); + + list_for_each_entry_reverse(region, &devlink->region_list, list) + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); + + list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); + + list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); + + list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_DEL); + list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, + list) + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_DEL); + + list_for_each_entry_reverse(devlink_port, &devlink->port_list, list) + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); + devlink_notify(devlink, DEVLINK_CMD_DEL); +} + /** * devlink_register - Register devlink instance * @@ -8962,9 +9145,12 @@ EXPORT_SYMBOL_GPL(devlink_alloc_ns); */ void devlink_register(struct devlink *devlink) { + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + /* Make sure that we are in .probe() routine */ + mutex_lock(&devlink_mutex); xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - devlink_notify(devlink, DEVLINK_CMD_NEW); + devlink_notify_register(devlink); mutex_unlock(&devlink_mutex); } EXPORT_SYMBOL_GPL(devlink_register); @@ -8976,60 +9162,28 @@ EXPORT_SYMBOL_GPL(devlink_register); */ void devlink_unregister(struct devlink *devlink) { + ASSERT_DEVLINK_REGISTERED(devlink); + /* Make sure that we are in .remove() routine */ + devlink_put(devlink); wait_for_completion(&devlink->comp); mutex_lock(&devlink_mutex); - WARN_ON(devlink_reload_supported(devlink->ops) && - devlink->reload_enabled); - devlink_notify(devlink, DEVLINK_CMD_DEL); + devlink_notify_unregister(devlink); xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); mutex_unlock(&devlink_mutex); } EXPORT_SYMBOL_GPL(devlink_unregister); /** - * devlink_reload_enable - Enable reload of devlink instance - * - * @devlink: devlink - * - * Should be called at end of device initialization - * process when reload operation is supported. - */ -void devlink_reload_enable(struct devlink *devlink) -{ - mutex_lock(&devlink_mutex); - devlink->reload_enabled = true; - mutex_unlock(&devlink_mutex); -} -EXPORT_SYMBOL_GPL(devlink_reload_enable); - -/** - * devlink_reload_disable - Disable reload of devlink instance - * - * @devlink: devlink - * - * Should be called at the beginning of device cleanup - * process when reload operation is supported. - */ -void devlink_reload_disable(struct devlink *devlink) -{ - mutex_lock(&devlink_mutex); - /* Mutex is taken which ensures that no reload operation is in - * progress while setting up forbidded flag. 
- */ - devlink->reload_enabled = false; - mutex_unlock(&devlink_mutex); -} -EXPORT_SYMBOL_GPL(devlink_reload_disable); - -/** * devlink_free - Free devlink instance resources * * @devlink: devlink */ void devlink_free(struct devlink *devlink) { + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + mutex_destroy(&devlink->reporters_lock); mutex_destroy(&devlink->lock); WARN_ON(!list_empty(&devlink->trap_policer_list)); @@ -10086,6 +10240,9 @@ void devlink_params_publish(struct devlink *devlink) { struct devlink_param_item *param_item; + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + list_for_each_entry(param_item, &devlink->param_list, list) { if (param_item->published) continue; @@ -10118,54 +10275,25 @@ void devlink_params_unpublish(struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_params_unpublish); /** - * devlink_port_params_register - register port configuration parameters - * - * @devlink_port: devlink port - * @params: configuration parameters array - * @params_count: number of parameters provided + * devlink_param_driverinit_value_get - get configuration parameter + * value for driver initializing * - * Register the configuration parameters supported by the port. - */ -int devlink_port_params_register(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count) -{ - return __devlink_params_register(devlink_port->devlink, - devlink_port->index, - &devlink_port->param_list, params, - params_count, - DEVLINK_CMD_PORT_PARAM_NEW, - DEVLINK_CMD_PORT_PARAM_DEL); -} -EXPORT_SYMBOL_GPL(devlink_port_params_register); - -/** - * devlink_port_params_unregister - unregister port configuration - * parameters + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter in driverinit configuration mode * - * @devlink_port: devlink port - * @params: configuration parameters array - * @params_count: number of parameters provided + * This function should be used by the driver to get driverinit + * configuration for initialization after reload command. 
*/ -void devlink_port_params_unregister(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count) -{ - return __devlink_params_unregister(devlink_port->devlink, - devlink_port->index, - &devlink_port->param_list, - params, params_count, - DEVLINK_CMD_PORT_PARAM_DEL); -} -EXPORT_SYMBOL_GPL(devlink_port_params_unregister); - -static int -__devlink_param_driverinit_value_get(struct list_head *param_list, u32 param_id, - union devlink_param_value *init_val) +int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val) { struct devlink_param_item *param_item; - param_item = devlink_param_find_by_id(param_list, param_id); + if (!devlink_reload_supported(devlink->ops)) + return -EOPNOTSUPP; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); if (!param_item) return -EINVAL; @@ -10181,17 +10309,26 @@ __devlink_param_driverinit_value_get(struct list_head *param_list, u32 param_id, return 0; } +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); -static int -__devlink_param_driverinit_value_set(struct devlink *devlink, - unsigned int port_index, - struct list_head *param_list, u32 param_id, - union devlink_param_value init_val, - enum devlink_command cmd) +/** + * devlink_param_driverinit_value_set - set value of configuration + * parameter for driverinit + * configuration mode + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter to set for driverinit configuration mode + * + * This function should be used by the driver to set driverinit + * configuration mode default value. + */ +int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) { struct devlink_param_item *param_item; - param_item = devlink_param_find_by_id(param_list, param_id); + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); if (!param_item) return -EINVAL; @@ -10205,52 +10342,9 @@ __devlink_param_driverinit_value_set(struct devlink *devlink, param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; - devlink_param_notify(devlink, port_index, param_item, cmd); + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); return 0; } - -/** - * devlink_param_driverinit_value_get - get configuration parameter - * value for driver initializing - * - * @devlink: devlink - * @param_id: parameter ID - * @init_val: value of parameter in driverinit configuration mode - * - * This function should be used by the driver to get driverinit - * configuration for initialization after reload command. - */ -int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, - union devlink_param_value *init_val) -{ - if (!devlink_reload_supported(devlink->ops)) - return -EOPNOTSUPP; - - return __devlink_param_driverinit_value_get(&devlink->param_list, - param_id, init_val); -} -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); - -/** - * devlink_param_driverinit_value_set - set value of configuration - * parameter for driverinit - * configuration mode - * - * @devlink: devlink - * @param_id: parameter ID - * @init_val: value of parameter to set for driverinit configuration mode - * - * This function should be used by the driver to set driverinit - * configuration mode default value. 
- */ -int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, - union devlink_param_value init_val) -{ - return __devlink_param_driverinit_value_set(devlink, 0, - &devlink->param_list, - param_id, init_val, - DEVLINK_CMD_PARAM_NEW); -} EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); /** @@ -10694,6 +10788,8 @@ devlink_trap_group_notify(struct devlink *devlink, WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && cmd != DEVLINK_CMD_TRAP_GROUP_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -10735,6 +10831,8 @@ static void devlink_trap_notify(struct devlink *devlink, WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && cmd != DEVLINK_CMD_TRAP_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -11116,6 +11214,8 @@ devlink_trap_policer_notify(struct devlink *devlink, WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && cmd != DEVLINK_CMD_TRAP_POLICER_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -11284,6 +11384,24 @@ free_msg: nlmsg_free(msg); } +static struct devlink_port *netdev_to_devlink_port(struct net_device *dev) +{ + if (!dev->netdev_ops->ndo_get_devlink_port) + return NULL; + + return dev->netdev_ops->ndo_get_devlink_port(dev); +} + +static struct devlink *netdev_to_devlink(struct net_device *dev) +{ + struct devlink_port *devlink_port = netdev_to_devlink_port(dev); + + if (!devlink_port) + return NULL; + + return devlink_port->devlink; +} + void devlink_compat_running_version(struct net_device *dev, char *buf, size_t len) { @@ -11393,7 +11511,7 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) if (!net_eq(devlink_net(devlink), net)) goto retry; - WARN_ON(!devlink_reload_supported(devlink->ops)); + WARN_ON(!(devlink->features & DEVLINK_F_RELOAD)); err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_LIMIT_UNSPEC, diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index bac0184cf3de..7d0a9f84aaf7 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1196,9 +1196,8 @@ proto_again: break; } - proto = hdr->proto; nhoff += PPPOE_SES_HLEN; - switch (proto) { + switch (hdr->proto) { case htons(PPP_IP): proto = htons(ETH_P_IP); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 8e582e29a41e..4fcbdd71c59f 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -40,10 +40,10 @@ */ struct net_rate_estimator { - struct gnet_stats_basic_packed *bstats; + struct gnet_stats_basic_sync *bstats; spinlock_t *stats_lock; - seqcount_t *running; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + bool running; + struct gnet_stats_basic_sync __percpu *cpu_bstats; u8 ewma_log; u8 intvl_log; /* period : (250ms << intvl_log) */ @@ -60,13 +60,13 @@ struct net_rate_estimator { }; static void est_fetch_counters(struct net_rate_estimator *e, - struct gnet_stats_basic_packed *b) + struct gnet_stats_basic_sync *b) { - memset(b, 0, sizeof(*b)); + gnet_stats_basic_sync_init(b); if (e->stats_lock) spin_lock(e->stats_lock); - __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats); + gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); if (e->stats_lock) spin_unlock(e->stats_lock); @@ -76,14 +76,18 @@ static void 
est_fetch_counters(struct net_rate_estimator *e, static void est_timer(struct timer_list *t) { struct net_rate_estimator *est = from_timer(est, t, timer); - struct gnet_stats_basic_packed b; + struct gnet_stats_basic_sync b; + u64 b_bytes, b_packets; u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + b_bytes = u64_stats_read(&b.bytes); + b_packets = u64_stats_read(&b.packets); + + brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (b_packets - est->last_packets) << (10 - est->intvl_log); rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); @@ -91,8 +95,8 @@ static void est_timer(struct timer_list *t) est->avpps += rate; write_seqcount_end(&est->seq); - est->last_bytes = b.bytes; - est->last_packets = b.packets; + est->last_bytes = b_bytes; + est->last_packets = b_packets; est->next_jiffies += ((HZ/4) << est->intvl_log); @@ -109,7 +113,9 @@ static void est_timer(struct timer_list *t) * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path - * @running: qdisc running seqcount + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @bstats_cpu is NULL * @opt: rate estimator configuration TLV * * Creates a new rate estimator with &bstats as source and &rate_est @@ -121,16 +127,16 @@ static void est_timer(struct timer_list *t) * Returns 0 on success or a negative error code. * */ -int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_new_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, + bool running, struct nlattr *opt) { struct gnet_estimator *parm = nla_data(opt); struct net_rate_estimator *old, *est; - struct gnet_stats_basic_packed b; + struct gnet_stats_basic_sync b; int intvl_log; if (nla_len(opt) < sizeof(*parm)) @@ -164,8 +170,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, est_fetch_counters(est, &b); if (lock) local_bh_enable(); - est->last_bytes = b.bytes; - est->last_packets = b.packets; + est->last_bytes = u64_stats_read(&b.bytes); + est->last_packets = u64_stats_read(&b.packets); if (lock) spin_lock_bh(lock); @@ -214,7 +220,9 @@ EXPORT_SYMBOL(gen_kill_estimator); * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path - * @running: qdisc running seqcount (might be NULL) + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @cpu_bstats is NULL * @opt: rate estimator configuration TLV * * Replaces the configuration of a rate estimator by calling @@ -222,11 +230,11 @@ EXPORT_SYMBOL(gen_kill_estimator); * * Returns 0 on success or a negative error code. 
*/ -int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt) + bool running, struct nlattr *opt) { return gen_new_estimator(bstats, cpu_bstats, rate_est, lock, running, opt); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index e491b083b348..a10335b4ba2d 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -18,7 +18,7 @@ #include <linux/gen_stats.h> #include <net/netlink.h> #include <net/gen_stats.h> - +#include <net/sch_generic.h> static inline int gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) @@ -114,63 +114,112 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, } EXPORT_SYMBOL(gnet_stats_start_copy); -static void -__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu) +/* Must not be inlined, due to u64_stats seqcount_t lockdep key */ +void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) { + u64_stats_set(&b->bytes, 0); + u64_stats_set(&b->packets, 0); + u64_stats_init(&b->syncp); +} +EXPORT_SYMBOL(gnet_stats_basic_sync_init); + +static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu) +{ + u64 t_bytes = 0, t_packets = 0; int i; for_each_possible_cpu(i) { - struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); unsigned int start; u64 bytes, packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); - bytes = bcpu->bstats.bytes; - packets = bcpu->bstats.packets; + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); - bstats->bytes += bytes; - bstats->packets += packets; + t_bytes += bytes; + t_packets += packets; + } + _bstats_update(bstats, t_bytes, t_packets); +} + +void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) +{ + unsigned int start; + u64 bytes = 0; + u64 packets = 0; + + WARN_ON_ONCE((cpu || running) && in_hardirq()); + + if (cpu) { + gnet_stats_add_basic_cpu(bstats, cpu); + return; } + do { + if (running) + start = u64_stats_fetch_begin_irq(&b->syncp); + bytes = u64_stats_read(&b->bytes); + packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + + _bstats_update(bstats, bytes, packets); } +EXPORT_SYMBOL(gnet_stats_add_basic); -void -__gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) { - unsigned int seq; + unsigned int start; if (cpu) { - __gnet_stats_copy_basic_cpu(bstats, cpu); + u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); 
+ } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + t_bytes += bytes; + t_packets += packets; + } + *ret_bytes = t_bytes; + *ret_packets = t_packets; return; } do { if (running) - seq = read_seqcount_begin(running); - bstats->bytes = b->bytes; - bstats->packets = b->packets; - } while (running && read_seqcount_retry(running, seq)); + start = u64_stats_fetch_begin_irq(&b->syncp); + *ret_bytes = u64_stats_read(&b->bytes); + *ret_packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); } -EXPORT_SYMBOL(__gnet_stats_copy_basic); static int -___gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b, - int type) +___gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + int type, bool running) { - struct gnet_stats_basic_packed bstats = {0}; + u64 bstats_bytes, bstats_packets; - __gnet_stats_copy_basic(running, &bstats, cpu, b); + gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); if (d->compat_tc_stats && type == TCA_STATS_BASIC) { - d->tc_stats.bytes = bstats.bytes; - d->tc_stats.packets = bstats.packets; + d->tc_stats.bytes = bstats_bytes; + d->tc_stats.packets = bstats_packets; } if (d->tail) { @@ -178,24 +227,28 @@ ___gnet_stats_copy_basic(const seqcount_t *running, int res; memset(&sb, 0, sizeof(sb)); - sb.bytes = bstats.bytes; - sb.packets = bstats.packets; + sb.bytes = bstats_bytes; + sb.packets = bstats_packets; res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); - if (res < 0 || sb.packets == bstats.packets) + if (res < 0 || sb.packets == bstats_packets) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, - sizeof(bstats.packets), TCA_STATS_PAD); + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, + sizeof(bstats_packets), TCA_STATS_PAD); } return 0; } /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV - * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -204,22 +257,25 @@ ___gnet_stats_copy_basic(const seqcount_t *running, * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) { - return ___gnet_stats_copy_basic(running, d, cpu, b, - TCA_STATS_BASIC); + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); } EXPORT_SYMBOL(gnet_stats_copy_basic); /** * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV - * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. 
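[Editor's note] With the seqcount argument gone, struct gnet_stats_basic_sync counters are protected by the embedded u64_stats_sync. A minimal sketch of the writer side that pairs with the reader loops above — roughly what the _bstats_update() helper referenced here does, assuming "bstats" is a struct gnet_stats_basic_sync owned by the caller:

	u64_stats_update_begin(&bstats->syncp);
	u64_stats_add(&bstats->bytes, skb->len);
	u64_stats_inc(&bstats->packets);
	u64_stats_update_end(&bstats->syncp);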
+ * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -228,13 +284,12 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic_hw(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic_hw(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) { - return ___gnet_stats_copy_basic(running, d, cpu, b, - TCA_STATS_BASIC_HW); + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); } EXPORT_SYMBOL(gnet_stats_copy_basic_hw); @@ -282,16 +337,15 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } EXPORT_SYMBOL(gnet_stats_copy_rate_est); -static void -__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *q) +static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) { int i; for_each_possible_cpu(i) { const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); - qstats->qlen = 0; + qstats->qlen += qcpu->backlog; qstats->backlog += qcpu->backlog; qstats->drops += qcpu->drops; qstats->requeues += qcpu->requeues; @@ -299,24 +353,21 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void gnet_stats_add_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q) { if (cpu) { - __gnet_stats_copy_queue_cpu(qstats, cpu); + gnet_stats_add_queue_cpu(qstats, cpu); } else { - qstats->qlen = q->qlen; - qstats->backlog = q->backlog; - qstats->drops = q->drops; - qstats->requeues = q->requeues; - qstats->overlimits = q->overlimits; + qstats->qlen += q->qlen; + qstats->backlog += q->backlog; + qstats->drops += q->drops; + qstats->requeues += q->requeues; + qstats->overlimits += q->overlimits; } - - qstats->qlen = qlen; } -EXPORT_SYMBOL(__gnet_stats_copy_queue); +EXPORT_SYMBOL(gnet_stats_add_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV @@ -339,7 +390,8 @@ gnet_stats_copy_queue(struct gnet_dump *d, { struct gnet_stats_queue qstats = {0}; - __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); + gnet_stats_add_queue(&qstats, cpu_q, q); + qstats.qlen = qlen; if (d->compat_tc_stats) { d->tc_stats.drops = qstats.drops; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2d5bc3a75fae..47931c8be04b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -122,6 +122,8 @@ static void neigh_mark_dead(struct neighbour *n) list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } + if (!list_empty(&n->managed_list)) + list_del_init(&n->managed_list); } static void neigh_update_gc_list(struct neighbour *n) @@ -130,7 +132,6 @@ static void neigh_update_gc_list(struct neighbour *n) write_lock_bh(&n->tbl->lock); write_lock(&n->lock); - if (n->dead) goto out; @@ -149,32 +150,59 @@ static void neigh_update_gc_list(struct neighbour *n) list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } +out: + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); +} +static void neigh_update_managed_list(struct neighbour *n) +{ + bool on_managed_list, 
add_to_managed; + + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + if (n->dead) + goto out; + + add_to_managed = n->flags & NTF_MANAGED; + on_managed_list = !list_empty(&n->managed_list); + + if (!add_to_managed && on_managed_list) + list_del_init(&n->managed_list); + else if (add_to_managed && !on_managed_list) + list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } -static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags, - int *notify) +static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, + bool *gc_update, bool *managed_update) { - bool rc = false; - u8 ndm_flags; + u32 ndm_flags, old_flags = neigh->flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) - return rc; + return; + + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; - ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; - if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; - rc = true; *notify = 1; + *gc_update = true; + } + if ((old_flags ^ ndm_flags) & NTF_MANAGED) { + if (ndm_flags & NTF_MANAGED) + neigh->flags |= NTF_MANAGED; + else + neigh->flags &= ~NTF_MANAGED; + *notify = 1; + *managed_update = true; } - - return rc; } static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, @@ -379,7 +407,7 @@ EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, - bool exempt_from_gc) + u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; @@ -412,6 +440,7 @@ do_alloc: n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; + n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); @@ -421,6 +450,7 @@ do_alloc: refcount_set(&n->refcnt, 1); n->dead = 1; INIT_LIST_HEAD(&n->gc_list); + INIT_LIST_HEAD(&n->managed_list); atomic_inc(&tbl->entries); out: @@ -575,19 +605,18 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -static struct neighbour *___neigh_create(struct neigh_table *tbl, - const void *pkey, - struct net_device *dev, - bool exempt_from_gc, bool want_ref) +static struct neighbour * +___neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, u32 flags, + bool exempt_from_gc, bool want_ref) { - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); - u32 hash_val; - unsigned int key_len = tbl->key_len; - int error; + u32 hash_val, key_len = tbl->key_len; + struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; + int error; + n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); - if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; @@ -650,7 +679,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, n->dead = 0; if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); - + if (n->flags & NTF_MANAGED) + list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, @@ -674,7 +704,7 @@ out_neigh_release: struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool 
want_ref) { - return ___neigh_create(tbl, pkey, dev, false, want_ref); + return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); } EXPORT_SYMBOL(__neigh_create); @@ -1205,8 +1235,6 @@ static void neigh_update_hhs(struct neighbour *neigh) } } - - /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. @@ -1217,7 +1245,8 @@ static void neigh_update_hhs(struct neighbour *neigh) lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. - + NEIGH_UPDATE_F_USE means that the entry is user triggered. + NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as @@ -1225,17 +1254,15 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. */ - static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { - bool ext_learn_change = false; - u8 old; - int err; - int notify = 0; - struct net_device *dev; + bool gc_update = false, managed_update = false; int update_isrouter = 0; + struct net_device *dev; + int err, notify = 0; + u8 old; trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); @@ -1254,7 +1281,13 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); + neigh_update_flags(neigh, flags, ¬ify, &gc_update, &managed_update); + if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { + new = old & ~NUD_PERMANENT; + neigh->nud_state = new; + err = 0; + goto out; + } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1399,15 +1432,13 @@ out: if (update_isrouter) neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); - - if (((new ^ old) & NUD_PERMANENT) || ext_learn_change) + if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); - + if (managed_update) + neigh_update_managed_list(neigh); if (notify) neigh_update_notify(neigh, nlmsg_pid); - trace_neigh_update_done(neigh, err); - return err; } @@ -1533,6 +1564,20 @@ int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) } EXPORT_SYMBOL(neigh_direct_output); +static void neigh_managed_work(struct work_struct *work) +{ + struct neigh_table *tbl = container_of(work, struct neigh_table, + managed_work.work); + struct neighbour *neigh; + + write_lock_bh(&tbl->lock); + list_for_each_entry(neigh, &tbl->managed_list, managed_list) + neigh_event_send(neigh, NULL); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, + NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME)); + write_unlock_bh(&tbl->lock); +} + static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); @@ -1679,6 +1724,8 @@ void neigh_table_init(int index, struct neigh_table *tbl) INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->gc_list); + INIT_LIST_HEAD(&tbl->managed_list); + list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1710,9 +1757,13 @@ void neigh_table_init(int index, struct neigh_table *tbl) WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); + INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, 
&tbl->gc_work, tbl->parms.reachable_time); + INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); + timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); @@ -1783,6 +1834,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; @@ -1855,7 +1907,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_OVERRIDE_ISROUTER; + NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; @@ -1864,6 +1916,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct neighbour *neigh; void *dst, *lladdr; u8 protocol = 0; + u32 ndm_flags; int err; ASSERT_RTNL(); @@ -1879,6 +1932,15 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } ndm = nlmsg_data(nlh); + ndm_flags = ndm->ndm_flags; + if (tb[NDA_FLAGS_EXT]) { + u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); + + BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < + (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + + hweight32(NTF_EXT_MASK))); + ndm_flags |= (ext << NTF_EXT_SHIFT); + } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { @@ -1906,14 +1968,18 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); - - if (ndm->ndm_flags & NTF_PROXY) { + if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; + if (ndm_flags & NTF_MANAGED) { + NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); + goto out; + } + err = -ENOBUFS; pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { - pn->flags = ndm->ndm_flags; + pn->flags = ndm_flags; if (protocol) pn->protocol = protocol; err = 0; @@ -1933,16 +1999,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { - bool exempt_from_gc; + bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; + bool exempt_from_gc = ndm_permanent || + ndm_flags & NTF_EXT_LEARNED; if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } + if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { + NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); + err = -EINVAL; + goto out; + } - exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || - ndm->ndm_flags & NTF_EXT_LEARNED; - neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true); + neigh = ___neigh_create(tbl, dst, dev, + ndm_flags & + (NTF_EXT_LEARNED | NTF_MANAGED), + exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; @@ -1961,22 +2035,22 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (protocol) neigh->protocol = protocol; - - if (ndm->ndm_flags & NTF_EXT_LEARNED) + if (ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; - - if (ndm->ndm_flags & NTF_ROUTER) + if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; + if (ndm_flags & NTF_MANAGED) + flags |= NEIGH_UPDATE_F_MANAGED; + if (ndm_flags & NTF_USE) + flags |= NEIGH_UPDATE_F_USE; - if (ndm->ndm_flags & NTF_USE) { + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + 
NETLINK_CB(skb).portid, extack); + if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) { neigh_event_send(neigh, NULL); err = 0; - } else - err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid, extack); - + } neigh_release(neigh); - out: return err; } @@ -2427,6 +2501,7 @@ out: static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { + u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; @@ -2436,11 +2511,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (nlh == NULL) return -EMSGSIZE; + neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; + neigh_flags = neigh->flags & NTF_OLD_MASK; + ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = neigh->flags; + ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; @@ -2471,6 +2549,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -2484,6 +2564,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { + u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; @@ -2491,11 +2572,14 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nlh == NULL) return -EMSGSIZE; + neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; + neigh_flags = pn->flags & NTF_OLD_MASK; + ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = pn->flags | NTF_PROXY; + ndm->ndm_flags = neigh_flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? 
pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; @@ -2505,6 +2589,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -2820,6 +2906,7 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } @@ -2848,6 +2935,7 @@ static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index eab5fc88a002..d8b9dbabd4a4 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -77,8 +77,8 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - seq_printf(seq, "%9s: %16llu %12llu %4llu %6llu %4llu %5llu %10llu %9llu " - "%16llu %12llu %4llu %6llu %4llu %5llu %7llu %10llu\n", + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, stats->rx_dropped + stats->rx_missed_errors, @@ -103,11 +103,11 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) static int dev_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "Interface| Receive " - " | Transmit\n" - " | bytes packets errs drop fifo frame " - "compressed multicast| bytes packets errs " - " drop fifo colls carrier compressed\n"); + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); else dev_seq_printf_stats(seq, v); return 0; @@ -259,14 +259,14 @@ static int ptype_seq_show(struct seq_file *seq, void *v) struct packet_type *pt = v; if (v == SEQ_START_TOKEN) - seq_puts(seq, "Type Device Function\n"); + seq_puts(seq, "Type Device Function\n"); else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else seq_printf(seq, "%04x", ntohs(pt->type)); - seq_printf(seq, " %-9s %ps\n", + seq_printf(seq, " %-8s %ps\n", pt->dev ? 
pt->dev->name : "", pt->func); } @@ -327,14 +327,12 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) struct netdev_hw_addr *ha; struct net_device *dev = v; - if (v == SEQ_START_TOKEN) { - seq_puts(seq, "Ifindex Interface Refcount Global_use Address\n"); + if (v == SEQ_START_TOKEN) return 0; - } netif_addr_lock_bh(dev); netdev_for_each_mc_addr(ha, dev) { - seq_printf(seq, "%-7d %-9s %-8d %-10d %*phN\n", + seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", dev->ifindex, dev->name, ha->refcount, ha->global_use, (int)dev->addr_len, ha->addr); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f6197774048b..d6e4e0b43beb 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -175,6 +175,14 @@ static int change_carrier(struct net_device *dev, unsigned long new_carrier) static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + struct net_device *netdev = to_net_dev(dev); + + /* The check is also done in change_carrier; this helps returning early + * without hitting the trylock/restart in netdev_store. + */ + if (!netdev->netdev_ops->ndo_change_carrier) + return -EOPNOTSUPP; + return netdev_store(dev, attr, buf, len, change_carrier); } @@ -196,6 +204,12 @@ static ssize_t speed_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + if (!rtnl_trylock()) return restart_syscall(); @@ -216,6 +230,12 @@ static ssize_t duplex_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + if (!rtnl_trylock()) return restart_syscall(); @@ -468,6 +488,14 @@ static ssize_t proto_down_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + struct net_device *netdev = to_net_dev(dev); + + /* The check is also done in change_proto_down; this helps returning + * early without hitting the trylock/restart in netdev_store. + */ + if (!netdev->netdev_ops->ndo_change_proto_down) + return -EOPNOTSUPP; + return netdev_store(dev, attr, buf, len, change_proto_down); } NETDEVICE_SHOW_RW(proto_down, fmt_dec); @@ -478,6 +506,12 @@ static ssize_t phys_port_id_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + /* The check is also done in dev_get_phys_port_id; this helps returning + * early without hitting the trylock/restart below. + */ + if (!netdev->netdev_ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + if (!rtnl_trylock()) return restart_syscall(); @@ -500,6 +534,13 @@ static ssize_t phys_port_name_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + /* The checks are also done in dev_get_phys_port_name; this helps + * returning early without hitting the trylock/restart below. 
+ */ + if (!netdev->netdev_ops->ndo_get_phys_port_name && + !netdev->netdev_ops->ndo_get_devlink_port) + return -EOPNOTSUPP; + if (!rtnl_trylock()) return restart_syscall(); @@ -522,6 +563,14 @@ static ssize_t phys_switch_id_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + /* The checks are also done in dev_get_phys_port_name; this helps + * returning early without hitting the trylock/restart below. This works + * because recurse is false when calling dev_get_port_parent_id. + */ + if (!netdev->netdev_ops->ndo_get_port_parent_id && + !netdev->netdev_ops->ndo_get_devlink_port) + return -EOPNOTSUPP; + if (!rtnl_trylock()) return restart_syscall(); @@ -1226,6 +1275,12 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, if (!capable(CAP_NET_ADMIN)) return -EPERM; + /* The check is also done later; this helps returning early without + * hitting the trylock/restart below. + */ + if (!dev->netdev_ops->ndo_set_tx_maxrate) + return -EOPNOTSUPP; + err = kstrtou32(buf, 10, &rate); if (err < 0) return err; @@ -1869,7 +1924,7 @@ static struct class net_class __ro_after_init = { .get_ownership = net_get_ownership, }; -#ifdef CONFIG_OF_NET +#ifdef CONFIG_OF static int of_dev_node_match(struct device *dev, const void *data) { for (; dev; dev = dev->parent) { diff --git a/net/core/of_net.c b/net/core/of_net.c new file mode 100644 index 000000000000..f1a9bf7578e7 --- /dev/null +++ b/net/core/of_net.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * OF helpers for network devices. + * + * Initially copied out of arch/powerpc/kernel/prom_parse.c + */ +#include <linux/etherdevice.h> +#include <linux/kernel.h> +#include <linux/of_net.h> +#include <linux/of_platform.h> +#include <linux/phy.h> +#include <linux/export.h> +#include <linux/device.h> +#include <linux/nvmem-consumer.h> + +/** + * of_get_phy_mode - Get phy mode for given device_node + * @np: Pointer to the given device_node + * @interface: Pointer to the result + * + * The function gets phy interface string from property 'phy-mode' or + * 'phy-connection-type'. The index in phy_modes table is set in + * interface and 0 returned. In case of error interface is set to + * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV. + */ +int of_get_phy_mode(struct device_node *np, phy_interface_t *interface) +{ + const char *pm; + int err, i; + + *interface = PHY_INTERFACE_MODE_NA; + + err = of_property_read_string(np, "phy-mode", &pm); + if (err < 0) + err = of_property_read_string(np, "phy-connection-type", &pm); + if (err < 0) + return err; + + for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) + if (!strcasecmp(pm, phy_modes(i))) { + *interface = i; + return 0; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(of_get_phy_mode); + +static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr) +{ + struct property *pp = of_find_property(np, name, NULL); + + if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) { + memcpy(addr, pp->value, ETH_ALEN); + return 0; + } + return -ENODEV; +} + +static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) +{ + struct platform_device *pdev = of_find_device_by_node(np); + struct nvmem_cell *cell; + const void *mac; + size_t len; + int ret; + + /* Try lookup by device first, there might be a nvmem_cell_lookup + * associated with a given device. 
+ */ + if (pdev) { + ret = nvmem_get_mac_address(&pdev->dev, addr); + put_device(&pdev->dev); + return ret; + } + + cell = of_nvmem_cell_get(np, "mac-address"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + mac = nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { + kfree(mac); + return -EINVAL; + } + + memcpy(addr, mac, ETH_ALEN); + kfree(mac); + + return 0; +} + +/** + * of_get_mac_address() + * @np: Caller's Device Node + * @addr: Pointer to a six-byte array for the result + * + * Search the device tree for the best MAC address to use. 'mac-address' is + * checked first, because that is supposed to contain to "most recent" MAC + * address. If that isn't set, then 'local-mac-address' is checked next, + * because that is the default address. If that isn't set, then the obsolete + * 'address' is checked, just in case we're using an old device tree. If any + * of the above isn't set, then try to get MAC address from nvmem cell named + * 'mac-address'. + * + * Note that the 'address' property is supposed to contain a virtual address of + * the register set, but some DTS files have redefined that property to be the + * MAC address. + * + * All-zero MAC addresses are rejected, because those could be properties that + * exist in the device tree, but were not set by U-Boot. For example, the + * DTS could define 'mac-address' and 'local-mac-address', with zero MAC + * addresses. Some older U-Boots only initialized 'local-mac-address'. In + * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists + * but is all zeros. + * + * Return: 0 on success and errno in case of error. +*/ +int of_get_mac_address(struct device_node *np, u8 *addr) +{ + int ret; + + if (!np) + return -ENODEV; + + ret = of_get_mac_addr(np, "mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "local-mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "address", addr); + if (!ret) + return 0; + + return of_get_mac_addr_nvmem(np, addr); +} +EXPORT_SYMBOL(of_get_mac_address); + +/** + * of_get_ethdev_address() + * @np: Caller's Device Node + * @dev: Pointer to netdevice which address will be updated + * + * Search the device tree for the best MAC address to use. + * If found set @dev->dev_addr to that address. + * + * See documentation of of_get_mac_address() for more information on how + * the best address is determined. + * + * Return: 0 on success and errno in case of error. + */ +int of_get_ethdev_address(struct device_node *np, struct net_device *dev) +{ + u8 addr[ETH_ALEN]; + int ret; + + ret = of_get_mac_address(np, addr); + if (!ret) + eth_hw_addr_set(dev, addr); + return ret; +} +EXPORT_SYMBOL(of_get_ethdev_address); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1a6978427d6c..9b60e4301a44 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -49,6 +49,12 @@ static int page_pool_init(struct page_pool *pool, * which is the XDP_TX use-case. */ if (pool->p.flags & PP_FLAG_DMA_MAP) { + /* DMA-mapping is not supported on 32-bit systems with + * 64-bit DMA mapping. 
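[Editor's note] The new net/core/of_net.c above keeps the usual probe-time calling convention. A minimal sketch of how a driver might use of_get_ethdev_address() inside its probe routine (with "np" and "ndev" in scope); the random-MAC fallback and -EPROBE_DEFER handling are common driver conventions, not something this file mandates:

	err = of_get_ethdev_address(np, ndev);
	if (err == -EPROBE_DEFER)
		return err;			/* nvmem cell not available yet */
	if (err)
		eth_hw_addr_random(ndev);	/* fall back to a random MAC */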
+ */ + if (sizeof(dma_addr_t) > sizeof(unsigned long)) + return -EOPNOTSUPP; + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; @@ -69,10 +75,6 @@ static int page_pool_init(struct page_pool *pool, */ } - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && - pool->p.flags & PP_FLAG_PAGE_FRAG) - return -EINVAL; - if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 327ca6bc6e6d..79477dcae7c2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3804,9 +3804,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - size_t if_info_size; - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags); + skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; @@ -4384,7 +4383,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; if (br_dev != netdev_master_upper_dev_get(dev) && - !(dev->priv_flags & IFF_EBRIDGE)) + !netif_is_bridge_master(dev)) continue; cops = ops; } @@ -5262,7 +5261,7 @@ nla_put_failure: static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { - size_t size = 0; + size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); diff --git a/net/core/sock.c b/net/core/sock.c index 62627e868e03..9862eefce21e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -350,7 +350,7 @@ void sk_error_report(struct sock *sk) } EXPORT_SYMBOL(sk_error_report); -static int sock_get_timeout(long timeo, void *optval, bool old_timeval) +int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; @@ -379,12 +379,11 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) *(struct __kernel_sock_timeval *)optval = tv; return sizeof(tv); } +EXPORT_SYMBOL(sock_get_timeout); -static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, - bool old_timeval) +int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, + sockptr_t optval, int optlen, bool old_timeval) { - struct __kernel_sock_timeval tv; - if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; @@ -393,8 +392,8 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; - tv.tv_sec = tv32.tv_sec; - tv.tv_usec = tv32.tv_usec; + tv->tv_sec = tv32.tv_sec; + tv->tv_usec = tv32.tv_usec; } else if (old_timeval) { struct __kernel_old_timeval old_tv; @@ -402,14 +401,28 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, return -EINVAL; if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; - tv.tv_sec = old_tv.tv_sec; - tv.tv_usec = old_tv.tv_usec; + tv->tv_sec = old_tv.tv_sec; + tv->tv_usec = old_tv.tv_usec; } else { - if (optlen < sizeof(tv)) + if (optlen < sizeof(*tv)) return -EINVAL; - if (copy_from_sockptr(&tv, optval, sizeof(tv))) + if (copy_from_sockptr(tv, optval, sizeof(*tv))) return -EFAULT; } + + return 0; +} +EXPORT_SYMBOL(sock_copy_user_timeval); + +static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, + bool old_timeval) +{ + struct __kernel_sock_timeval tv; + int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); + + if (err) + return err; + if 
(tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; @@ -947,6 +960,53 @@ void sock_set_mark(struct sock *sk, u32 val) } EXPORT_SYMBOL(sock_set_mark); +static void sock_release_reserved_memory(struct sock *sk, int bytes) +{ + /* Round down bytes to multiple of pages */ + bytes &= ~(SK_MEM_QUANTUM - 1); + + WARN_ON(bytes > sk->sk_reserved_mem); + sk->sk_reserved_mem -= bytes; + sk_mem_reclaim(sk); +} + +static int sock_reserve_memory(struct sock *sk, int bytes) +{ + long allocated; + bool charged; + int pages; + + if (!mem_cgroup_sockets_enabled || !sk->sk_memcg) + return -EOPNOTSUPP; + + if (!bytes) + return 0; + + pages = sk_mem_pages(bytes); + + /* pre-charge to memcg */ + charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!charged) + return -ENOMEM; + + /* pre-charge to forward_alloc */ + allocated = sk_memory_allocated_add(sk, pages); + /* If the system goes into memory pressure with this + * precharge, give up and return error. + */ + if (allocated > sk_prot_mem_limits(sk, 1)) { + sk_memory_allocated_sub(sk, pages); + mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); + return -ENOMEM; + } + sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT; + + sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT; + + return 0; +} + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -1367,6 +1427,23 @@ set_sndbuf: ~SOCK_BUF_LOCK_MASK); break; + case SO_RESERVE_MEM: + { + int delta; + + if (val < 0) { + ret = -EINVAL; + break; + } + + delta = val - sk->sk_reserved_mem; + if (delta < 0) + sock_release_reserved_memory(sk, -delta); + else + ret = sock_reserve_memory(sk, delta); + break; + } + default: ret = -ENOPROTOOPT; break; @@ -1376,6 +1453,16 @@ set_sndbuf: } EXPORT_SYMBOL(sock_setsockopt); +static const struct cred *sk_get_peer_cred(struct sock *sk) +{ + const struct cred *cred; + + spin_lock(&sk->sk_peer_lock); + cred = get_cred(sk->sk_peer_cred); + spin_unlock(&sk->sk_peer_lock); + + return cred; +} static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) @@ -1552,7 +1639,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); + + spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + spin_unlock(&sk->sk_peer_lock); + if (copy_to_user(optval, &peercred, len)) return -EFAULT; goto lenout; @@ -1560,20 +1651,23 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_PEERGROUPS: { + const struct cred *cred; int ret, n; - if (!sk->sk_peer_cred) + cred = sk_get_peer_cred(sk); + if (!cred) return -ENODATA; - n = sk->sk_peer_cred->group_info->ngroups; + n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); + put_cred(cred); return put_user(len, optlen) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); - ret = groups_to_user((gid_t __user *)optval, - sk->sk_peer_cred->group_info); + ret = groups_to_user((gid_t __user *)optval, cred->group_info); + put_cred(cred); if (ret) return ret; goto lenout; @@ -1733,6 +1827,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; break; + case SO_RESERVE_MEM: + v.val = sk->sk_reserved_mem; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). 
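[Editor's note] From user space, SO_RESERVE_MEM is a plain integer socket option. A minimal sketch, assuming "sock_fd" is an open socket and socket memcg accounting is enabled (the option returns -EOPNOTSUPP otherwise); the size is illustrative and is rounded to whole memory quanta by the kernel:

	int reserve = 1 << 20;	/* pre-charge roughly 1 MiB */

	if (setsockopt(sock_fd, SOL_SOCKET, SO_RESERVE_MEM,
		       &reserve, sizeof(reserve)) < 0)
		perror("SO_RESERVE_MEM");	/* e.g. ENOMEM under memory pressure */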
@@ -1935,9 +2033,10 @@ static void __sk_destruct(struct rcu_head *head) sk->sk_frag.page = NULL; } - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ + put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); + if (likely(sk->sk_net_refcnt)) put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); @@ -2045,6 +2144,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; + newsk->sk_reserved_mem = 0; atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; @@ -3145,6 +3245,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; + spin_lock_init(&sk->sk_peer_lock); + sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; @@ -3179,17 +3281,15 @@ EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { + /* The sk_lock has mutex_lock() semantics here. */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); + might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_lock.owned) __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); - /* - * The sk_lock has mutex_lock() semantics here: - */ - mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); - local_bh_enable(); + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); @@ -3212,42 +3312,37 @@ void release_sock(struct sock *sk) } EXPORT_SYMBOL(release_sock); -/** - * lock_sock_fast - fast version of lock_sock - * @sk: socket - * - * This version should be used for very small section, where process wont block - * return false if fast path is taken: - * - * sk_lock.slock locked, owned = 0, BH disabled - * - * return true if slow path is taken: - * - * sk_lock.slock unlocked, owned = 1, BH enabled - */ -bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) +bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); - if (!sk->sk_lock.owned) + if (!sk->sk_lock.owned) { /* - * Note : We must disable BH + * Fast path return with bottom halves disabled and + * sock::sk_lock.slock held. + * + * The 'mutex' is not contended and holding + * sock::sk_lock.slock prevents all other lockers to + * proceed so the corresponding unlock_sock_fast() can + * avoid the slow path of release_sock() completely and + * just release slock. + * + * From a semantical POV this is equivalent to 'acquiring' + * the 'mutex', hence the corresponding lockdep + * mutex_release() has to happen in the fast path of + * unlock_sock_fast(). */ return false; + } __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); - /* - * The sk_lock has mutex_lock() semantics here: - */ - mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); __acquire(&sk->sk_lock.slock); - local_bh_enable(); + spin_unlock_bh(&sk->sk_lock.slock); return true; } -EXPORT_SYMBOL(lock_sock_fast); +EXPORT_SYMBOL(__lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) diff --git a/net/core/stream.c b/net/core/stream.c index 4f1d4aa5fb38..06b36c730ce8 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -195,14 +195,11 @@ void sk_stream_kill_queues(struct sock *sk) /* First the read buffer. 
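[Editor's note] The __lock_sock_fast() rework above preserves the existing caller contract; a minimal sketch of that pattern (the critical section is a placeholder):

	bool slow = lock_sock_fast(sk);

	/* short, non-sleeping critical section */

	unlock_sock_fast(sk, slow);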
*/ __skb_queue_purge(&sk->sk_receive_queue); - /* Next, the error queue. */ - __skb_queue_purge(&sk->sk_error_queue); - /* Next, the write queue. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); /* Account for returned memory. */ - sk_mem_reclaim(sk); + sk_mem_reclaim_final(sk); WARN_ON(sk->sk_wmem_queued); WARN_ON(sk->sk_forward_alloc); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index c5c1d2b8045e..5183e627468d 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -48,7 +48,7 @@ extern bool dccp_debug; extern struct inet_hashinfo dccp_hashinfo; -extern struct percpu_counter dccp_orphan_count; +DECLARE_PER_CPU(unsigned int, dccp_orphan_count); void dccp_time_wait(struct sock *sk, int state, int timeo); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index abb5c596a817..fc44dadc778b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -42,8 +42,8 @@ DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); -struct percpu_counter dccp_orphan_count; -EXPORT_SYMBOL_GPL(dccp_orphan_count); +DEFINE_PER_CPU(unsigned int, dccp_orphan_count); +EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); struct inet_hashinfo dccp_hashinfo; EXPORT_SYMBOL_GPL(dccp_hashinfo); @@ -1055,7 +1055,7 @@ adjudge_to_death: bh_lock_sock(sk); WARN_ON(sock_owned_by_user(sk)); - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(dccp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) @@ -1115,13 +1115,10 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); - rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); - if (rc) - goto out_fail; inet_hashinfo_init(&dccp_hashinfo); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) - goto out_free_percpu; + goto out_fail; rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", @@ -1226,8 +1223,6 @@ out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); out_free_hashinfo2: inet_hashinfo2_free_mod(&dccp_hashinfo); -out_free_percpu: - percpu_counter_destroy(&dccp_orphan_count); out_fail: dccp_hashinfo.bhash = NULL; dccp_hashinfo.ehash = NULL; @@ -1250,7 +1245,6 @@ static void __exit dccp_fini(void) dccp_ackvec_exit(); dccp_sysctl_exit(); inet_hashinfo2_free_mod(&dccp_hashinfo); - percpu_counter_destroy(&dccp_orphan_count); } module_init(dccp_init); diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 548285539752..8cb87b5067ee 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -92,17 +92,8 @@ config NET_DSA_TAG_KSZ Say Y if you want to enable support for tagging frames for the Microchip 8795/9477/9893 families of switches. -config NET_DSA_TAG_RTL4_A - tristate "Tag driver for Realtek 4 byte protocol A tags" - help - Say Y or M if you want to enable support for tagging frames for the - Realtek switches with 4 byte protocol A tags, sich as found in - the Realtek RTL8366RB. 
- config NET_DSA_TAG_OCELOT tristate "Tag driver for Ocelot family of switches, using NPI port" - depends on MSCC_OCELOT_SWITCH_LIB || \ - (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) select PACKING help Say Y or M if you want to enable NPI tagging for the Ocelot switches @@ -114,8 +105,6 @@ config NET_DSA_TAG_OCELOT config NET_DSA_TAG_OCELOT_8021Q tristate "Tag driver for Ocelot family of switches, using VLAN" - depends on MSCC_OCELOT_SWITCH_LIB || \ - (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) help Say Y or M if you want to enable support for tagging frames with a custom VLAN-based header. Frames that require timestamping, such as @@ -130,6 +119,19 @@ config NET_DSA_TAG_QCA Say Y or M if you want to enable support for tagging frames for the Qualcomm Atheros QCA8K switches. +config NET_DSA_TAG_RTL4_A + tristate "Tag driver for Realtek 4 byte protocol A tags" + help + Say Y or M if you want to enable support for tagging frames for the + Realtek switches with 4 byte protocol A tags, sich as found in + the Realtek RTL8366RB. + +config NET_DSA_TAG_RTL8_4 + tristate "Tag driver for Realtek 8 byte protocol 4 tags" + help + Say Y or M if you want to enable support for tagging frames for Realtek + switches with 8 byte protocol 4 tags, such as the Realtek RTL8365MB-VC. + config NET_DSA_TAG_LAN9303 tristate "Tag driver for SMSC/Microchip LAN9303 family of switches" help @@ -138,7 +140,6 @@ config NET_DSA_TAG_LAN9303 config NET_DSA_TAG_SJA1105 tristate "Tag driver for NXP SJA1105 switches" - depends on NET_DSA_SJA1105 || !NET_DSA_SJA1105 select PACKING help Say Y or M if you want to enable support for tagging frames with the diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 67ea009f242c..9f75820e7c98 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -10,12 +10,13 @@ obj-$(CONFIG_NET_DSA_TAG_DSA_COMMON) += tag_dsa.o obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o -obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o +obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o +obj-$(CONFIG_NET_DSA_TAG_RTL8_4) += tag_rtl8_4.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 41f36ad8b0ec..ea5169e671ae 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -280,23 +280,22 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, } #ifdef CONFIG_PM_SLEEP -static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) +static bool dsa_port_is_initialized(const struct dsa_port *dp) { - const struct dsa_port *dp = dsa_to_port(ds, p); - return dp->type == DSA_PORT_TYPE_USER && dp->slave; } int dsa_switch_suspend(struct dsa_switch *ds) { - int i, ret = 0; + struct dsa_port *dp; + int ret = 0; /* Suspend slave network devices */ - for (i = 0; i < ds->num_ports; i++) { - if (!dsa_is_port_initialized(ds, i)) + dsa_switch_for_each_port(dp, ds) { + if (!dsa_port_is_initialized(dp)) continue; - ret = dsa_slave_suspend(dsa_to_port(ds, i)->slave); + ret = dsa_slave_suspend(dp->slave); if (ret) return ret; } @@ -310,7 +309,8 @@ EXPORT_SYMBOL_GPL(dsa_switch_suspend); int dsa_switch_resume(struct dsa_switch *ds) { - int i, 
ret = 0; + struct dsa_port *dp; + int ret = 0; if (ds->ops->resume) ret = ds->ops->resume(ds); @@ -319,11 +319,11 @@ int dsa_switch_resume(struct dsa_switch *ds) return ret; /* Resume slave network devices */ - for (i = 0; i < ds->num_ports; i++) { - if (!dsa_is_port_initialized(ds, i)) + dsa_switch_for_each_port(dp, ds) { + if (!dsa_port_is_initialized(dp)) continue; - ret = dsa_slave_resume(dsa_to_port(ds, i)->slave); + ret = dsa_slave_resume(dp->slave); if (ret) return ret; } diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 96f211f52ac3..f5270114dcb8 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -170,7 +170,7 @@ void dsa_bridge_num_put(const struct net_device *bridge_dev, int bridge_num) /* Check if the bridge is still in use, otherwise it is time * to clean it up so we can reuse this bridge_num later. */ - if (!dsa_bridge_num_find(bridge_dev)) + if (dsa_bridge_num_find(bridge_dev) < 0) clear_bit(bridge_num, &dsa_fwd_offloading_bridges); } @@ -399,11 +399,8 @@ static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst) if (!dsa_port_is_cpu(cpu_dp)) continue; - list_for_each_entry(dp, &dst->ports, list) { - /* Prefer a local CPU port */ - if (dp->ds != cpu_dp->ds) - continue; - + /* Prefer a local CPU port */ + dsa_switch_for_each_port(dp, cpu_dp->ds) { /* Prefer the first local CPU port found */ if (dp->cpu_dp) continue; @@ -429,6 +426,7 @@ static int dsa_port_setup(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; bool dsa_port_link_registered = false; + struct dsa_switch *ds = dp->ds; bool dsa_port_enabled = false; int err = 0; @@ -438,6 +436,12 @@ static int dsa_port_setup(struct dsa_port *dp) INIT_LIST_HEAD(&dp->fdbs); INIT_LIST_HEAD(&dp->mdbs); + if (ds->ops->port_setup) { + err = ds->ops->port_setup(ds, dp->index); + if (err) + return err; + } + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: dsa_port_disable(dp); @@ -480,8 +484,11 @@ static int dsa_port_setup(struct dsa_port *dp) dsa_port_disable(dp); if (err && dsa_port_link_registered) dsa_port_link_unregister_of(dp); - if (err) + if (err) { + if (ds->ops->port_teardown) + ds->ops->port_teardown(ds, dp->index); return err; + } dp->setup = true; @@ -533,11 +540,15 @@ static int dsa_port_devlink_setup(struct dsa_port *dp) static void dsa_port_teardown(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a, *tmp; if (!dp->setup) return; + if (ds->ops->port_teardown) + ds->ops->port_teardown(ds, dp->index); + devlink_port_type_clear(dlp); switch (dp->type) { @@ -581,6 +592,36 @@ static void dsa_port_devlink_teardown(struct dsa_port *dp) dp->devlink_port_setup = false; } +/* Destroy the current devlink port, and create a new one which has the UNUSED + * flavour. At this point, any call to ds->ops->port_setup has been already + * balanced out by a call to ds->ops->port_teardown, so we know that any + * devlink port regions the driver had are now unregistered. We then call its + * ds->ops->port_setup again, in order for the driver to re-create them on the + * new devlink port. + */ +static int dsa_port_reinit_as_unused(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + int err; + + dsa_port_devlink_teardown(dp); + dp->type = DSA_PORT_TYPE_UNUSED; + err = dsa_port_devlink_setup(dp); + if (err) + return err; + + if (ds->ops->port_setup) { + /* On error, leave the devlink port registered, + * dsa_switch_teardown will clean it up later. 
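[Editor's note] Several hunks in this file replace open-coded 0..ds->num_ports loops with the dsa_switch_for_each_port() family of iterators. A minimal sketch of the resulting pattern, assuming "ds" is a struct dsa_switch and "dp" a struct dsa_port pointer:

	dsa_switch_for_each_port(dp, ds) {
		if (!dsa_port_is_user(dp))
			continue;
		/* work on dp->index, dp->slave, ... */
	}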
+ */ + err = ds->ops->port_setup(ds, dp->index); + if (err) + return err; + } + + return 0; +} + static int dsa_devlink_info_get(struct devlink *dl, struct devlink_info_req *req, struct netlink_ext_ack *extack) @@ -758,16 +799,17 @@ static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) { const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; struct dsa_switch_tree *dst = ds->dst; - int port, err; + struct dsa_port *cpu_dp; + int err; if (tag_ops->proto == dst->default_proto) return 0; - for (port = 0; port < ds->num_ports; port++) { - if (!dsa_is_cpu_port(ds, port)) - continue; - - err = ds->ops->change_tag_protocol(ds, port, tag_ops->proto); + dsa_switch_for_each_cpu_port(cpu_dp, ds) { + rtnl_lock(); + err = ds->ops->change_tag_protocol(ds, cpu_dp->index, + tag_ops->proto); + rtnl_unlock(); if (err) { dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", tag_ops->name, ERR_PTR(err)); @@ -804,16 +846,13 @@ static int dsa_switch_setup(struct dsa_switch *ds) dl_priv = devlink_priv(ds->devlink); dl_priv->ds = ds; - devlink_register(ds->devlink); /* Setup devlink port instances now, so that the switch * setup() can register regions etc, against the ports */ - list_for_each_entry(dp, &ds->dst->ports, list) { - if (dp->ds == ds) { - err = dsa_port_devlink_setup(dp); - if (err) - goto unregister_devlink_ports; - } + dsa_switch_for_each_port(dp, ds) { + err = dsa_port_devlink_setup(dp); + if (err) + goto unregister_devlink_ports; } err = dsa_switch_register_notifier(ds); @@ -830,10 +869,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err) goto teardown; - devlink_params_publish(ds->devlink); - if (!ds->slave_mii_bus && ds->ops->phy_read) { - ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); + ds->slave_mii_bus = mdiobus_alloc(); if (!ds->slave_mii_bus) { err = -ENOMEM; goto teardown; @@ -843,23 +880,24 @@ static int dsa_switch_setup(struct dsa_switch *ds) err = mdiobus_register(ds->slave_mii_bus); if (err < 0) - goto teardown; + goto free_slave_mii_bus; } ds->setup = true; - + devlink_register(ds->devlink); return 0; +free_slave_mii_bus: + if (ds->slave_mii_bus && ds->ops->phy_read) + mdiobus_free(ds->slave_mii_bus); teardown: if (ds->ops->teardown) ds->ops->teardown(ds); unregister_notifier: dsa_switch_unregister_notifier(ds); unregister_devlink_ports: - list_for_each_entry(dp, &ds->dst->ports, list) - if (dp->ds == ds) - dsa_port_devlink_teardown(dp); - devlink_unregister(ds->devlink); + dsa_switch_for_each_port(dp, ds) + dsa_port_devlink_teardown(dp); devlink_free(ds->devlink); ds->devlink = NULL; return err; @@ -872,19 +910,23 @@ static void dsa_switch_teardown(struct dsa_switch *ds) if (!ds->setup) return; - if (ds->slave_mii_bus && ds->ops->phy_read) - mdiobus_unregister(ds->slave_mii_bus); + if (ds->devlink) + devlink_unregister(ds->devlink); - dsa_switch_unregister_notifier(ds); + if (ds->slave_mii_bus && ds->ops->phy_read) { + mdiobus_unregister(ds->slave_mii_bus); + mdiobus_free(ds->slave_mii_bus); + ds->slave_mii_bus = NULL; + } if (ds->ops->teardown) ds->ops->teardown(ds); + dsa_switch_unregister_notifier(ds); + if (ds->devlink) { - list_for_each_entry(dp, &ds->dst->ports, list) - if (dp->ds == ds) - dsa_port_devlink_teardown(dp); - devlink_unregister(ds->devlink); + dsa_switch_for_each_port(dp, ds) + dsa_port_devlink_teardown(dp); devlink_free(ds->devlink); ds->devlink = NULL; } @@ -933,12 +975,9 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) list_for_each_entry(dp, &dst->ports, list) { err = dsa_port_setup(dp); if (err) { - 
dsa_port_devlink_teardown(dp); - dp->type = DSA_PORT_TYPE_UNUSED; - err = dsa_port_devlink_setup(dp); + err = dsa_port_reinit_as_unused(dp); if (err) goto teardown; - continue; } } @@ -1043,6 +1082,7 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) teardown_master: dsa_tree_teardown_master(dst); teardown_switches: + dsa_tree_teardown_ports(dst); dsa_tree_teardown_switches(dst); teardown_cpu_ports: dsa_tree_teardown_cpu_ports(dst); @@ -1102,7 +1142,7 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, goto out_unlock; list_for_each_entry(dp, &dst->ports, list) { - if (!dsa_is_user_port(dp->ds, dp->index)) + if (!dsa_port_is_user(dp)) continue; if (dp->slave->flags & IFF_UP) @@ -1133,8 +1173,8 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp; - list_for_each_entry(dp, &dst->ports, list) - if (dp->ds == ds && dp->index == index) + dsa_switch_for_each_port(dp, ds) + if (dp->index == index) return dp; dp = kzalloc(sizeof(*dp), GFP_KERNEL); @@ -1319,12 +1359,15 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, for_each_available_child_of_node(ports, port) { err = of_property_read_u32(port, "reg", &reg); - if (err) + if (err) { + of_node_put(port); goto out_put_node; + } if (reg >= ds->num_ports) { dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%zu)\n", port, reg, ds->num_ports); + of_node_put(port); err = -EINVAL; goto out_put_node; } @@ -1332,8 +1375,10 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, dp = dsa_to_port(ds, reg); err = dsa_port_parse_of(dp, port); - if (err) + if (err) { + of_node_put(port); goto out_put_node; + } } out_put_node: @@ -1475,12 +1520,9 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) static void dsa_switch_release_ports(struct dsa_switch *ds) { - struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp, *next; - list_for_each_entry_safe(dp, next, &dst->ports, list) { - if (dp->ds != ds) - continue; + dsa_switch_for_each_port_safe(dp, next, ds) { list_del(&dp->list); kfree(dp); } @@ -1557,3 +1599,47 @@ void dsa_unregister_switch(struct dsa_switch *ds) mutex_unlock(&dsa2_mutex); } EXPORT_SYMBOL_GPL(dsa_unregister_switch); + +/* If the DSA master chooses to unregister its net_device on .shutdown, DSA is + * blocking that operation from completion, due to the dev_hold taken inside + * netdev_upper_dev_link. Unlink the DSA slave interfaces from being uppers of + * the DSA master, so that the system can reboot successfully. + */ +void dsa_switch_shutdown(struct dsa_switch *ds) +{ + struct net_device *master, *slave_dev; + LIST_HEAD(unregister_list); + struct dsa_port *dp; + + mutex_lock(&dsa2_mutex); + rtnl_lock(); + + dsa_switch_for_each_user_port(dp, ds) { + master = dp->cpu_dp->master; + slave_dev = dp->slave; + + netdev_upper_dev_unlink(master, slave_dev); + /* Just unlinking ourselves as uppers of the master is not + * sufficient. When the master net device unregisters, that will + * also call dev_close, which we will catch as NETDEV_GOING_DOWN + * and trigger a dev_close on our own devices (dsa_slave_close). + * In turn, that will call dev_mc_unsync on the master's net + * device. If the master is also a DSA switch port, this will + * trigger dsa_slave_set_rx_mode which will call dev_mc_sync on + * its own master.
Lockdep will complain about the fact that + * all cascaded masters have the same dsa_master_addr_list_lock_key, + * which it normally would not do if the cascaded masters would + * be in a proper upper/lower relationship, which we've just + * destroyed. + * To suppress the lockdep warnings, let's actually unregister + * the DSA slave interfaces too, to avoid the nonsensical + * multicast address list synchronization on shutdown. + */ + unregister_netdevice_queue(slave_dev, &unregister_list); + } + unregister_netdevice_many(&unregister_list); + + rtnl_unlock(); + mutex_unlock(&dsa2_mutex); +} +EXPORT_SYMBOL_GPL(dsa_switch_shutdown); diff --git a/net/dsa/port.c b/net/dsa/port.c index 616330a16d31..bf671306b560 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -515,14 +515,15 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; - int err, i; + struct dsa_port *other_dp; + int err; /* VLAN awareness was off, so the question is "can we turn it on". * We may have had 8021q uppers, those need to go. Make sure we don't * enter an inconsistent state: deny changing the VLAN awareness state * as long as we have 8021q uppers. */ - if (vlan_filtering && dsa_is_user_port(ds, dp->index)) { + if (vlan_filtering && dsa_port_is_user(dp)) { struct net_device *upper_dev, *slave = dp->slave; struct net_device *br = dp->bridge_dev; struct list_head *iter; @@ -557,10 +558,10 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, * different ports of the same switch device and one of them has a * different setting than what is being requested. */ - for (i = 0; i < ds->num_ports; i++) { + dsa_switch_for_each_port(other_dp, ds) { struct net_device *other_bridge; - other_bridge = dsa_to_port(ds, i)->bridge_dev; + other_bridge = other_dp->bridge_dev; if (!other_bridge) continue; /* If it's the same bridge, it also has same @@ -607,20 +608,16 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, return err; if (ds->vlan_filtering_is_global) { - int port; + struct dsa_port *other_dp; ds->vlan_filtering = vlan_filtering; - for (port = 0; port < ds->num_ports; port++) { - struct net_device *slave; - - if (!dsa_is_user_port(ds, port)) - continue; + dsa_switch_for_each_user_port(other_dp, ds) { + struct net_device *slave = dp->slave; /* We might be called in the unbind path, so not * all slave devices might still be registered. 
*/ - slave = dsa_to_port(ds, port)->slave; if (!slave) continue; @@ -1041,7 +1038,7 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config, struct phy_device *phydev = NULL; struct dsa_switch *ds = dp->ds; - if (dsa_is_user_port(ds, dp->index)) + if (dsa_port_is_user(dp)) phydev = dp->slave->phydev; if (!ds->ops->phylink_mac_link_down) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a2bf2d8ac65b..9d9fef668eba 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -174,7 +174,7 @@ static int dsa_slave_set_mac_address(struct net_device *dev, void *a) dev_uc_del(master, dev->dev_addr); out: - ether_addr_copy(dev->dev_addr, addr->sa_data); + eth_hw_addr_set(dev, addr->sa_data); return 0; } @@ -789,6 +789,37 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) return -EOPNOTSUPP; } +static void dsa_slave_get_eth_phy_stats(struct net_device *dev, + struct ethtool_eth_phy_stats *phy_stats) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_eth_phy_stats) + ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats); +} + +static void dsa_slave_get_eth_mac_stats(struct net_device *dev, + struct ethtool_eth_mac_stats *mac_stats) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_eth_mac_stats) + ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats); +} + +static void +dsa_slave_get_eth_ctrl_stats(struct net_device *dev, + struct ethtool_eth_ctrl_stats *ctrl_stats) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_eth_ctrl_stats) + ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats); +} + static void dsa_slave_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { @@ -1695,6 +1726,9 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_strings = dsa_slave_get_strings, .get_ethtool_stats = dsa_slave_get_ethtool_stats, .get_sset_count = dsa_slave_get_sset_count, + .get_eth_phy_stats = dsa_slave_get_eth_phy_stats, + .get_eth_mac_stats = dsa_slave_get_eth_mac_stats, + .get_eth_ctrl_stats = dsa_slave_get_eth_ctrl_stats, .set_wol = dsa_slave_set_wol, .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, @@ -1954,7 +1988,7 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; if (!is_zero_ether_addr(port->mac)) - ether_addr_copy(slave_dev->dev_addr, port->mac); + eth_hw_addr_set(slave_dev, port->mac); else eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; @@ -2334,7 +2368,7 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, dst = cpu_dp->ds->dst; list_for_each_entry(dp, &dst->ports, list) { - if (!dsa_is_user_port(dp->ds, dp->index)) + if (!dsa_port_is_user(dp)) continue; list_add(&dp->slave->close_list, &close_list); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 1c797ec8e2c2..2b1b21bde830 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -17,14 +17,11 @@ static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) { - int i; - - for (i = 0; i < ds->num_ports; ++i) { - struct dsa_port *dp = dsa_to_port(ds, i); + struct dsa_port *dp; + dsa_switch_for_each_port(dp, ds) if (dp->ageing_time && dp->ageing_time < ageing_time) ageing_time = dp->ageing_time; - } return ageing_time; } @@ -49,10 +46,10 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds, return 0; } -static bool dsa_switch_mtu_match(struct dsa_switch *ds, int 
port, - struct dsa_notifier_mtu_info *info) +static bool dsa_port_mtu_match(struct dsa_port *dp, + struct dsa_notifier_mtu_info *info) { - if (ds->index == info->sw_index && port == info->port) + if (dp->ds->index == info->sw_index && dp->index == info->port) return true; /* Do not propagate to other switches in the tree if the notifier was @@ -61,7 +58,7 @@ static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port, if (info->targeted_match) return false; - if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) return true; return false; @@ -70,14 +67,16 @@ static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port, static int dsa_switch_mtu(struct dsa_switch *ds, struct dsa_notifier_mtu_info *info) { - int port, ret; + struct dsa_port *dp; + int ret; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_mtu_match(ds, port, info)) { - ret = ds->ops->port_change_mtu(ds, port, info->mtu); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_mtu_match(dp, info)) { + ret = ds->ops->port_change_mtu(ds, dp->index, + info->mtu); if (ret) return ret; } @@ -120,7 +119,8 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, struct netlink_ext_ack extack = {0}; bool change_vlan_filtering = false; bool vlan_filtering; - int err, port; + struct dsa_port *dp; + int err; if (dst->index == info->tree_index && ds->index == info->sw_index && ds->ops->port_bridge_leave) @@ -150,10 +150,10 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, * VLAN-aware bridge. */ if (change_vlan_filtering && ds->vlan_filtering_is_global) { - for (port = 0; port < ds->num_ports; port++) { + dsa_switch_for_each_port(dp, ds) { struct net_device *bridge_dev; - bridge_dev = dsa_to_port(ds, port)->bridge_dev; + bridge_dev = dp->bridge_dev; if (bridge_dev && br_vlan_enabled(bridge_dev)) { change_vlan_filtering = false; @@ -168,7 +168,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, if (extack._msg) dev_err(ds->dev, "port %d: %s\n", info->port, extack._msg); - if (err && err != EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) return err; } @@ -179,19 +179,19 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, * DSA links) that sit between the targeted port on which the notifier was * emitted and its dedicated CPU port. 
*/ -static bool dsa_switch_host_address_match(struct dsa_switch *ds, int port, - int info_sw_index, int info_port) +static bool dsa_port_host_address_match(struct dsa_port *dp, + int info_sw_index, int info_port) { struct dsa_port *targeted_dp, *cpu_dp; struct dsa_switch *targeted_ds; - targeted_ds = dsa_switch_find(ds->dst->index, info_sw_index); + targeted_ds = dsa_switch_find(dp->ds->dst->index, info_sw_index); targeted_dp = dsa_to_port(targeted_ds, info_port); cpu_dp = targeted_dp->cpu_dp; - if (dsa_switch_is_upstream_of(ds, targeted_ds)) - return port == dsa_towards_port(ds, cpu_dp->ds->index, - cpu_dp->index); + if (dsa_switch_is_upstream_of(dp->ds, targeted_ds)) + return dp->index == dsa_towards_port(dp->ds, cpu_dp->ds->index, + cpu_dp->index); return false; } @@ -209,11 +209,12 @@ static struct dsa_mac_addr *dsa_mac_addr_find(struct list_head *addr_list, return NULL; } -static int dsa_switch_do_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +static int dsa_port_do_mdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) { - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; + int port = dp->index; int err; /* No need to bother with refcounting for user ports */ @@ -244,11 +245,12 @@ static int dsa_switch_do_mdb_add(struct dsa_switch *ds, int port, return 0; } -static int dsa_switch_do_mdb_del(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +static int dsa_port_do_mdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) { - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; + int port = dp->index; int err; /* No need to bother with refcounting for user ports */ @@ -274,11 +276,12 @@ static int dsa_switch_do_mdb_del(struct dsa_switch *ds, int port, return 0; } -static int dsa_switch_do_fdb_add(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) +static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid) { - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; + int port = dp->index; int err; /* No need to bother with refcounting for user ports */ @@ -309,11 +312,12 @@ static int dsa_switch_do_fdb_add(struct dsa_switch *ds, int port, return 0; } -static int dsa_switch_do_fdb_del(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) +static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid) { - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; + int port = dp->index; int err; /* No need to bother with refcounting for user ports */ @@ -342,17 +346,16 @@ static int dsa_switch_do_fdb_del(struct dsa_switch *ds, int port, static int dsa_switch_host_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { + struct dsa_port *dp; int err = 0; - int port; if (!ds->ops->port_fdb_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_host_address_match(ds, port, info->sw_index, - info->port)) { - err = dsa_switch_do_fdb_add(ds, port, info->addr, - info->vid); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_address_match(dp, info->sw_index, + info->port)) { + err = dsa_port_do_fdb_add(dp, info->addr, info->vid); if (err) break; } @@ -364,17 +367,16 @@ static int dsa_switch_host_fdb_add(struct dsa_switch *ds, static int 
dsa_switch_host_fdb_del(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { + struct dsa_port *dp; int err = 0; - int port; if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_host_address_match(ds, port, info->sw_index, - info->port)) { - err = dsa_switch_do_fdb_del(ds, port, info->addr, - info->vid); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_address_match(dp, info->sw_index, + info->port)) { + err = dsa_port_do_fdb_del(dp, info->addr, info->vid); if (err) break; } @@ -387,22 +389,24 @@ static int dsa_switch_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { int port = dsa_towards_port(ds, info->sw_index, info->port); + struct dsa_port *dp = dsa_to_port(ds, port); if (!ds->ops->port_fdb_add) return -EOPNOTSUPP; - return dsa_switch_do_fdb_add(ds, port, info->addr, info->vid); + return dsa_port_do_fdb_add(dp, info->addr, info->vid); } static int dsa_switch_fdb_del(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { int port = dsa_towards_port(ds, info->sw_index, info->port); + struct dsa_port *dp = dsa_to_port(ds, port); if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - return dsa_switch_do_fdb_del(ds, port, info->addr, info->vid); + return dsa_port_do_fdb_del(dp, info->addr, info->vid); } static int dsa_switch_hsr_join(struct dsa_switch *ds, @@ -468,37 +472,39 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { int port = dsa_towards_port(ds, info->sw_index, info->port); + struct dsa_port *dp = dsa_to_port(ds, port); if (!ds->ops->port_mdb_add) return -EOPNOTSUPP; - return dsa_switch_do_mdb_add(ds, port, info->mdb); + return dsa_port_do_mdb_add(dp, info->mdb); } static int dsa_switch_mdb_del(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { int port = dsa_towards_port(ds, info->sw_index, info->port); + struct dsa_port *dp = dsa_to_port(ds, port); if (!ds->ops->port_mdb_del) return -EOPNOTSUPP; - return dsa_switch_do_mdb_del(ds, port, info->mdb); + return dsa_port_do_mdb_del(dp, info->mdb); } static int dsa_switch_host_mdb_add(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { + struct dsa_port *dp; int err = 0; - int port; if (!ds->ops->port_mdb_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_host_address_match(ds, port, info->sw_index, - info->port)) { - err = dsa_switch_do_mdb_add(ds, port, info->mdb); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_address_match(dp, info->sw_index, + info->port)) { + err = dsa_port_do_mdb_add(dp, info->mdb); if (err) break; } @@ -510,16 +516,16 @@ static int dsa_switch_host_mdb_add(struct dsa_switch *ds, static int dsa_switch_host_mdb_del(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { + struct dsa_port *dp; int err = 0; - int port; if (!ds->ops->port_mdb_del) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_host_address_match(ds, port, info->sw_index, - info->port)) { - err = dsa_switch_do_mdb_del(ds, port, info->mdb); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_address_match(dp, info->sw_index, + info->port)) { + err = dsa_port_do_mdb_del(dp, info->mdb); if (err) break; } @@ -528,13 +534,13 @@ static int dsa_switch_host_mdb_del(struct dsa_switch *ds, return err; } -static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, - struct dsa_notifier_vlan_info *info) +static bool dsa_port_vlan_match(struct dsa_port *dp, + struct dsa_notifier_vlan_info *info) { - 
if (ds->index == info->sw_index && port == info->port) + if (dp->ds->index == info->sw_index && dp->index == info->port) return true; - if (dsa_is_dsa_port(ds, port)) + if (dsa_port_is_dsa(dp)) return true; return false; @@ -543,14 +549,15 @@ static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, static int dsa_switch_vlan_add(struct dsa_switch *ds, struct dsa_notifier_vlan_info *info) { - int port, err; + struct dsa_port *dp; + int err; if (!ds->ops->port_vlan_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_vlan_match(ds, port, info)) { - err = ds->ops->port_vlan_add(ds, port, info->vlan, + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_vlan_match(dp, info)) { + err = ds->ops->port_vlan_add(ds, dp->index, info->vlan, info->extack); if (err) return err; @@ -579,38 +586,34 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds, struct dsa_notifier_tag_proto_info *info) { const struct dsa_device_ops *tag_ops = info->tag_ops; - int port, err; + struct dsa_port *dp, *cpu_dp; + int err; if (!ds->ops->change_tag_protocol) return -EOPNOTSUPP; ASSERT_RTNL(); - for (port = 0; port < ds->num_ports; port++) { - if (!dsa_is_cpu_port(ds, port)) - continue; - - err = ds->ops->change_tag_protocol(ds, port, tag_ops->proto); + dsa_switch_for_each_cpu_port(cpu_dp, ds) { + err = ds->ops->change_tag_protocol(ds, cpu_dp->index, + tag_ops->proto); if (err) return err; - dsa_port_set_tag_protocol(dsa_to_port(ds, port), tag_ops); + dsa_port_set_tag_protocol(cpu_dp, tag_ops); } /* Now that changing the tag protocol can no longer fail, let's update * the remaining bits which are "duplicated for faster access", and the * bits that depend on the tagger, such as the MTU. */ - for (port = 0; port < ds->num_ports; port++) { - if (dsa_is_user_port(ds, port)) { - struct net_device *slave; + dsa_switch_for_each_user_port(dp, ds) { + struct net_device *slave = dp->slave; - slave = dsa_to_port(ds, port)->slave; - dsa_slave_setup_tagger(slave); + dsa_slave_setup_tagger(slave); - /* rtnl_mutex is held in dsa_tree_change_tag_proto */ - dsa_slave_change_mtu(slave, slave->mtu); - } + /* rtnl_mutex is held in dsa_tree_change_tag_proto */ + dsa_slave_change_mtu(slave, slave->mtu); } return 0; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index f8f7b7c34e7d..72cac2c0af7b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -6,7 +6,6 @@ * dsa_8021q_netdev_ops is registered for API compliance and not used * directly by callers. */ -#include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/dsa/8021q.h> @@ -78,22 +77,22 @@ EXPORT_SYMBOL_GPL(dsa_8021q_bridge_tx_fwd_offload_vid); /* Returns the VID to be inserted into the frame from xmit for switch steering * instructions on egress. Encodes switch ID and port ID. */ -u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port) +u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp) { - return DSA_8021Q_DIR_TX | DSA_8021Q_SWITCH_ID(ds->index) | - DSA_8021Q_PORT(port); + return DSA_8021Q_DIR_TX | DSA_8021Q_SWITCH_ID(dp->ds->index) | + DSA_8021Q_PORT(dp->index); } -EXPORT_SYMBOL_GPL(dsa_8021q_tx_vid); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_tx_vid); /* Returns the VID that will be installed as pvid for this switch port, sent as * tagged egress towards the CPU port and decoded by the rcv function. 
*/ -u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) +u16 dsa_tag_8021q_rx_vid(const struct dsa_port *dp) { - return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | - DSA_8021Q_PORT(port); + return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(dp->ds->index) | + DSA_8021Q_PORT(dp->index); } -EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_rx_vid); /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) @@ -139,12 +138,13 @@ dsa_tag_8021q_vlan_find(struct dsa_8021q_context *ctx, int port, u16 vid) return NULL; } -static int dsa_switch_do_tag_8021q_vlan_add(struct dsa_switch *ds, int port, - u16 vid, u16 flags) +static int dsa_port_do_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, + u16 flags) { - struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_8021q_context *ctx = dp->ds->tag_8021q_ctx; + struct dsa_switch *ds = dp->ds; struct dsa_tag_8021q_vlan *v; + int port = dp->index; int err; /* No need to bother with refcounting for user ports */ @@ -175,12 +175,12 @@ static int dsa_switch_do_tag_8021q_vlan_add(struct dsa_switch *ds, int port, return 0; } -static int dsa_switch_do_tag_8021q_vlan_del(struct dsa_switch *ds, int port, - u16 vid) +static int dsa_port_do_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid) { - struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_8021q_context *ctx = dp->ds->tag_8021q_ctx; + struct dsa_switch *ds = dp->ds; struct dsa_tag_8021q_vlan *v; + int port = dp->index; int err; /* No need to bother with refcounting for user ports */ @@ -207,14 +207,16 @@ static int dsa_switch_do_tag_8021q_vlan_del(struct dsa_switch *ds, int port, } static bool -dsa_switch_tag_8021q_vlan_match(struct dsa_switch *ds, int port, - struct dsa_notifier_tag_8021q_vlan_info *info) +dsa_port_tag_8021q_vlan_match(struct dsa_port *dp, + struct dsa_notifier_tag_8021q_vlan_info *info) { - if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + struct dsa_switch *ds = dp->ds; + + if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) return true; if (ds->dst->index == info->tree_index && ds->index == info->sw_index) - return port == info->port; + return dp->index == info->port; return false; } @@ -222,7 +224,8 @@ dsa_switch_tag_8021q_vlan_match(struct dsa_switch *ds, int port, int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, struct dsa_notifier_tag_8021q_vlan_info *info) { - int port, err; + struct dsa_port *dp; + int err; /* Since we use dsa_broadcast(), there might be other switches in other * trees which don't support tag_8021q, so don't return an error. 
@@ -232,21 +235,20 @@ int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, if (!ds->ops->tag_8021q_vlan_add || !ds->tag_8021q_ctx) return 0; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_tag_8021q_vlan_match(ds, port, info)) { + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_tag_8021q_vlan_match(dp, info)) { u16 flags = 0; - if (dsa_is_user_port(ds, port)) + if (dsa_port_is_user(dp)) flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (vid_is_dsa_8021q_rxvlan(info->vid) && dsa_8021q_rx_switch_id(info->vid) == ds->index && - dsa_8021q_rx_source_port(info->vid) == port) + dsa_8021q_rx_source_port(info->vid) == dp->index) flags |= BRIDGE_VLAN_INFO_PVID; - err = dsa_switch_do_tag_8021q_vlan_add(ds, port, - info->vid, - flags); + err = dsa_port_do_tag_8021q_vlan_add(dp, info->vid, + flags); if (err) return err; } @@ -258,15 +260,15 @@ int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, struct dsa_notifier_tag_8021q_vlan_info *info) { - int port, err; + struct dsa_port *dp; + int err; if (!ds->ops->tag_8021q_vlan_del || !ds->tag_8021q_ctx) return 0; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_tag_8021q_vlan_match(ds, port, info)) { - err = dsa_switch_do_tag_8021q_vlan_del(ds, port, - info->vid); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_tag_8021q_vlan_match(dp, info)) { + err = dsa_port_do_tag_8021q_vlan_del(dp, info->vid); if (err) return err; } @@ -322,15 +324,14 @@ int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, * +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+ * swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3 */ -static bool dsa_tag_8021q_bridge_match(struct dsa_switch *ds, int port, - struct dsa_notifier_bridge_info *info) +static bool +dsa_port_tag_8021q_bridge_match(struct dsa_port *dp, + struct dsa_notifier_bridge_info *info) { - struct dsa_port *dp = dsa_to_port(ds, port); - /* Don't match on self */ - if (ds->dst->index == info->tree_index && - ds->index == info->sw_index && - port == info->port) + if (dp->ds->dst->index == info->tree_index && + dp->ds->index == info->sw_index && + dp->index == info->port) return false; if (dsa_port_is_user(dp)) @@ -344,21 +345,21 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, { struct dsa_switch *targeted_ds; struct dsa_port *targeted_dp; + struct dsa_port *dp; u16 targeted_rx_vid; - int err, port; + int err; if (!ds->tag_8021q_ctx) return 0; targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); targeted_dp = dsa_to_port(targeted_ds, info->port); - targeted_rx_vid = dsa_8021q_rx_vid(targeted_ds, info->port); + targeted_rx_vid = dsa_tag_8021q_rx_vid(targeted_dp); - for (port = 0; port < ds->num_ports; port++) { - struct dsa_port *dp = dsa_to_port(ds, port); - u16 rx_vid = dsa_8021q_rx_vid(ds, port); + dsa_switch_for_each_port(dp, ds) { + u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - if (!dsa_tag_8021q_bridge_match(ds, port, info)) + if (!dsa_port_tag_8021q_bridge_match(dp, info)) continue; /* Install the RX VID of the targeted port in our VLAN table */ @@ -380,21 +381,20 @@ int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, { struct dsa_switch *targeted_ds; struct dsa_port *targeted_dp; + struct dsa_port *dp; u16 targeted_rx_vid; - int port; if (!ds->tag_8021q_ctx) return 0; targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); targeted_dp = dsa_to_port(targeted_ds, info->port); - targeted_rx_vid = dsa_8021q_rx_vid(targeted_ds, info->port); + targeted_rx_vid = 
dsa_tag_8021q_rx_vid(targeted_dp); - for (port = 0; port < ds->num_ports; port++) { - struct dsa_port *dp = dsa_to_port(ds, port); - u16 rx_vid = dsa_8021q_rx_vid(ds, port); + dsa_switch_for_each_port(dp, ds) { + u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - if (!dsa_tag_8021q_bridge_match(ds, port, info)) + if (!dsa_port_tag_8021q_bridge_match(dp, info)) continue; /* Remove the RX VID of the targeted port from our VLAN table */ @@ -433,8 +433,8 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) { struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; struct dsa_port *dp = dsa_to_port(ds, port); - u16 rx_vid = dsa_8021q_rx_vid(ds, port); - u16 tx_vid = dsa_8021q_tx_vid(ds, port); + u16 rx_vid = dsa_tag_8021q_rx_vid(dp); + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); struct net_device *master; int err; @@ -478,8 +478,8 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port) { struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; struct dsa_port *dp = dsa_to_port(ds, port); - u16 rx_vid = dsa_8021q_rx_vid(ds, port); - u16 tx_vid = dsa_8021q_tx_vid(ds, port); + u16 rx_vid = dsa_tag_8021q_rx_vid(dp); + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); struct net_device *master; /* The CPU port is implicitly configured by diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 77d0ce89ab77..b3da4b2ea11c 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -45,6 +45,7 @@ * 6 6 2 2 4 2 N */ +#include <linux/dsa/mv88e6xxx.h> #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/slab.h> @@ -129,12 +130,9 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, u8 tag_dev, tag_port; enum dsa_cmd cmd; u8 *dsa_header; - u16 pvid = 0; - int err; if (skb->offload_fwd_mark) { struct dsa_switch_tree *dst = dp->ds->dst; - struct net_device *br = dp->bridge_dev; cmd = DSA_CMD_FORWARD; @@ -144,19 +142,6 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, */ tag_dev = dst->last_switch + 1 + dp->bridge_num; tag_port = 0; - - /* If we are offloading forwarding for a VLAN-unaware bridge, - * inject packets to hardware using the bridge's pvid, since - * that's where the packets ingressed from. - */ - if (!br_vlan_enabled(br)) { - /* Safe because __dev_queue_xmit() runs under - * rcu_read_lock_bh() - */ - err = br_vlan_get_pvid_rcu(br, &pvid); - if (err) - return NULL; - } } else { cmd = DSA_CMD_FROM_CPU; tag_dev = dp->ds->index; @@ -180,16 +165,21 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, dsa_header[2] &= ~0x10; } } else { + struct net_device *br = dp->bridge_dev; + u16 vid; + + vid = br ? MV88E6XXX_VID_BRIDGED : MV88E6XXX_VID_STANDALONE; + skb_push(skb, DSA_HLEN + extra); dsa_alloc_etype_header(skb, DSA_HLEN + extra); - /* Construct untagged DSA tag. */ + /* Construct DSA header from untagged frame. 
*/ dsa_header = dsa_etype_header_pos_tx(skb) + extra; dsa_header[0] = (cmd << 6) | tag_dev; dsa_header[1] = tag_port << 3; - dsa_header[2] = pvid >> 8; - dsa_header[3] = pvid & 0xff; + dsa_header[2] = vid >> 8; + dsa_header[3] = vid & 0xff; } return skb; @@ -210,7 +200,7 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, cmd = dsa_header[0] >> 6; switch (cmd) { case DSA_CMD_FORWARD: - trunk = !!(dsa_header[1] & 7); + trunk = !!(dsa_header[1] & 4); break; case DSA_CMD_TO_CPU: diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index fa1d60d13ad9..3509fc967ca9 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -6,7 +6,6 @@ #include <linux/etherdevice.h> #include <linux/list.h> -#include <linux/slab.h> #include <net/dsa.h> #include "dsa_priv.h" diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index d37ab98e7fe1..cd60b94fc175 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -1,19 +1,55 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright 2019 NXP Semiconductors +/* Copyright 2019 NXP */ #include <linux/dsa/ocelot.h> -#include <soc/mscc/ocelot.h> #include "dsa_priv.h" +/* If the port is under a VLAN-aware bridge, remove the VLAN header from the + * payload and move it into the DSA tag, which will make the switch classify + * the packet to the bridge VLAN. Otherwise, leave the classified VLAN at zero, + * which is the pvid of standalone and VLAN-unaware bridge ports. + */ +static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp, + u64 *vlan_tci, u64 *tag_type) +{ + struct net_device *br = READ_ONCE(dp->bridge_dev); + struct vlan_ethhdr *hdr; + u16 proto, tci; + + if (!br || !br_vlan_enabled(br)) { + *vlan_tci = 0; + *tag_type = IFH_TAG_TYPE_C; + return; + } + + hdr = (struct vlan_ethhdr *)skb_mac_header(skb); + br_vlan_get_proto(br, &proto); + + if (ntohs(hdr->h_vlan_proto) == proto) { + __skb_vlan_pop(skb, &tci); + *vlan_tci = tci; + } else { + rcu_read_lock(); + br_vlan_get_pvid_rcu(br, &tci); + rcu_read_unlock(); + *vlan_tci = tci; + } + + *tag_type = (proto != ETH_P_8021Q) ? IFH_TAG_TYPE_S : IFH_TAG_TYPE_C; +} + static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, __be32 ifh_prefix, void **ifh) { struct dsa_port *dp = dsa_slave_to_port(netdev); struct dsa_switch *ds = dp->ds; + u64 vlan_tci, tag_type; void *injection; __be32 *prefix; u32 rew_op = 0; + ocelot_xmit_get_vlan_info(skb, dp, &vlan_tci, &tag_type); + injection = skb_push(skb, OCELOT_TAG_LEN); prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN); @@ -22,6 +58,8 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, ocelot_ifh_set_bypass(injection, 1); ocelot_ifh_set_src(injection, ds->num_ports); ocelot_ifh_set_qos_class(injection, skb->priority); + ocelot_ifh_set_vlan_tci(injection, vlan_tci); + ocelot_ifh_set_tag_type(injection, tag_type); rew_op = ocelot_ptp_rew_op(skb); if (rew_op) diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 3038a257ba05..a1919ea5e828 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright 2020-2021 NXP Semiconductors +/* Copyright 2020-2021 NXP * * An implementation of the software-defined tag_8021q.c tagger format, which * also preserves full functionality under a vlan_filtering bridge. 
It does @@ -9,29 +9,43 @@ * that on egress */ #include <linux/dsa/8021q.h> -#include <soc/mscc/ocelot.h> -#include <soc/mscc/ocelot_ptp.h> +#include <linux/dsa/ocelot.h> #include "dsa_priv.h" +static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp, + struct sk_buff *skb) +{ + struct felix_deferred_xmit_work *xmit_work; + struct felix_port *felix_port = dp->priv; + + xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + if (!xmit_work) + return NULL; + + /* Calls felix_port_deferred_xmit in felix.c */ + kthread_init_work(&xmit_work->work, felix_port->xmit_work_fn); + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + xmit_work->dp = dp; + xmit_work->skb = skb_get(skb); + + kthread_queue_work(felix_port->xmit_worker, &xmit_work->work); + + return NULL; +} + static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - struct ocelot *ocelot = dp->ds->priv; - int port = dp->index; - u32 rew_op = 0; + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + struct ethhdr *hdr = eth_hdr(skb); - rew_op = ocelot_ptp_rew_op(skb); - if (rew_op) { - if (!ocelot_can_inject(ocelot, 0)) - return NULL; - - ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); - return NULL; - } + if (ocelot_ptp_rew_op(skb) || is_link_local_ether_addr(hdr->h_dest)) + return ocelot_defer_xmit(dp, skb); return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c new file mode 100644 index 000000000000..02686ad4045d --- /dev/null +++ b/net/dsa/tag_rtl8_4.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handler for Realtek 8 byte switch tags + * + * Copyright (C) 2021 Alvin Šipraga <alsi@bang-olufsen.dk> + * + * NOTE: Currently only supports protocol "4" found in the RTL8365MB, hence + * named tag_rtl8_4. + * + * This tag header has the following format: + * + * ------------------------------------------- + * | MAC DA | MAC SA | 8 byte tag | Type | ... + * ------------------------------------------- + * _______________/ \______________________________________ + * / \ + * 0 7|8 15 + * |-----------------------------------+-----------------------------------|--- + * | (16-bit) | ^ + * | Realtek EtherType [0x8899] | | + * |-----------------------------------+-----------------------------------| 8 + * | (8-bit) | (8-bit) | + * | Protocol [0x04] | REASON | b + * |-----------------------------------+-----------------------------------| y + * | (1) | (1) | (2) | (1) | (3) | (1) | (1) | (1) | (5) | t + * | FID_EN | X | FID | PRI_EN | PRI | KEEP | X | LEARN_DIS | X | e + * |-----------------------------------+-----------------------------------| s + * | (1) | (15-bit) | | + * | ALLOW | TX/RX | v + * |-----------------------------------+-----------------------------------|--- + * + * With the following field descriptions: + * + * field | description + * ------------+------------- + * Realtek | 0x8899: indicates that this is a proprietary Realtek tag; + * EtherType | note that Realtek uses the same EtherType for + * | other incompatible tag formats (e.g.
tag_rtl4_a.c) + * Protocol | 0x04: indicates that this tag conforms to this format + * X | reserved + * ------------+------------- + * REASON | reason for forwarding packet to CPU + * | 0: packet was forwarded or flooded to CPU + * | 80: packet was trapped to CPU + * FID_EN | 1: packet has an FID + * | 0: no FID + * FID | FID of packet (if FID_EN=1) + * PRI_EN | 1: force priority of packet + * | 0: don't force priority + * PRI | priority of packet (if PRI_EN=1) + * KEEP | preserve packet VLAN tag format + * LEARN_DIS | don't learn the source MAC address of the packet + * ALLOW | 1: treat TX/RX field as an allowance port mask, meaning the + * | packet may only be forwarded to ports specified in the + * | mask + * | 0: no allowance port mask, TX/RX field is the forwarding + * | port mask + * TX/RX | TX (switch->CPU): port number the packet was received on + * | RX (CPU->switch): forwarding port mask (if ALLOW=0) + * | allowance port mask (if ALLOW=1) + */ + +#include <linux/bitfield.h> +#include <linux/bits.h> +#include <linux/etherdevice.h> + +#include "dsa_priv.h" + +/* Protocols supported: + * + * 0x04 = RTL8365MB DSA protocol + */ + +#define RTL8_4_TAG_LEN 8 + +#define RTL8_4_PROTOCOL GENMASK(15, 8) +#define RTL8_4_PROTOCOL_RTL8365MB 0x04 +#define RTL8_4_REASON GENMASK(7, 0) +#define RTL8_4_REASON_FORWARD 0 +#define RTL8_4_REASON_TRAP 80 + +#define RTL8_4_LEARN_DIS BIT(5) + +#define RTL8_4_TX GENMASK(3, 0) +#define RTL8_4_RX GENMASK(10, 0) + +static struct sk_buff *rtl8_4_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + __be16 *tag; + + skb_push(skb, RTL8_4_TAG_LEN); + + dsa_alloc_etype_header(skb, RTL8_4_TAG_LEN); + tag = dsa_etype_header_pos_tx(skb); + + /* Set Realtek EtherType */ + tag[0] = htons(ETH_P_REALTEK); + + /* Set Protocol; zero REASON */ + tag[1] = htons(FIELD_PREP(RTL8_4_PROTOCOL, RTL8_4_PROTOCOL_RTL8365MB)); + + /* Zero FID_EN, FID, PRI_EN, PRI, KEEP; set LEARN_DIS */ + tag[2] = htons(FIELD_PREP(RTL8_4_LEARN_DIS, 1)); + + /* Zero ALLOW; set RX (CPU->switch) forwarding port mask */ + tag[3] = htons(FIELD_PREP(RTL8_4_RX, BIT(dp->index))); + + return skb; +} + +static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + __be16 *tag; + u16 etype; + u8 reason; + u8 proto; + u8 port; + + if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN))) + return NULL; + + tag = dsa_etype_header_pos_rx(skb); + + /* Parse Realtek EtherType */ + etype = ntohs(tag[0]); + if (unlikely(etype != ETH_P_REALTEK)) { + dev_warn_ratelimited(&dev->dev, + "non-realtek ethertype 0x%04x\n", etype); + return NULL; + } + + /* Parse Protocol */ + proto = FIELD_GET(RTL8_4_PROTOCOL, ntohs(tag[1])); + if (unlikely(proto != RTL8_4_PROTOCOL_RTL8365MB)) { + dev_warn_ratelimited(&dev->dev, + "unknown realtek protocol 0x%02x\n", + proto); + return NULL; + } + + /* Parse REASON */ + reason = FIELD_GET(RTL8_4_REASON, ntohs(tag[1])); + + /* Parse TX (switch->CPU) */ + port = FIELD_GET(RTL8_4_TX, ntohs(tag[3])); + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) { + dev_warn_ratelimited(&dev->dev, + "could not find slave for port %d\n", + port); + return NULL; + } + + /* Remove tag and recalculate checksum */ + skb_pull_rcsum(skb, RTL8_4_TAG_LEN); + + dsa_strip_etype_header(skb, RTL8_4_TAG_LEN); + + if (reason != RTL8_4_REASON_TRAP) + dsa_default_offload_fwd_mark(skb); + + return skb; +} + +static const struct dsa_device_ops rtl8_4_netdev_ops = { + .name = "rtl8_4", + .proto = DSA_TAG_PROTO_RTL8_4, + .xmit = 
rtl8_4_tag_xmit, + .rcv = rtl8_4_tag_rcv, + .needed_headroom = RTL8_4_TAG_LEN, +}; +module_dsa_tag_driver(rtl8_4_netdev_ops); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index c054f48541c8..262c8833a910 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -4,6 +4,7 @@ #include <linux/if_vlan.h> #include <linux/dsa/sja1105.h> #include <linux/dsa/8021q.h> +#include <linux/skbuff.h> #include <linux/packing.h> #include "dsa_priv.h" @@ -53,6 +54,11 @@ #define SJA1110_TX_TRAILER_LEN 4 #define SJA1110_MAX_PADDING_LEN 15 +enum sja1110_meta_tstamp { + SJA1110_META_TSTAMP_TX = 0, + SJA1110_META_TSTAMP_RX = 1, +}; + /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ static inline bool sja1105_is_link_local(const struct sk_buff *skb) { @@ -152,10 +158,7 @@ static u16 sja1105_xmit_tpid(struct dsa_port *dp) * we're sure about that). It may not be on this port though, so we * need to find it. */ - list_for_each_entry(other_dp, &ds->dst->ports, list) { - if (other_dp->ds != ds) - continue; - + dsa_switch_for_each_port(other_dp, ds) { if (!other_dp->bridge_dev) continue; @@ -232,9 +235,9 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); if (skb->offload_fwd_mark) return sja1105_imprecise_xmit(skb, netdev); @@ -260,9 +263,9 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb, { struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone; struct dsa_port *dp = dsa_slave_to_port(netdev); - u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); __be32 *tx_trailer; __be16 *tx_header; int trailer_pos; @@ -520,6 +523,43 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, is_meta); } +static void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, + u8 ts_id, enum sja1110_meta_tstamp dir, + u64 tstamp) +{ + struct sk_buff *skb, *skb_tmp, *skb_match = NULL; + struct dsa_port *dp = dsa_to_port(ds, port); + struct skb_shared_hwtstamps shwt = {0}; + struct sja1105_port *sp = dp->priv; + + if (!dsa_port_is_sja1105(dp)) + return; + + /* We don't care about RX timestamps on the CPU port */ + if (dir == SJA1110_META_TSTAMP_RX) + return; + + spin_lock(&sp->data->skb_txtstamp_queue.lock); + + skb_queue_walk_safe(&sp->data->skb_txtstamp_queue, skb, skb_tmp) { + if (SJA1105_SKB_CB(skb)->ts_id != ts_id) + continue; + + __skb_unlink(skb, &sp->data->skb_txtstamp_queue); + skb_match = skb; + + break; + } + + spin_unlock(&sp->data->skb_txtstamp_queue.lock); + + if (WARN_ON(!skb_match)) + return; + + shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp)); + skb_complete_tx_timestamp(skb_match, &shwt); +} + static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header) { u8 *buf = dsa_etype_header_pos_rx(skb) + SJA1110_HEADER_LEN; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 73fce9467467..c7d9e08107cb 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -51,6 +51,7 @@ #include <linux/if_ether.h> #include <linux/of_net.h> #include <linux/pci.h> +#include <linux/property.h> #include <net/dst.h> #include <net/arp.h> #include <net/sock.h> @@ -304,7 +305,7 
@@ void eth_commit_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; - memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + eth_hw_addr_set(dev, addr->sa_data); } EXPORT_SYMBOL(eth_commit_mac_addr_change); @@ -523,6 +524,26 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) EXPORT_SYMBOL(eth_platform_get_mac_address); /** + * platform_get_ethdev_address - Set netdev's MAC address from a given device + * @dev: Pointer to the device + * @netdev: Pointer to netdev to write the address to + * + * Wrapper around eth_platform_get_mac_address() which writes the address + * directly to netdev->dev_addr. + */ +int platform_get_ethdev_address(struct device *dev, struct net_device *netdev) +{ + u8 addr[ETH_ALEN] __aligned(2); + int ret; + + ret = eth_platform_get_mac_address(dev, addr); + if (!ret) + eth_hw_addr_set(netdev, addr); + return ret; +} +EXPORT_SYMBOL(platform_get_ethdev_address); + +/** * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named * 'mac-address' associated with given device. * @@ -557,4 +578,81 @@ int nvmem_get_mac_address(struct device *dev, void *addrbuf) return 0; } -EXPORT_SYMBOL(nvmem_get_mac_address); + +static int fwnode_get_mac_addr(struct fwnode_handle *fwnode, + const char *name, char *addr) +{ + int ret; + + ret = fwnode_property_read_u8_array(fwnode, name, addr, ETH_ALEN); + if (ret) + return ret; + + if (!is_valid_ether_addr(addr)) + return -EINVAL; + return 0; +} + +/** + * fwnode_get_mac_address - Get the MAC from the firmware node + * @fwnode: Pointer to the firmware node + * @addr: Address of buffer to store the MAC in + * + * Search the firmware node for the best MAC address to use. 'mac-address' is + * checked first, because that is supposed to contain the "most recent" MAC + * address. If that isn't set, then 'local-mac-address' is checked next, + * because that is the default address. If that isn't set, then the obsolete + * 'address' is checked, just in case we're using an old device tree. + * + * Note that the 'address' property is supposed to contain a virtual address of + * the register set, but some DTS files have redefined that property to be the + * MAC address. + * + * All-zero MAC addresses are rejected, because those could be properties that + * exist in the firmware tables, but were not updated by the firmware. For + * example, the DTS could define 'mac-address' and 'local-mac-address', with + * zero MAC addresses. Some older U-Boots only initialized 'local-mac-address'. + * In this case, the real MAC is in 'local-mac-address', and 'mac-address' + * exists but is all zeros. + */ +int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr) +{ + if (!fwnode_get_mac_addr(fwnode, "mac-address", addr) || + !fwnode_get_mac_addr(fwnode, "local-mac-address", addr) || + !fwnode_get_mac_addr(fwnode, "address", addr)) + return 0; + + return -ENOENT; +} +EXPORT_SYMBOL(fwnode_get_mac_address); + +/** + * device_get_mac_address - Get the MAC for a given device + * @dev: Pointer to the device + * @addr: Address of buffer to store the MAC in + */ +int device_get_mac_address(struct device *dev, char *addr) +{ + return fwnode_get_mac_address(dev_fwnode(dev), addr); +} +EXPORT_SYMBOL(device_get_mac_address); + +/** + * device_get_ethdev_address - Set netdev's MAC address from a given device + * @dev: Pointer to the device + * @netdev: Pointer to netdev to write the address to + * + * Wrapper around device_get_mac_address() which writes the address + * directly to netdev->dev_addr.
+ */ +int device_get_ethdev_address(struct device *dev, struct net_device *netdev) +{ + u8 addr[ETH_ALEN]; + int ret; + + ret = device_get_mac_address(dev, addr); + if (!ret) + eth_hw_addr_set(netdev, addr); + return ret; +} +EXPORT_SYMBOL(device_get_ethdev_address); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 0a19470efbfb..b76432e70e6b 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o stats.o phc_vclocks.o + tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 999e2a6bed13..bf6e8c2f9bf7 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -89,7 +89,8 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); - if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) + if (copy_to_user(useraddr, features, + array_size(copy_size, sizeof(*features)))) return -EFAULT; return 0; @@ -799,7 +800,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, goto out; useraddr += offsetof(struct ethtool_sset_info, data); - if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) + if (copy_to_user(useraddr, info_buf, array_size(idx, sizeof(u32)))) goto out; ret = 0; @@ -1022,7 +1023,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, { int i; - if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) + if (copy_from_user(indir, useraddr, array_size(size, sizeof(indir[0])))) return -EFAULT; /* Validate ring indices */ @@ -1895,7 +1896,7 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) if (copy_to_user(useraddr, &test, sizeof(test))) goto out; useraddr += sizeof(test); - if (copy_to_user(useraddr, data, test.len * sizeof(u64))) + if (copy_to_user(useraddr, data, array_size(test.len, sizeof(u64)))) goto out; ret = 0; @@ -1937,7 +1938,8 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) goto out; useraddr += sizeof(gstrings); if (gstrings.len && - copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + copy_to_user(useraddr, data, + array_size(gstrings.len, ETH_GSTRING_LEN))) goto out; ret = 0; diff --git a/net/ethtool/module.c b/net/ethtool/module.c new file mode 100644 index 000000000000..bc2cef11bbda --- /dev/null +++ b/net/ethtool/module.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool.h> + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct module_req_info { + struct ethnl_req_info base; +}; + +struct module_reply_data { + struct ethnl_reply_data base; + struct ethtool_module_power_mode_params power; +}; + +#define MODULE_REPDATA(__reply_base) \ + container_of(__reply_base, struct module_reply_data, base) + +/* MODULE_GET */ + +const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1] = { + [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int module_get_power_mode(struct net_device *dev, + struct module_reply_data *data, + struct netlink_ext_ack *extack) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops->get_module_power_mode) + return 0; + + return 
ops->get_module_power_mode(dev, &data->power, extack); +} + +static int module_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct module_reply_data *data = MODULE_REPDATA(reply_base); + struct netlink_ext_ack *extack = info ? info->extack : NULL; + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = module_get_power_mode(dev, data, extack); + if (ret < 0) + goto out_complete; + +out_complete: + ethnl_ops_complete(dev); + return ret; +} + +static int module_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + struct module_reply_data *data = MODULE_REPDATA(reply_base); + int len = 0; + + if (data->power.policy) + len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE_POLICY */ + + if (data->power.mode) + len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE */ + + return len; +} + +static int module_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct module_reply_data *data = MODULE_REPDATA(reply_base); + + if (data->power.policy && + nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE_POLICY, + data->power.policy)) + return -EMSGSIZE; + + if (data->power.mode && + nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE, data->power.mode)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_module_request_ops = { + .request_cmd = ETHTOOL_MSG_MODULE_GET, + .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, + .hdr_attr = ETHTOOL_A_MODULE_HEADER, + .req_info_size = sizeof(struct module_req_info), + .reply_data_size = sizeof(struct module_reply_data), + + .prepare_data = module_prepare_data, + .reply_size = module_reply_size, + .fill_reply = module_fill_reply, +}; + +/* MODULE_SET */ + +const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = { + [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_MODULE_POWER_MODE_POLICY] = + NLA_POLICY_RANGE(NLA_U8, ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH, + ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO), +}; + +static int module_set_power_mode(struct net_device *dev, struct nlattr **tb, + bool *p_mod, struct netlink_ext_ack *extack) +{ + struct ethtool_module_power_mode_params power = {}; + struct ethtool_module_power_mode_params power_new; + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + + if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) + return 0; + + if (!ops->get_module_power_mode || !ops->set_module_power_mode) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], + "Setting power mode policy is not supported by this device"); + return -EOPNOTSUPP; + } + + power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); + ret = ops->get_module_power_mode(dev, &power, extack); + if (ret < 0) + return ret; + + if (power_new.policy == power.policy) + return 0; + *p_mod = true; + + return ops->set_module_power_mode(dev, &power_new, extack); +} + +int ethnl_set_module(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + struct net_device *dev; + bool mod = false; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_MODULE_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto 
out_rtnl; + + ret = module_set_power_mode(dev, tb, &mod, info->extack); + if (ret < 0) + goto out_ops; + + if (!mod) + goto out_ops; + + ethtool_notify(dev, ETHTOOL_MSG_MODULE_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 1797a0a90019..38b44c0291b1 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -282,6 +282,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, + [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -593,6 +594,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_PAUSE_NTF] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_NTF] = ðnl_fec_request_ops, + [ETHTOOL_MSG_MODULE_NTF] = ðnl_module_request_ops, }; /* default notification handler */ @@ -686,6 +688,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -999,6 +1002,22 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_phc_vclocks_get_policy, .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_MODULE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_module_get_policy, + .maxattr = ARRAY_SIZE(ethnl_module_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_MODULE_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_module, + .policy = ethnl_module_set_policy, + .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index e8987e28036f..836ee7157848 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -337,6 +337,7 @@ extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; +extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -373,6 +374,8 @@ extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; +extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1]; +extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); 
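[Editor's note, not part of the patch] The MODULE_GET/MODULE_SET plumbing above only forwards to two new ethtool_ops callbacks, get_module_power_mode() and set_module_power_mode(). A minimal sketch of what a driver-side implementation could look like is below. The params fields (policy, mode) and the POLICY_HIGH/AUTO constants appear in the hunks above; the ETHTOOL_MODULE_POWER_MODE_LOW/HIGH values are assumed to come from the matching uapi addition, and everything named mydrv_* is hypothetical.

#include <linux/ethtool.h>
#include <linux/netdevice.h>

struct mydrv_priv {				/* hypothetical driver state */
	enum ethtool_module_power_mode_policy module_power_policy;
	bool module_in_low_power;
};

static int mydrv_get_module_power_mode(struct net_device *dev,
				       struct ethtool_module_power_mode_params *params,
				       struct netlink_ext_ack *extack)
{
	struct mydrv_priv *priv = netdev_priv(dev);

	/* Report the configured policy and the operational mode. */
	params->policy = priv->module_power_policy;
	params->mode = priv->module_in_low_power ?
		       ETHTOOL_MODULE_POWER_MODE_LOW :
		       ETHTOOL_MODULE_POWER_MODE_HIGH;
	return 0;
}

static int mydrv_set_module_power_mode(struct net_device *dev,
				       const struct ethtool_module_power_mode_params *params,
				       struct netlink_ext_ack *extack)
{
	struct mydrv_priv *priv = netdev_priv(dev);

	/* ethnl_set_module() above only fills ->policy on the set path. */
	priv->module_power_policy = params->policy;
	return 0;
}

static const struct ethtool_ops mydrv_ethtool_ops = {
	.get_module_power_mode	= mydrv_get_module_power_mode,
	.set_module_power_mode	= mydrv_set_module_power_mode,
};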
@@ -391,6 +394,7 @@ int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_module(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 26c32407f029..e00fbb16391f 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -493,7 +493,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], INIT_LIST_HEAD(&hsr->self_node_db); spin_lock_init(&hsr->list_lock); - ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); + eth_hw_addr_set(hsr_dev, slave[0]->dev_addr); /* initialize protocol specific functions */ if (protocol_version == PRP_V1) { diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index f7e284f23b1f..b099c3150150 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -75,7 +75,7 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (port->type == HSR_PT_SLAVE_A) { - ether_addr_copy(master->dev->dev_addr, dev->dev_addr); + eth_hw_addr_set(master->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); } diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 3297e7fa9945..2cf62718a282 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -157,7 +157,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, lowpan_802154_dev(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ - memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN); + __dev_addr_set(ldev, wdev->dev_addr, IEEE802154_ADDR_LEN); /* We need headroom for possible wpan_dev_hard_header call and tailroom * for encryption/fcs handling. The lowpan interface will replace * the IPv6 header with 6LoWPAN header. 
At worst case the 6LoWPAN diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1d816a5fd3eb..8eb428387bac 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -133,13 +133,9 @@ void inet_sock_destruct(struct sock *sk) struct inet_sock *inet = inet_sk(sk); __skb_queue_purge(&sk->sk_receive_queue); - if (sk->sk_rx_skb_cache) { - __kfree_skb(sk->sk_rx_skb_cache); - sk->sk_rx_skb_cache = NULL; - } __skb_queue_purge(&sk->sk_error_queue); - sk_mem_reclaim(sk); + sk_mem_reclaim_final(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { pr_err("Attempt to release TCP socket in state %d %p\n", @@ -1666,12 +1662,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) -{ - return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); -} -EXPORT_SYMBOL_GPL(snmp_get_cpu_field); - unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 4a8550c49202..48f337ccf949 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/module.h> -#include <linux/ip.h> #include <linux/in.h> #include <net/ip.h> #include <net/sock.h> diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 0c28bd469a68..0e23ade74493 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -6,7 +6,6 @@ #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> -#include <net/netns/ipv4.h> #include <net/ip_fib.h> int call_fib4_notifier(struct notifier_block *nb, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b42c429cebbe..3364cb9c67e0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1661,7 +1661,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, - int nh_weight, u8 rt_family) + int nh_weight, u8 rt_family, u32 nh_tclassid) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; @@ -1679,6 +1679,9 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, rtnh->rtnh_flags = flags; + if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) + goto nla_put_failure; + /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; @@ -1706,14 +1709,13 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) } for_nexthops(fi) { - if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, - AF_INET) < 0) - goto nla_put_failure; + u32 nh_tclassid = 0; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid && - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; + nh_tclassid = nh->nh_tclassid; #endif + if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, + AF_INET, nh_tclassid) < 0) + goto nla_put_failure; } endfor_nexthops(fi); mp_end: diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 8b30cadff708..b7e277d8a84d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1054,14 +1054,19 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio); if (!ext_hdr || !iio) goto send_mal_query; - if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr)) + if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) || + 
ntohs(iio->extobj_hdr.length) > sizeof(_iio)) goto send_mal_query; ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr); + iio = skb_header_pointer(skb, sizeof(_ext_hdr), + sizeof(iio->extobj_hdr) + ident_len, &_iio); + if (!iio) + goto send_mal_query; + status = 0; dev = NULL; switch (iio->extobj_hdr.class_type) { case ICMP_EXT_ECHO_CTYPE_NAME: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); if (ident_len >= IFNAMSIZ) goto send_mal_query; memset(buff, 0, sizeof(buff)); @@ -1069,30 +1074,24 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) dev = dev_get_by_name(net, buff); break; case ICMP_EXT_ECHO_CTYPE_INDEX: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + - sizeof(iio->ident.ifindex), &_iio); if (ident_len != sizeof(iio->ident.ifindex)) goto send_mal_query; dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); break; case ICMP_EXT_ECHO_CTYPE_ADDR: - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) || + ident_len != sizeof(iio->ident.addr.ctype3_hdr) + iio->ident.addr.ctype3_hdr.addrlen) goto send_mal_query; switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) { case ICMP_AFI_IP: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + - sizeof(struct in_addr), &_iio); - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + - sizeof(struct in_addr)) + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr)) goto send_mal_query; dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); break; #if IS_ENABLED(CONFIG_IPV6) case ICMP_AFI_IP6: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + - sizeof(struct in6_addr)) + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr)) goto send_mal_query; dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); dev_hold(dev); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f25d02ad4a8a..f7fea3a7c5e6 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1015,7 +1015,7 @@ void inet_csk_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); - percpu_counter_dec(sk->sk_prot->orphan_count); + this_cpu_dec(*sk->sk_prot->orphan_count); sock_put(sk); } @@ -1074,7 +1074,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, sock_orphan(child); - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(*sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 80aeaf9e6e16..75737267746f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -242,8 +242,10 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; + score = sk->sk_bound_dev_if ? 2 : 1; - score = sk->sk_family == PF_INET ? 
2 : 1; + if (sk->sk_family == PF_INET) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } @@ -596,7 +598,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(*sk->sk_prot->orphan_count); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0fe6c936dc54..2ac2b95c5694 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -986,7 +986,7 @@ static int ipgre_tunnel_init(struct net_device *dev) __gre_tunnel_init(dev); - memcpy(dev->dev_addr, &iph->saddr, 4); + __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index fe9101d3d69e..5a473319d3a5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -834,7 +834,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; if (dev->type != ARPHRD_ETHER) { - memcpy(dev->dev_addr, &p->iph.saddr, 4); + __dev_addr_set(dev, &p->iph.saddr, 4); memcpy(dev->broadcast, &p->iph.daddr, 4); } ip_tunnel_add(itn, t); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index efe25a0172e6..8c2bd1d9ddce 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -425,7 +425,7 @@ static int vti_tunnel_init(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - memcpy(dev->dev_addr, &iph->saddr, 4); + __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3aa78ccbec3e..123ea63a04cb 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -380,7 +380,7 @@ static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); tunnel->tun_hlen = 0; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c53f14b94356..ffc0cab7cf18 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -179,10 +179,11 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) return (void *)entry + entry->next_offset; } -unsigned int arpt_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct xt_table *table) +unsigned int arpt_do_table(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) { + const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 3de78416ec76..78cd5ee24448 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -26,14 +26,6 @@ static const struct xt_table packet_filter = { .priority = NF_IP_PRI_FILTER, }; -/* The work comes in here from netfilter.c */ -static unsigned int -arptable_filter_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return arpt_do_table(skb, state, priv); -} - static struct nf_hook_ops *arpfilter_ops __read_mostly; static int arptable_filter_table_init(struct net *net) 
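[Editor's note, not part of the patch] The arptable/iptable conversions above and below all follow the same pattern: once {arpt,ipt}_do_table() takes (void *priv, skb, state), its signature matches the nf_hookfn type, so the per-table trampoline hooks can be deleted and the do_table function registered directly via xt_hook_ops_alloc(). For reference, the typedef involved (as declared in include/linux/netfilter.h) and what the removed wrappers boiled down to:

/* nf_hookfn, the prototype every netfilter hook must have: */
typedef unsigned int nf_hookfn(void *priv,
			       struct sk_buff *skb,
			       const struct nf_hook_state *state);

/* The deleted wrappers were pure argument reordering, e.g. the old
 * iptable_filter_hook() simply did:
 *
 *	return ipt_do_table(skb, state, priv);
 *
 * which becomes unnecessary once ipt_do_table() itself is an nf_hookfn.
 */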
@@ -72,7 +64,7 @@ static int __init arptable_filter_init(void) if (ret < 0) return ret; - arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook); + arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table); if (IS_ERR(arpfilter_ops)) { xt_unregister_template(&packet_filter); return PTR_ERR(arpfilter_ops); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 13acb687c19a..2ed7c58b471a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -222,10 +222,11 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int -ipt_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct xt_table *table) +ipt_do_table(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) { + const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 0eb0e2ab9bfc..b9062f4552ac 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -28,13 +28,6 @@ static const struct xt_table packet_filter = { .priority = NF_IP_PRI_FILTER, }; -static unsigned int -iptable_filter_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ipt_do_table(skb, state, priv); -} - static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ @@ -90,7 +83,7 @@ static int __init iptable_filter_init(void) if (ret < 0) return ret; - filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook); + filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table); if (IS_ERR(filter_ops)) { xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 40417a3f930b..3abb430af9e6 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -34,7 +34,7 @@ static const struct xt_table packet_mangler = { }; static unsigned int -ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) +ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret; const struct iphdr *iph; @@ -50,7 +50,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *pri daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, state, priv); + ret = ipt_do_table(priv, skb, state); /* Reroute for ANY change. 
*/ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); @@ -75,8 +75,8 @@ iptable_mangle_hook(void *priv, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) - return ipt_mangle_out(skb, state, priv); - return ipt_do_table(skb, state, priv); + return ipt_mangle_out(priv, skb, state); + return ipt_do_table(priv, skb, state); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 45d7e072e6a5..56f6ecc43451 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -29,34 +29,27 @@ static const struct xt_table nf_nat_ipv4_table = { .af = NFPROTO_IPV4, }; -static unsigned int iptable_nat_do_chain(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ipt_do_table(skb, state, priv); -} - static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { - .hook = iptable_nat_do_chain, + .hook = ipt_do_table, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, { - .hook = iptable_nat_do_chain, + .hook = ipt_do_table, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, { - .hook = iptable_nat_do_chain, + .hook = ipt_do_table, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, { - .hook = iptable_nat_do_chain, + .hook = ipt_do_table, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index b88e0f36cd05..ca5e5b21587c 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -32,17 +32,9 @@ static const struct xt_table packet_raw_before_defrag = { .priority = NF_IP_PRI_RAW_BEFORE_DEFRAG, }; -/* The work comes in here from netfilter.c. 
*/ -static unsigned int -iptable_raw_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ipt_do_table(skb, state, priv); -} - static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init iptable_raw_table_init(struct net *net) +static int iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; const struct xt_table *table = &packet_raw; @@ -90,7 +82,7 @@ static int __init iptable_raw_init(void) if (ret < 0) return ret; - rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook); + rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table); if (IS_ERR(rawtable_ops)) { xt_unregister_template(table); return PTR_ERR(rawtable_ops); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index f519162a2fa5..d885443cb267 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -33,13 +33,6 @@ static const struct xt_table security_table = { .priority = NF_IP_PRI_SECURITY, }; -static unsigned int -iptable_security_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ipt_do_table(skb, state, priv); -} - static struct nf_hook_ops *sectbl_ops __read_mostly; static int iptable_security_table_init(struct net *net) @@ -78,7 +71,7 @@ static int __init iptable_security_init(void) if (ret < 0) return ret; - sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook); + sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table); if (IS_ERR(sectbl_ops)) { xt_unregister_template(&security_table); return PTR_ERR(sectbl_ops); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 613432a36f0a..e61ea428ea18 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -20,13 +20,8 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -static unsigned int defrag4_pernet_id __read_mostly; static DEFINE_MUTEX(defrag4_mutex); -struct defrag4_pernet { - unsigned int users; -}; - static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { @@ -111,19 +106,15 @@ static const struct nf_hook_ops ipv4_defrag_ops[] = { static void __net_exit defrag4_net_exit(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); - - if (nf_defrag->users) { + if (net->nf.defrag_ipv4_users) { nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); - nf_defrag->users = 0; + net->nf.defrag_ipv4_users = 0; } } static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, - .id = &defrag4_pernet_id, - .size = sizeof(struct defrag4_pernet), }; static int __init nf_defrag_init(void) @@ -138,24 +129,23 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv4_enable(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); int err = 0; mutex_lock(&defrag4_mutex); - if (nf_defrag->users == UINT_MAX) { + if (net->nf.defrag_ipv4_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } - if (nf_defrag->users) { - nf_defrag->users++; + if (net->nf.defrag_ipv4_users) { + net->nf.defrag_ipv4_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); if (err == 0) - nf_defrag->users = 1; + net->nf.defrag_ipv4_users = 1; out_unlock: mutex_unlock(&defrag4_mutex); @@ -165,12 +155,10 @@ EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); void nf_defrag_ipv4_disable(struct net *net) { - struct defrag4_pernet *nf_defrag = 
net_generic(net, defrag4_pernet_id); - mutex_lock(&defrag4_mutex); - if (nf_defrag->users) { - nf_defrag->users--; - if (nf_defrag->users == 0) + if (net->nf.defrag_ipv4_users) { + net->nf.defrag_ipv4_users--; + if (net->nf.defrag_ipv4_users == 0) nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 75ca4b6e484f..9e8100728d46 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1982,6 +1982,8 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, rcu_assign_pointer(old->nh_grp, newg); if (newg->resilient) { + /* Make sure concurrent readers are not using 'oldg' anymore. */ + synchronize_net(); rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } @@ -3565,6 +3567,7 @@ static struct notifier_block nh_netdev_notifier = { }; static int nexthops_dump(struct net *net, struct notifier_block *nb, + enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; @@ -3575,8 +3578,7 @@ static int nexthops_dump(struct net *net, struct notifier_block *nb, struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); - err = call_nexthop_notifier(nb, net, NEXTHOP_EVENT_REPLACE, nh, - extack); + err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } @@ -3590,7 +3592,7 @@ int register_nexthop_notifier(struct net *net, struct notifier_block *nb, int err; rtnl_lock(); - err = nexthops_dump(net, nb, extack); + err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, @@ -3603,8 +3605,17 @@ EXPORT_SYMBOL(register_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, - nb); + int err; + + rtnl_lock(); + err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); + if (err) + goto unlock; + nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); +unlock: + rtnl_unlock(); + return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b0d3a09dc84e..f30273afb539 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -53,7 +53,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) struct net *net = seq->private; int orphans, sockets; - orphans = percpu_counter_sum_positive(&tcp_orphan_count); + orphans = tcp_orphan_count_sum(); sockets = proto_sockets_allocated_sum_positive(&tcp_prot); socket_seq_show(seq); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4680268f2e59..97eb54774924 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -585,18 +585,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, - { - .procname = "tcp_rx_skb_cache", - .data = &tcp_rx_skb_cache_key.key, - .mode = 0644, - .proc_handler = proc_do_static_key, - }, - { - .procname = "tcp_tx_skb_cache", - .data = &tcp_tx_skb_cache_key.key, - .mode = 0644, - .proc_handler = proc_do_static_key, - }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e8b48df73c85..c2d9830136d2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -287,8 +287,8 @@ enum { TCP_CMSG_TS = 2 }; -struct percpu_counter tcp_orphan_count; -EXPORT_SYMBOL_GPL(tcp_orphan_count); +DEFINE_PER_CPU(unsigned int, tcp_orphan_count); 
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); @@ -325,11 +325,6 @@ struct tcp_splice_state { unsigned long tcp_memory_pressure __read_mostly; EXPORT_SYMBOL_GPL(tcp_memory_pressure); -DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); -EXPORT_SYMBOL(tcp_rx_skb_cache_key); - -DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); - void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; @@ -647,7 +642,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) } EXPORT_SYMBOL(tcp_ioctl); -static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) +void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; @@ -658,7 +653,7 @@ static inline bool forced_push(const struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static void skb_entail(struct sock *sk, struct sk_buff *skb) +void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -866,18 +861,6 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, { struct sk_buff *skb; - if (likely(!size)) { - skb = sk->sk_tx_skb_cache; - if (skb) { - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - sk->sk_tx_skb_cache = NULL; - pskb_trim(skb, 0); - INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); - skb_shinfo(skb)->tx_flags = 0; - memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb)); - return skb; - } - } /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); @@ -963,8 +946,8 @@ void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) } } -struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, - struct page *page, int offset, size_t *size) +static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, + struct page *page, int offset, size_t *size) { struct sk_buff *skb = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -985,7 +968,7 @@ new_segment: #ifdef CONFIG_TLS_DEVICE skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); #endif - skb_entail(sk, skb); + tcp_skb_entail(sk, skb); copy = size_goal; } @@ -1314,7 +1297,7 @@ new_segment: process_backlog++; skb->ip_summed = CHECKSUM_PARTIAL; - skb_entail(sk, skb); + tcp_skb_entail(sk, skb); copy = size_goal; /* All packets are restored as if they have @@ -2690,11 +2673,36 @@ void tcp_shutdown(struct sock *sk, int how) } EXPORT_SYMBOL(tcp_shutdown); +int tcp_orphan_count_sum(void) +{ + int i, total = 0; + + for_each_possible_cpu(i) + total += per_cpu(tcp_orphan_count, i); + + return max(total, 0); +} + +static int tcp_orphan_cache; +static struct timer_list tcp_orphan_timer; +#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) + +static void tcp_orphan_update(struct timer_list *unused) +{ + WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); + mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); +} + +static bool tcp_too_many_orphans(int shift) +{ + return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans; +} + bool tcp_check_oom(struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; - too_many_orphans = tcp_too_many_orphans(sk, shift); + too_many_orphans = tcp_too_many_orphans(shift); out_of_socket_memory = tcp_out_of_memory(sk); if (too_many_orphans) @@ -2803,7 +2811,7 @@ adjudge_to_death: /* remove backlog if any, without releasing ownership. 
*/ __release_sock(sk); - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(tcp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) @@ -2920,11 +2928,6 @@ void tcp_write_queue_purge(struct sock *sk) sk_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); - skb = sk->sk_tx_skb_cache; - if (skb) { - __kfree_skb(skb); - sk->sk_tx_skb_cache = NULL; - } INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); @@ -2961,10 +2964,6 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); - if (sk->sk_rx_skb_cache) { - __kfree_skb(sk->sk_rx_skb_cache); - sk->sk_rx_skb_cache = NULL; - } WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->urg_data = 0; tcp_write_queue_purge(sk); @@ -4505,7 +4504,10 @@ void __init tcp_init(void) sizeof_field(struct sk_buff, cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); - percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); + + timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); + mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); + inet_hashinfo_init(&tcp_hashinfo); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 141e85e6422b..246ab7b5e857 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -500,8 +500,11 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; + if (room <= 0) + return; + /* Check #1 */ - if (room > 0 && !tcp_under_memory_pressure(sk)) { + if (!tcp_under_memory_pressure(sk)) { unsigned int truesize = truesize_adjust(adjust, skb); int incr; @@ -518,6 +521,11 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } + } else { + /* Under pressure: + * Adjust rcv_ssthresh according to reserved mem + */ + tcp_adjust_rcv_ssthresh(sk); } } @@ -3221,7 +3229,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, long seq_rtt_us = -1L; long ca_rtt_us = -1L; u32 pkts_acked = 0; - u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3257,7 +3264,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (!first_ackt) first_ackt = last_ackt; - last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; if (before(start_seq, reord)) reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) @@ -3323,8 +3329,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); - if (pkts_acked == 1 && last_in_flight < tp->mss_cache && - last_in_flight && !prior_sacked && fully_acked && + if (pkts_acked == 1 && fully_acked && !prior_sacked && + (tp->snd_una - prior_snd_una) < tp->mss_cache && sack->rate->prior_delivered + 1 == tp->delivered && !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { /* Conservatively mark a delayed ACK. 
It's typically @@ -3381,9 +3387,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = sack->rate->rtt_us, - .in_flight = last_in_flight }; + .rtt_us = sack->rate->rtt_us }; + sample.in_flight = tp->mss_cache * + (tp->delivered - sack->rate->prior_delivered); icsk->icsk_ca_ops->pkts_acked(sk, &sample); } @@ -5346,7 +5353,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); else if (tcp_under_memory_pressure(sk)) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + tcp_adjust_rcv_ssthresh(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@ -5381,7 +5388,7 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -static bool tcp_should_expand_sndbuf(const struct sock *sk) +static bool tcp_should_expand_sndbuf(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -5392,8 +5399,18 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we are under global TCP memory pressure, do not expand. */ - if (tcp_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) { + int unused_mem = sk_unused_reserved_mem(sk); + + /* Adjust sndbuf according to reserved mem. But make sure + * it never goes below SOCK_MIN_SNDBUF. + * See sk_stream_moderate_sndbuf() for more details. + */ + if (unused_mem > SOCK_MIN_SNDBUF) + WRITE_ONCE(sk->sk_sndbuf, unused_mem); + return false; + } /* If we are under soft global TCP memory pressure, do not expand. */ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2e62e0d6373a..5e6d8cb82ce5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1037,6 +1037,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); EXPORT_SYMBOL(tcp_md5_needed); +static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) +{ + if (!old) + return true; + + /* l3index always overrides non-l3index */ + if (old->l3index && new->l3index == 0) + return false; + if (old->l3index == 0 && new->l3index) + return true; + + return old->prefixlen < new->prefixlen; +} + /* Find the Key structure for an address. 
*/ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, @@ -1059,7 +1073,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, lockdep_sock_is_held(sk)) { if (key->family != family) continue; - if (key->l3index && key->l3index != l3index) + if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index) continue; if (family == AF_INET) { mask = inet_make_mask(key->prefixlen); @@ -1074,8 +1088,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, match = false; } - if (match && (!best_match || - key->prefixlen > best_match->prefixlen)) + if (match && better_md5_match(best_match, key)) best_match = key; } return best_match; @@ -1085,7 +1098,7 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, - int l3index) + int l3index, u8 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1105,7 +1118,9 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, lockdep_sock_is_held(sk)) { if (key->family != family) continue; - if (key->l3index && key->l3index != l3index) + if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) + continue; + if (key->l3index != l3index) continue; if (!memcmp(&key->addr, addr, size) && key->prefixlen == prefixlen) @@ -1129,7 +1144,7 @@ EXPORT_SYMBOL(tcp_v4_md5_lookup); /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index, + int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ @@ -1137,7 +1152,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (key) { /* Pre-existing entry - just update that one. * Note that the key might be used concurrently. @@ -1182,6 +1197,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key->family = family; key->prefixlen = prefixlen; key->l3index = l3index; + key->flags = flags; memcpy(&key->addr, addr, (family == AF_INET6) ? 
sizeof(struct in6_addr) : sizeof(struct in_addr)); @@ -1191,11 +1207,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, EXPORT_SYMBOL(tcp_md5_do_add); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, - u8 prefixlen, int l3index) + u8 prefixlen, int l3index, u8 flags) { struct tcp_md5sig_key *key; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (!key) return -ENOENT; hlist_del_rcu(&key->node); @@ -1229,6 +1245,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, const union tcp_md5_addr *addr; u8 prefixlen = 32; int l3index = 0; + u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1239,6 +1256,8 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, if (sin->sin_family != AF_INET) return -EINVAL; + flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; @@ -1246,7 +1265,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; } - if (optname == TCP_MD5SIG_EXT && + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; @@ -1267,12 +1286,12 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; if (!cmd.tcpm_keylen) - return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); + return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, + return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } @@ -1596,7 +1615,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, + tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } @@ -1941,7 +1960,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); - struct sk_buff *skb_to_free; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; @@ -2082,17 +2100,12 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - skb_to_free = sk->sk_rx_skb_cache; - sk->sk_rx_skb_cache = NULL; ret = tcp_v4_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb)) goto discard_and_relse; - skb_to_free = NULL; } bh_unlock_sock(sk); - if (skb_to_free) - __kfree_skb(skb_to_free); put_and_return: if (refcounted) diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 95db7a11ba2a..ab552356bdba 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -25,7 +25,6 @@ * 1) Add mechanism to deal with reverse congestion. 
*/ -#include <linux/mm.h> #include <linux/module.h> #include <linux/math64.h> #include <net/tcp.h> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6d72f3ea48c4..3a01e5593a17 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1256,8 +1256,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (clone_it) { - TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - - tp->snd_una; oskb = skb; tcp_skb_tsorted_save(oskb) { @@ -2969,8 +2967,7 @@ u32 __tcp_select_window(struct sock *sk) icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, - 4U * tp->advmss); + tcp_adjust_rcv_ssthresh(sk); /* free_space might become our new window, make sure we don't * increase it due to wscale. diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 0de693565963..fbab921670cc 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -65,6 +65,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; } @@ -86,6 +87,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, if (!rs->prior_delivered || after(scb->tx.delivered, rs->prior_delivered)) { + rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; @@ -138,6 +140,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, } rs->delivered = tp->delivered - rs->prior_delivered; + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ + rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; + /* Model sending data and receiving ACKs as separate pipeline phases * for a window. Usually the ACK phase is longer, but with ACK * compression the send phase can be longer. 
To be safe we use the diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8851c9463b4b..8536b2a7210b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -390,7 +390,8 @@ static int compute_score(struct sock *sk, struct net *net, dif, sdif); if (!dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if) + score += 4; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; @@ -1053,7 +1054,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be16 dport; u8 tos; int err, is_udplite = IS_UDPLITE(sk); - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; @@ -1361,7 +1362,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, } up->len += size; - if (!(up->corkflag || (flags&MSG_MORE))) + if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE))) ret = udp_push_pending_frames(sk); if (!ret) ret = size; @@ -2662,9 +2663,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: if (val != 0) { - up->corkflag = 1; + WRITE_ONCE(up->corkflag, 1); } else { - up->corkflag = 0; + WRITE_ONCE(up->corkflag, 0); lock_sock(sk); push_pending_frames(sk); release_sock(sk); @@ -2787,7 +2788,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: - val = up->corkflag; + val = READ_ONCE(up->corkflag); break; case UDP_ENCAP: diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index e504204bca92..bf2e5e5fe142 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -332,10 +332,10 @@ config IPV6_IOAM6_LWTUNNEL bool "IPv6: IOAM Pre-allocated Trace insertion support" depends on IPV6 select LWTUNNEL + select DST_CACHE help - Support for the inline insertion of IOAM Pre-allocated - Trace Header (only on locally generated packets), using - the lightweight tunnels mechanism. + Support for the insertion of IOAM Pre-allocated Trace + Header using the lightweight tunnels mechanism. If unsure, say N. 
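[Editor's note, not part of the patch] The new "select DST_CACHE" exists because the encap mode added to net/ipv6/ioam6_iptunnel.c further down re-routes the encapsulated packet towards the tunnel destination and caches the resulting route, instead of doing a route lookup per packet. The cache lifecycle, condensed from those hunks (names as in the patch, error handling trimmed, shown only as an illustration):

/* build state */
err = dst_cache_init(&ilwt->cache, GFP_ATOMIC);

/* output path, only when the destination address changed (encap/auto) */
preempt_disable();
dst = dst_cache_get(&ilwt->cache);
preempt_enable();
if (!dst) {
	dst = ip6_route_output(net, NULL, &fl6);
	preempt_disable();
	dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
	preempt_enable();
}

/* destroy state */
dst_cache_destroy(&ilwt->cache);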
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 1bc7e143217b..3036a45e8a1e 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -5,16 +5,14 @@ obj-$(CONFIG_IPV6) += ipv6.o -ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ +ipv6-y := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ addrlabel.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o -ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o - -ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o +ipv6-$(CONFIG_SYSCTL) += sysctl_net_ipv6.o ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ @@ -29,8 +27,6 @@ ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o ipv6-$(CONFIG_IPV6_IOAM6_LWTUNNEL) += ioam6_iptunnel.o -ipv6-objs += $(ipv6-y) - obj-$(CONFIG_INET6_AH) += ah6.o obj-$(CONFIG_INET6_ESP) += esp6.o obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o @@ -48,7 +44,8 @@ obj-$(CONFIG_IPV6_GRE) += ip6_gre.o obj-$(CONFIG_IPV6_FOU) += fou6.o obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o -obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) +obj-$(CONFIG_INET) += output_core.o protocol.o \ + ip6_offload.o tcpv6_offload.o exthdrs_offload.o obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c6a90b7bbb70..d4fae16deec4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2237,12 +2237,12 @@ static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev) static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev) { - union fwnet_hwaddr *ha; + const union fwnet_hwaddr *ha; if (dev->addr_len != FWNET_ALEN) return -1; - ha = (union fwnet_hwaddr *)dev->dev_addr; + ha = (const union fwnet_hwaddr *)dev->dev_addr; memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id)); eui[0] ^= 2; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 3a871a09f962..38ece3b7b839 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -979,7 +979,7 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) if (!skb_valid_dst(skb)) ip6_route_input(skb); - ioam6_fill_trace_data(skb, ns, trace); + ioam6_fill_trace_data(skb, ns, trace, true); break; default: break; diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index a1ac0e3d8c60..47447f0241df 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -610,7 +610,11 @@ int ila_xlat_init_net(struct net *net) if (err) return err; - rhashtable_init(&ilan->xlat.rhash_table, &rht_params); + err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params); + if (err) { + free_bucket_spinlocks(ilan->xlat.locks); + return err; + } return 0; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 55c290d55605..67c9114835c8 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -106,7 +106,7 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; - score = 1; + score = sk->sk_bound_dev_if ? 
2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 5e8961004832..122a3d47424c 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -631,7 +631,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_namespace *ns, struct ioam6_trace_hdr *trace, struct ioam6_schema *sc, - u8 sclen) + u8 sclen, bool is_input) { struct __kernel_sock_timeval ts; u64 raw64; @@ -645,7 +645,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, /* hop_lim and node_id */ if (trace->type.bit0) { byte = ipv6_hdr(skb)->hop_limit; - if (skb->dev) + if (is_input) byte--; raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id; @@ -730,7 +730,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, /* hop_lim and node_id (wide) */ if (trace->type.bit8) { byte = ipv6_hdr(skb)->hop_limit; - if (skb->dev) + if (is_input) byte--; raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide; @@ -770,6 +770,66 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, data += sizeof(__be32); } + /* bit12 undefined: filled with empty value */ + if (trace->type.bit12) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit13 undefined: filled with empty value */ + if (trace->type.bit13) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit14 undefined: filled with empty value */ + if (trace->type.bit14) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit15 undefined: filled with empty value */ + if (trace->type.bit15) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit16 undefined: filled with empty value */ + if (trace->type.bit16) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit17 undefined: filled with empty value */ + if (trace->type.bit17) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit18 undefined: filled with empty value */ + if (trace->type.bit18) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit19 undefined: filled with empty value */ + if (trace->type.bit19) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit20 undefined: filled with empty value */ + if (trace->type.bit20) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit21 undefined: filled with empty value */ + if (trace->type.bit21) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + /* opaque state snapshot */ if (trace->type.bit22) { if (!sc) { @@ -786,21 +846,16 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, /* called with rcu_read_lock() */ void ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_namespace *ns, - struct ioam6_trace_hdr *trace) + struct ioam6_trace_hdr *trace, + bool is_input) { struct ioam6_schema *sc; u8 sclen = 0; - /* Skip if Overflow flag is set OR - * if an unknown type (bit 12-21) is set + /* Skip if Overflow flag is set */ - if (trace->overflow || - trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | - trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | - trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | - trace->type.bit21) { + if (trace->overflow) return; - } /* NodeLen does not include Opaque State Snapshot length. 
We need to * take it into account if the corresponding bit is set (bit 22) and @@ -822,7 +877,7 @@ void ioam6_fill_trace_data(struct sk_buff *skb, return; } - __ioam6_fill_trace_data(skb, ns, trace, sc, sclen); + __ioam6_fill_trace_data(skb, ns, trace, sc, sclen, is_input); trace->remlen -= trace->nodelen + sclen; } diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index f9ee04541c17..f90a87389fcc 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/net.h> -#include <linux/netlink.h> #include <linux/in6.h> #include <linux/ioam6.h> #include <linux/ioam6_iptunnel.h> @@ -17,18 +16,26 @@ #include <net/sock.h> #include <net/lwtunnel.h> #include <net/ioam6.h> +#include <net/netlink.h> +#include <net/ipv6.h> +#include <net/dst_cache.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> #define IOAM6_MASK_SHORT_FIELDS 0xff100000 #define IOAM6_MASK_WIDE_FIELDS 0xe00000 struct ioam6_lwt_encap { - struct ipv6_hopopt_hdr eh; - u8 pad[2]; /* 2-octet padding for 4n-alignment */ - struct ioam6_hdr ioamh; - struct ioam6_trace_hdr traceh; + struct ipv6_hopopt_hdr eh; + u8 pad[2]; /* 2-octet padding for 4n-alignment */ + struct ioam6_hdr ioamh; + struct ioam6_trace_hdr traceh; } __packed; struct ioam6_lwt { + struct dst_cache cache; + u8 mode; + struct in6_addr tundst; struct ioam6_lwt_encap tuninfo; }; @@ -42,40 +49,29 @@ static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt) return &ioam6_lwt_state(lwt)->tuninfo; } -static struct ioam6_trace_hdr *ioam6_trace(struct lwtunnel_state *lwt) +static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt) { return &(ioam6_lwt_state(lwt)->tuninfo.traceh); } static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { + [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8, + IOAM6_IPTUNNEL_MODE_MIN, + IOAM6_IPTUNNEL_MODE_MAX), + [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)), }; -static int nla_put_ioam6_trace(struct sk_buff *skb, int attrtype, - struct ioam6_trace_hdr *trace) -{ - struct ioam6_trace_hdr *data; - struct nlattr *nla; - int len; - - len = sizeof(*trace); - - nla = nla_reserve(skb, attrtype, len); - if (!nla) - return -EMSGSIZE; - - data = nla_data(nla); - memcpy(data, trace, len); - - return 0; -} - static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) { u32 fields; if (!trace->type_be32 || !trace->remlen || - trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4) + trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 || + trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | + trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | + trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | + trace->type.bit21) return false; trace->nodelen = 0; @@ -97,9 +93,10 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1]; struct ioam6_lwt_encap *tuninfo; struct ioam6_trace_hdr *trace; - struct lwtunnel_state *s; - int len_aligned; - int len, err; + struct lwtunnel_state *lwt; + struct ioam6_lwt *ilwt; + int len_aligned, err; + u8 mode; if (family != AF_INET6) return -EINVAL; @@ -109,6 +106,16 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, if (err < 0) return err; + if (!tb[IOAM6_IPTUNNEL_MODE]) + mode = IOAM6_IPTUNNEL_MODE_INLINE; + else + mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]); + + if 
(!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) { + NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination"); + return -EINVAL; + } + if (!tb[IOAM6_IPTUNNEL_TRACE]) { NL_SET_ERR_MSG(extack, "missing trace"); return -EINVAL; @@ -121,15 +128,24 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, return -EINVAL; } - len = sizeof(*tuninfo) + trace->remlen * 4; - len_aligned = ALIGN(len, 8); - - s = lwtunnel_state_alloc(len_aligned); - if (!s) + len_aligned = ALIGN(trace->remlen * 4, 8); + lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned); + if (!lwt) return -ENOMEM; - tuninfo = ioam6_lwt_info(s); - tuninfo->eh.hdrlen = (len_aligned >> 3) - 1; + ilwt = ioam6_lwt_state(lwt); + err = dst_cache_init(&ilwt->cache, GFP_ATOMIC); + if (err) { + kfree(lwt); + return err; + } + + ilwt->mode = mode; + if (tb[IOAM6_IPTUNNEL_DST]) + ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]); + + tuninfo = ioam6_lwt_info(lwt); + tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1; tuninfo->pad[0] = IPV6_TLV_PADN; tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC; tuninfo->ioamh.opt_type = IPV6_TLV_IOAM; @@ -138,27 +154,39 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, memcpy(&tuninfo->traceh, trace, sizeof(*trace)); - len = len_aligned - len; - if (len == 1) { - tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PAD1; - } else if (len > 0) { + if (len_aligned - trace->remlen * 4) { tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN; - tuninfo->traceh.data[trace->remlen * 4 + 1] = len - 2; + tuninfo->traceh.data[trace->remlen * 4 + 1] = 2; } - s->type = LWTUNNEL_ENCAP_IOAM6; - s->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + lwt->type = LWTUNNEL_ENCAP_IOAM6; + lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; - *ts = s; + *ts = lwt; return 0; } -static int ioam6_do_inline(struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo) +static int ioam6_do_fill(struct net *net, struct sk_buff *skb) { struct ioam6_trace_hdr *trace; - struct ipv6hdr *oldhdr, *hdr; struct ioam6_namespace *ns; + + trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb) + + sizeof(struct ipv6_hopopt_hdr) + 2 + + sizeof(struct ioam6_hdr)); + + ns = ioam6_namespace(net, trace->namespace_id); + if (ns) + ioam6_fill_trace_data(skb, ns, trace, false); + + return 0; +} + +static int ioam6_do_inline(struct net *net, struct sk_buff *skb, + struct ioam6_lwt_encap *tuninfo) +{ + struct ipv6hdr *oldhdr, *hdr; int hdrlen, err; hdrlen = (tuninfo->eh.hdrlen + 1) << 3; @@ -187,80 +215,200 @@ static int ioam6_do_inline(struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo) hdr->nexthdr = NEXTHDR_HOP; hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); - trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb) - + sizeof(struct ipv6_hopopt_hdr) + 2 - + sizeof(struct ioam6_hdr)); + return ioam6_do_fill(net, skb); +} - ns = ioam6_namespace(dev_net(skb_dst(skb)->dev), trace->namespace_id); - if (ns) - ioam6_fill_trace_data(skb, ns, trace); +static int ioam6_do_encap(struct net *net, struct sk_buff *skb, + struct ioam6_lwt_encap *tuninfo, + struct in6_addr *tundst) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *hdr, *inner_hdr; + int hdrlen, len, err; - return 0; + hdrlen = (tuninfo->eh.hdrlen + 1) << 3; + len = sizeof(*hdr) + hdrlen; + + err = skb_cow_head(skb, len + skb->mac_len); + if (unlikely(err)) + return err; + + inner_hdr = ipv6_hdr(skb); + + skb_push(skb, len); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + skb_set_transport_header(skb, 
sizeof(*hdr)); + + tuninfo->eh.nexthdr = NEXTHDR_IPV6; + memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen); + + hdr = ipv6_hdr(skb); + memcpy(hdr, inner_hdr, sizeof(*hdr)); + + hdr->nexthdr = NEXTHDR_HOP; + hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); + hdr->daddr = *tundst; + ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, + IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); + + skb_postpush_rcsum(skb, hdr, len); + + return ioam6_do_fill(net, skb); } static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct lwtunnel_state *lwt = skb_dst(skb)->lwtstate; + struct dst_entry *dst = skb_dst(skb); + struct in6_addr orig_daddr; + struct ioam6_lwt *ilwt; int err = -EINVAL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - /* Only for packets we send and - * that do not contain a Hop-by-Hop yet - */ - if (skb->dev || ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) - goto out; - - err = ioam6_do_inline(skb, ioam6_lwt_info(lwt)); - if (unlikely(err)) + ilwt = ioam6_lwt_state(dst->lwtstate); + orig_daddr = ipv6_hdr(skb)->daddr; + + switch (ilwt->mode) { + case IOAM6_IPTUNNEL_MODE_INLINE: +do_inline: + /* Direct insertion - if there is no Hop-by-Hop yet */ + if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) + goto out; + + err = ioam6_do_inline(net, skb, &ilwt->tuninfo); + if (unlikely(err)) + goto drop; + + break; + case IOAM6_IPTUNNEL_MODE_ENCAP: +do_encap: + /* Encapsulation (ip6ip6) */ + err = ioam6_do_encap(net, skb, &ilwt->tuninfo, &ilwt->tundst); + if (unlikely(err)) + goto drop; + + break; + case IOAM6_IPTUNNEL_MODE_AUTO: + /* Automatic (RFC8200 compliant): + * - local packets -> INLINE mode + * - in-transit packets -> ENCAP mode + */ + if (!skb->dev) + goto do_inline; + + goto do_encap; + default: goto drop; + } - err = skb_cow_head(skb, LL_RESERVED_SPACE(skb_dst(skb)->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) goto drop; + if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { + preempt_disable(); + dst = dst_cache_get(&ilwt->cache); + preempt_enable(); + + if (unlikely(!dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; + dst_release(dst); + goto drop; + } + + preempt_disable(); + dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); + preempt_enable(); + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + return dst_output(net, sk, skb); + } out: - return lwt->orig_output(net, sk, skb); - + return dst->lwtstate->orig_output(net, sk, skb); drop: kfree_skb(skb); return err; } +static void ioam6_destroy_state(struct lwtunnel_state *lwt) +{ + dst_cache_destroy(&ioam6_lwt_state(lwt)->cache); +} + static int ioam6_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { - struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate); + struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate); + int err; - if (nla_put_ioam6_trace(skb, IOAM6_IPTUNNEL_TRACE, trace)) - return -EMSGSIZE; + err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode); + if (err) + goto ret; - return 0; + if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { + err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst); + if (err) + goto ret; + } + + err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh), + &ilwt->tuninfo.traceh); +ret: + return err; } static 
int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate) { - struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate); + struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate); + int nlsize; + + nlsize = nla_total_size(sizeof(ilwt->mode)) + + nla_total_size(sizeof(ilwt->tuninfo.traceh)); - return nla_total_size(sizeof(*trace)); + if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) + nlsize += nla_total_size(sizeof(ilwt->tundst)); + + return nlsize; } static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { - struct ioam6_trace_hdr *a_hdr = ioam6_trace(a); - struct ioam6_trace_hdr *b_hdr = ioam6_trace(b); - - return (a_hdr->namespace_id != b_hdr->namespace_id); + struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a); + struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b); + struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a); + struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b); + + return (ilwt_a->mode != ilwt_b->mode || + (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && + !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) || + trace_a->namespace_id != trace_b->namespace_id); } static const struct lwtunnel_encap_ops ioam6_iptun_ops = { - .build_state = ioam6_build_state, + .build_state = ioam6_build_state, + .destroy_state = ioam6_destroy_state, .output = ioam6_output, - .fill_encap = ioam6_fill_encap_info, + .fill_encap = ioam6_fill_encap_info, .get_encap_size = ioam6_encap_nlsize, - .cmp_encap = ioam6_encap_cmp, - .owner = THIS_MODULE, + .cmp_encap = ioam6_encap_cmp, + .owner = THIS_MODULE, }; int __init ioam6_iptunnel_init(void) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3ad201d372d8..d831d2439693 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1088,7 +1088,7 @@ static void ip6gre_tnl_link_config_common(struct ip6_tnl *t) struct flowi6 *fl6 = &t->fl.u.ip6; if (dev->type != ARPHRD_ETHER) { - memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); } @@ -1521,7 +1521,7 @@ static int ip6gre_tunnel_init(struct net_device *dev) if (tunnel->parms.collect_md) return 0; - memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); + __dev_addr_set(dev, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); if (ipv6_addr_any(&tunnel->parms.raddr)) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 12f985f43bcc..2f044a49afa8 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -464,13 +464,14 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) int ip6_forward(struct sk_buff *skb) { - struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); + struct inet6_dev *idev; u32 mtu; + idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); if (net->ipv6.devconf_all->forwarding == 0) goto error; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 20a67efda47f..484aca492cc0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1449,7 +1449,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) unsigned int mtu; int t_hlen; - memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ diff --git 
a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 1d8e3ffa225d..527e9ead7449 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -660,7 +660,7 @@ static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) struct net_device *tdev = NULL; int mtu; - memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV | diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 4b098521a44c..184190b9ea25 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -142,7 +142,7 @@ struct neigh_table nd_tbl = { }; EXPORT_SYMBOL_GPL(nd_tbl); -void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, +void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); @@ -165,7 +165,7 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, - void *data, u8 icmp6_type) + const void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index de2cf3943b91..2d816277f2c5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -247,10 +247,10 @@ ip6t_next_entry(const struct ip6t_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int -ip6t_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct xt_table *table) +ip6t_do_table(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { + const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); /* Initializing verdict to NF_DROP keeps gcc happy. */ @@ -273,6 +273,7 @@ ip6t_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ + acpar.fragoff = 0; acpar.hotdrop = false; acpar.state = state; diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 733c83d38b30..4ad8b2032f1f 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -25,12 +25,7 @@ MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static inline bool segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { - bool r; - pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n", - invert ? '!' : ' ', min, id, max); - r = (id >= min && id <= max) ^ invert; - pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); - return r; + return (id >= min && id <= max) ^ invert; } static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) @@ -65,30 +60,6 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) return false; } - pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen); - pr_debug("TYPE %04X ", rh->type); - pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left); - - pr_debug("IPv6 RT segsleft %02X ", - segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], - rh->segments_left, - !!(rtinfo->invflags & IP6T_RT_INV_SGS))); - pr_debug("type %02X %02X %02X ", - rtinfo->rt_type, rh->type, - (!(rtinfo->flags & IP6T_RT_TYP) || - ((rtinfo->rt_type == rh->type) ^ - !!(rtinfo->invflags & IP6T_RT_INV_TYP)))); - pr_debug("len %02X %04X %02X ", - rtinfo->hdrlen, hdrlen, - !(rtinfo->flags & IP6T_RT_LEN) || - ((rtinfo->hdrlen == hdrlen) ^ - !!(rtinfo->invflags & IP6T_RT_INV_LEN))); - pr_debug("res %02X %02X %02X ", - rtinfo->flags & IP6T_RT_RES, - ((const struct rt0_hdr *)rh)->reserved, - !((rtinfo->flags & IP6T_RT_RES) && - (((const struct rt0_hdr *)rh)->reserved))); - ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], rh->segments_left, !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && @@ -107,22 +78,22 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) reserved), sizeof(_reserved), &_reserved); + if (!rp) { + par->hotdrop = true; + return false; + } ret = (*rp == 0); } - pr_debug("#%d ", rtinfo->addrnr); if (!(rtinfo->flags & IP6T_RT_FST)) { return ret; } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) { - pr_debug("Not strict "); if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { - pr_debug("There isn't enough space\n"); return false; } else { unsigned int i = 0; - pr_debug("#%d ", rtinfo->addrnr); for (temp = 0; temp < (unsigned int)((hdrlen - 8) / 16); temp++) { @@ -138,26 +109,20 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) return false; } - if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { - pr_debug("i=%d temp=%d;\n", i, temp); + if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) i++; - } if (i == rtinfo->addrnr) break; } - pr_debug("i=%d #%d\n", i, rtinfo->addrnr); if (i == rtinfo->addrnr) return ret; else return false; } } else { - pr_debug("Strict "); if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { - pr_debug("There isn't enough space\n"); return false; } else { - pr_debug("#%d ", rtinfo->addrnr); for (temp = 0; temp < rtinfo->addrnr; temp++) { ap = skb_header_pointer(skb, ptr @@ -173,7 +138,6 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) break; } - pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr); if (temp == rtinfo->addrnr && temp == (unsigned int)((hdrlen - 8) / 16)) return ret; diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 727ee8097012..df785ebda0ca 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -27,14 +27,6 @@ static const struct xt_table packet_filter = { .priority = NF_IP6_PRI_FILTER, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -ip6table_filter_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip6t_do_table(skb, state, priv); -} - static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. 
*/ @@ -90,7 +82,7 @@ static int __init ip6table_filter_init(void) if (ret < 0) return ret; - filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook); + filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table); if (IS_ERR(filter_ops)) { xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 9b518ce37d6a..a88b2ce4a3cb 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -29,7 +29,7 @@ static const struct xt_table packet_mangler = { }; static unsigned int -ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) +ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret; struct in6_addr saddr, daddr; @@ -46,7 +46,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *pr /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, state, priv); + ret = ip6t_do_table(priv, skb, state); if (ret != NF_DROP && ret != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || @@ -68,8 +68,8 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) - return ip6t_mangle_out(skb, state, priv); - return ip6t_do_table(skb, state, priv); + return ip6t_mangle_out(priv, skb, state); + return ip6t_do_table(priv, skb, state); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 921c1723a01e..bf3cb3a13600 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -31,34 +31,27 @@ static const struct xt_table nf_nat_ipv6_table = { .af = NFPROTO_IPV6, }; -static unsigned int ip6table_nat_do_chain(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip6t_do_table(skb, state, priv); -} - static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { - .hook = ip6table_nat_do_chain, + .hook = ip6t_do_table, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, { - .hook = ip6table_nat_do_chain, + .hook = ip6t_do_table, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, { - .hook = ip6table_nat_do_chain, + .hook = ip6t_do_table, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, { - .hook = ip6table_nat_do_chain, + .hook = ip6t_do_table, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 4f2a04af71d3..08861d5d1f4d 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -31,14 +31,6 @@ static const struct xt_table packet_raw_before_defrag = { .priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG, }; -/* The work comes in here from netfilter.c. 
*/ -static unsigned int -ip6table_raw_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip6t_do_table(skb, state, priv); -} - static struct nf_hook_ops *rawtable_ops __read_mostly; static int ip6table_raw_table_init(struct net *net) @@ -88,7 +80,7 @@ static int __init ip6table_raw_init(void) return ret; /* Register hooks */ - rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook); + rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table); if (IS_ERR(rawtable_ops)) { xt_unregister_template(table); return PTR_ERR(rawtable_ops); diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 931674034d8b..4df14a9bae78 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -32,13 +32,6 @@ static const struct xt_table security_table = { .priority = NF_IP6_PRI_SECURITY, }; -static unsigned int -ip6table_security_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip6t_do_table(skb, state, priv); -} - static struct nf_hook_ops *sectbl_ops __read_mostly; static int ip6table_security_table_init(struct net *net) @@ -77,7 +70,7 @@ static int __init ip6table_security_init(void) if (ret < 0) return ret; - sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook); + sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table); if (IS_ERR(sectbl_ops)) { xt_unregister_template(&security_table); return PTR_ERR(sectbl_ops); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index a0108415275f..5c47be29b9ee 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -33,7 +33,7 @@ static const char nf_frags_cache_name[] = "nf-frags"; -unsigned int nf_frag_pernet_id __read_mostly; +static unsigned int nf_frag_pernet_id __read_mostly; static struct inet_frags nf_frags; static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index e8a59d8bf2ad..cb4eb1d2c620 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -25,8 +25,6 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> -extern unsigned int nf_frag_pernet_id; - static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, @@ -91,12 +89,10 @@ static const struct nf_hook_ops ipv6_defrag_ops[] = { static void __net_exit defrag6_net_exit(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); - - if (nf_frag->users) { + if (net->nf.defrag_ipv6_users) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); - nf_frag->users = 0; + net->nf.defrag_ipv6_users = 0; } } @@ -134,24 +130,23 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv6_enable(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); int err = 0; mutex_lock(&defrag6_mutex); - if (nf_frag->users == UINT_MAX) { + if (net->nf.defrag_ipv6_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } - if (nf_frag->users) { - nf_frag->users++; + if (net->nf.defrag_ipv6_users) { + net->nf.defrag_ipv6_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) - nf_frag->users = 1; + net->nf.defrag_ipv6_users = 1; out_unlock: 
mutex_unlock(&defrag6_mutex); @@ -161,12 +156,10 @@ EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); void nf_defrag_ipv6_disable(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); - mutex_lock(&defrag6_mutex); - if (nf_frag->users) { - nf_frag->users--; - if (nf_frag->users == 0) + if (net->nf.defrag_ipv6_users) { + net->nf.defrag_ipv6_users--; + if (net->nf.defrag_ipv6_users == 0) nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dbc224023977..9b9ef09382ab 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5681,14 +5681,15 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, - rt->fib6_nh->fib_nh_weight, AF_INET6) < 0) + rt->fib6_nh->fib_nh_weight, AF_INET6, + 0) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, - AF_INET6) < 0) + AF_INET6, 0) < 0) goto nla_put_failure; } diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index e412817fba2f..5daa1c3ed83b 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -374,7 +374,11 @@ static int __net_init seg6_net_init(struct net *net) net->ipv6.seg6_data = sdata; #ifdef CONFIG_IPV6_SEG6_HMAC - seg6_hmac_net_init(net); + if (seg6_hmac_net_init(net)) { + kfree(rcu_dereference_raw(sdata->tun_src)); + kfree(sdata); + return -ENOMEM; + }; #endif return 0; @@ -388,7 +392,7 @@ static void __net_exit seg6_net_exit(struct net *net) seg6_hmac_net_exit(net); #endif - kfree(sdata->tun_src); + kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); } diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 687d95dce085..29bc4e7c3046 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -405,9 +405,7 @@ int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); - rhashtable_init(&sdata->hmac_infos, &rht_params); - - return 0; + return rhashtable_init(&sdata->hmac_infos, &rht_params); } EXPORT_SYMBOL(seg6_hmac_net_init); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index ef0c7a7c18e2..1b57ee36d668 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -204,7 +204,7 @@ static int ipip6_tunnel_create(struct net_device *dev) struct sit_net *sitn = net_generic(net, sit_net_id); int err; - memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + __dev_addr_set(dev, &t->parms.iph.saddr, 4); memcpy(dev->broadcast, &t->parms.iph.daddr, 4); if ((__force u16)t->parms.i_flags & SIT_ISATAP) @@ -1149,7 +1149,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, synchronize_net(); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; - memcpy(t->dev->dev_addr, &p->iph.saddr, 4); + __dev_addr_set(t->dev, &p->iph.saddr, 4); memcpy(t->dev->broadcast, &p->iph.daddr, 4); ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0ce52d46e4f8..1ef27cd7dbff 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -599,6 +599,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; int l3index = 0; u8 prefixlen; + u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; @@ -609,6 +610,8 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (sin6->sin6_family != AF_INET6) return 
-EINVAL; + flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; @@ -619,7 +622,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } - if (optname == TCP_MD5SIG_EXT && + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; @@ -640,9 +643,9 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, - l3index); + l3index, flags); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, l3index); + AF_INET6, prefixlen, l3index, flags); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) @@ -650,12 +653,12 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen, l3index, + AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, l3index, + AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } @@ -1404,7 +1407,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * across. Shucks. */ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, - AF_INET6, 128, l3index, key->key, key->keylen, + AF_INET6, 128, l3index, key->flags, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } #endif @@ -1618,7 +1621,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { - struct sk_buff *skb_to_free; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; @@ -1754,17 +1756,12 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - skb_to_free = sk->sk_rx_skb_cache; - sk->sk_rx_skb_cache = NULL; ret = tcp_v6_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb)) goto discard_and_relse; - skb_to_free = NULL; } bh_unlock_sock(sk); - if (skb_to_free) - __kfree_skb(skb_to_free); put_and_return: if (refcounted) sock_put(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ea53847b5b7e..8d785232b479 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -133,7 +133,8 @@ static int compute_score(struct sock *sk, struct net *net, dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); if (!dev_match) return -1; - score++; + if (sk->sk_bound_dev_if) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; @@ -1303,7 +1304,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int addr_len = msg->msg_namelen; bool connected = false; int ulen = len; - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 647c0554d04c..40ca3c1e42a2 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -781,7 +781,7 @@ int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb) if (nskb) { struct llc_sap *sap = llc->sap; - u8 *dmac = llc->daddr.mac; 
+ const u8 *dmac = llc->daddr.mac; if (llc->dev->flags & IFF_LOOPBACK) dmac = llc->dev->dev_addr; diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index ad6547736c21..dde9bf08a593 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -80,7 +80,7 @@ out_free: * establishment will inform to upper layer via calling it's confirm * function and passing proper information. */ -int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap) +int llc_establish_connection(struct sock *sk, const u8 *lmac, u8 *dmac, u8 dsap) { int rc = -EISCONN; struct llc_addr laddr, daddr; diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c index b9ad087bcbd7..5a6466fc626a 100644 --- a/net/llc/llc_output.c +++ b/net/llc/llc_output.c @@ -56,7 +56,7 @@ int llc_mac_hdr_init(struct sk_buff *skb, * package primitive as an event and send to SAP event handler */ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, - unsigned char *dmac, unsigned char dsap) + const unsigned char *dmac, unsigned char dsap) { int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap, diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index a4eccb98220a..0ff490a73fae 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -26,7 +26,7 @@ #include <net/llc_c_st.h> #include <net/llc_conn.h> -static void llc_ui_format_mac(struct seq_file *seq, u8 *addr) +static void llc_ui_format_mac(struct seq_file *seq, const u8 *addr) { seq_printf(seq, "%pM", addr); } diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index efbefcbac3ac..7cab1cf09bf1 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -60,7 +60,10 @@ static struct mesh_table *mesh_table_alloc(void) atomic_set(&newtbl->entries, 0); spin_lock_init(&newtbl->gates_lock); spin_lock_init(&newtbl->walk_lock); - rhashtable_init(&newtbl->rhead, &mesh_rht_params); + if (rhashtable_init(&newtbl->rhead, &mesh_rht_params)) { + kfree(newtbl); + return NULL; + } return newtbl; } diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 204830a55240..3fbd0b9ff913 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -2,6 +2,7 @@ /* * Copyright 2012-2013, Marco Porsch <marco.porsch@s2005.tu-chemnitz.de> * Copyright 2012-2013, cozybit Inc. 
+ * Copyright (C) 2021 Intel Corporation */ #include "mesh.h" @@ -588,7 +589,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, /* only transmit to PS STA with announced, non-zero awake window */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && - (!elems->awake_window || !le16_to_cpu(*elems->awake_window))) + (!elems->awake_window || !get_unaligned_le16(elems->awake_window))) return; if (!test_sta_flag(sta, WLAN_STA_MPSP_OWNER)) diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index e5935e3d7a07..8c6416129d5b 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -392,10 +392,6 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; - if (ieee80211_is_tx_data(txrc->skb) && - info->flags & IEEE80211_TX_CTL_NO_ACK) - return false; - if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8acc743559c3..fc5c608d02e2 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4121,7 +4121,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) if (!bssid) return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || - ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) + ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2) || + !is_valid_ether_addr(hdr->addr2)) return false; if (ieee80211_is_beacon(hdr->frame_control)) return true; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 36524101d11f..51b49f0d3ad4 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -514,6 +514,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; + sta->cparams.ce_threshold_selector = 0; + sta->cparams.ce_threshold_mask = 0; sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 5c426b093ee2..a756a197c770 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2210,7 +2210,11 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, } vht_mcs = iterator.this_arg[4] >> 4; + if (vht_mcs > 11) + vht_mcs = 0; vht_nss = iterator.this_arg[4] & 0xF; + if (!vht_nss || vht_nss > 8) + vht_nss = 1; break; /* @@ -3381,6 +3385,14 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) goto out; + /* If n == 2, the "while (*frag_tail)" loop above didn't execute + * and frag_tail should be &skb_shinfo(head)->frag_list. + * However, ieee80211_amsdu_prepare_head() can reallocate it. + * Reload frag_tail to have it pointing to the correct place. + */ + if (n == 2) + frag_tail = &skb_shinfo(head)->frag_list; + /* * Pad out the previous subframe to a multiple of 4 by adding the * padding to the next one, that's being added. 
Note that head->len diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index bca47fad5a16..4eed23e27610 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -520,6 +520,9 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; @@ -749,6 +752,9 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 323d3d2d986f..500ed1b81250 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -129,15 +129,14 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return -EINVAL; - memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + dev_addr_set(dev, addr->sa_data); sdata->wpan_dev.extended_addr = extended_addr; /* update lowpan interface mac address when * wpan mac has been changed */ if (sdata->wpan_dev.lowpan_dev) - memcpy(sdata->wpan_dev.lowpan_dev->dev_addr, dev->dev_addr, - dev->addr_len); + dev_addr_set(sdata->wpan_dev.lowpan_dev, dev->dev_addr); return mac802154_wpan_update_llsec(dev); } @@ -615,6 +614,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { + u8 addr[IEEE802154_EXTENDED_ADDR_LEN]; struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; int ret; @@ -638,11 +638,12 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, switch (type) { case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; - if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) - ieee802154_le64_to_be64(ndev->dev_addr, &extended_addr); - else - memcpy(ndev->dev_addr, ndev->perm_addr, - IEEE802154_EXTENDED_ADDR_LEN); + if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) { + ieee802154_le64_to_be64(addr, &extended_addr); + dev_addr_set(ndev, addr); + } else { + dev_addr_set(ndev, ndev->perm_addr); + } break; case NL802154_IFTYPE_MONITOR: ndev->type = ARPHRD_IEEE802154_MONITOR; diff --git a/net/mctp/Kconfig b/net/mctp/Kconfig index 2cdf3d0a28c9..868c92272cbd 100644 --- a/net/mctp/Kconfig +++ b/net/mctp/Kconfig @@ -11,3 +11,8 @@ menuconfig MCTP This option enables core MCTP support. For communicating with other devices, you'll want to enable a driver for a specific hardware channel. 
+ +config MCTP_TEST + bool "MCTP core tests" if !KUNIT_ALL_TESTS + depends on MCTP=y && KUNIT=y + default KUNIT_ALL_TESTS diff --git a/net/mctp/Makefile b/net/mctp/Makefile index 0171333384d7..6cd55233e685 100644 --- a/net/mctp/Makefile +++ b/net/mctp/Makefile @@ -1,3 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MCTP) += mctp.o mctp-objs := af_mctp.o device.o route.o neigh.o + +# tests +obj-$(CONFIG_MCTP_TEST) += test/utils.o diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index a9526ac29dff..66a411d60b6c 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -16,6 +16,9 @@ #include <net/mctpdevice.h> #include <net/sock.h> +#define CREATE_TRACE_POINTS +#include <trace/events/mctp.h> + /* socket implementation */ static int mctp_release(struct socket *sock) @@ -223,16 +226,61 @@ static const struct proto_ops mctp_dgram_ops = { .sendpage = sock_no_sendpage, }; +static void mctp_sk_expire_keys(struct timer_list *timer) +{ + struct mctp_sock *msk = container_of(timer, struct mctp_sock, + key_expiry); + struct net *net = sock_net(&msk->sk); + unsigned long next_expiry, flags; + struct mctp_sk_key *key; + struct hlist_node *tmp; + bool next_expiry_valid = false; + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + + hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { + spin_lock(&key->lock); + + if (!time_after_eq(key->expiry, jiffies)) { + trace_mctp_key_release(key, MCTP_TRACE_KEY_TIMEOUT); + key->valid = false; + hlist_del_rcu(&key->hlist); + hlist_del_rcu(&key->sklist); + spin_unlock(&key->lock); + mctp_key_unref(key); + continue; + } + + if (next_expiry_valid) { + if (time_before(key->expiry, next_expiry)) + next_expiry = key->expiry; + } else { + next_expiry = key->expiry; + next_expiry_valid = true; + } + spin_unlock(&key->lock); + } + + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + if (next_expiry_valid) + mod_timer(timer, next_expiry); +} + static int mctp_sk_init(struct sock *sk) { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); INIT_HLIST_HEAD(&msk->keys); + timer_setup(&msk->key_expiry, mctp_sk_expire_keys, 0); return 0; } static void mctp_sk_close(struct sock *sk, long timeout) { + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + + del_timer_sync(&msk->key_expiry); sk_common_release(sk); } @@ -263,21 +311,23 @@ static void mctp_sk_unhash(struct sock *sk) /* remove tag allocations */ spin_lock_irqsave(&net->mctp.keys_lock, flags); hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { - hlist_del_rcu(&key->sklist); - hlist_del_rcu(&key->hlist); + hlist_del(&key->sklist); + hlist_del(&key->hlist); - spin_lock(&key->reasm_lock); + trace_mctp_key_release(key, MCTP_TRACE_KEY_CLOSED); + + spin_lock(&key->lock); if (key->reasm_head) kfree_skb(key->reasm_head); key->reasm_head = NULL; key->reasm_dead = true; - spin_unlock(&key->reasm_lock); + key->valid = false; + spin_unlock(&key->lock); - kfree_rcu(key, rcu); + /* key is no longer on the lookup lists, unref */ + mctp_key_unref(key); } spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - - synchronize_rcu(); } static struct proto mctp_proto = { @@ -385,7 +435,7 @@ static __exit void mctp_exit(void) sock_unregister(PF_MCTP); } -module_init(mctp_init); +subsys_initcall(mctp_init); module_exit(mctp_exit); MODULE_DESCRIPTION("MCTP core"); diff --git a/net/mctp/device.c b/net/mctp/device.c index b9f38e765f61..3827d62f52c9 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -35,14 +35,6 @@ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev) return 
rtnl_dereference(dev->mctp_ptr); } -static void mctp_dev_destroy(struct mctp_dev *mdev) -{ - struct net_device *dev = mdev->dev; - - dev_put(dev); - kfree_rcu(mdev, rcu); -} - static int mctp_fill_addrinfo(struct sk_buff *skb, struct netlink_callback *cb, struct mctp_dev *mdev, mctp_eid_t eid) { @@ -255,6 +247,19 @@ static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +void mctp_dev_hold(struct mctp_dev *mdev) +{ + refcount_inc(&mdev->refs); +} + +void mctp_dev_put(struct mctp_dev *mdev) +{ + if (refcount_dec_and_test(&mdev->refs)) { + dev_put(mdev->dev); + kfree_rcu(mdev, rcu); + } +} + static struct mctp_dev *mctp_add_dev(struct net_device *dev) { struct mctp_dev *mdev; @@ -270,7 +275,9 @@ static struct mctp_dev *mctp_add_dev(struct net_device *dev) mdev->net = mctp_default_net(dev_net(dev)); /* associate to net_device */ + refcount_set(&mdev->refs, 1); rcu_assign_pointer(dev->mctp_ptr, mdev); + dev_hold(dev); mdev->dev = dev; @@ -330,12 +337,26 @@ static int mctp_set_link_af(struct net_device *dev, const struct nlattr *attr, return 0; } +/* Matches netdev types that should have MCTP handling */ +static bool mctp_known(struct net_device *dev) +{ + /* only register specific types (inc. NONE for TUN devices) */ + return dev->type == ARPHRD_MCTP || + dev->type == ARPHRD_LOOPBACK || + dev->type == ARPHRD_NONE; +} + static void mctp_unregister(struct net_device *dev) { struct mctp_dev *mdev; mdev = mctp_dev_get_rtnl(dev); - + if (mctp_known(dev) != (bool)mdev) { + // Sanity check, should match what was set in mctp_register + netdev_warn(dev, "%s: mdev pointer %d but type (%d) match is %d", + __func__, (bool)mdev, mctp_known(dev), dev->type); + return; + } if (!mdev) return; @@ -345,7 +366,7 @@ static void mctp_unregister(struct net_device *dev) mctp_neigh_remove_dev(mdev); kfree(mdev->addrs); - mctp_dev_destroy(mdev); + mctp_dev_put(mdev); } static int mctp_register(struct net_device *dev) @@ -353,11 +374,17 @@ static int mctp_register(struct net_device *dev) struct mctp_dev *mdev; /* Already registered? 
*/ - if (rtnl_dereference(dev->mctp_ptr)) + mdev = rtnl_dereference(dev->mctp_ptr); + + if (mdev) { + if (!mctp_known(dev)) + netdev_warn(dev, "%s: mctp_dev set for unknown type %d", + __func__, dev->type); return 0; + } - /* only register specific types; MCTP-specific and loopback for now */ - if (dev->type != ARPHRD_MCTP && dev->type != ARPHRD_LOOPBACK) + /* only register specific types */ + if (!mctp_known(dev)) return 0; mdev = mctp_add_dev(dev); diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c index 90ed2f02d1fb..5cc042121493 100644 --- a/net/mctp/neigh.c +++ b/net/mctp/neigh.c @@ -47,7 +47,7 @@ static int mctp_neigh_add(struct mctp_dev *mdev, mctp_eid_t eid, } INIT_LIST_HEAD(&neigh->list); neigh->dev = mdev; - dev_hold(neigh->dev->dev); + mctp_dev_hold(neigh->dev); neigh->eid = eid; neigh->source = source; memcpy(neigh->ha, lladdr, lladdr_len); @@ -63,7 +63,7 @@ static void __mctp_neigh_free(struct rcu_head *rcu) { struct mctp_neigh *neigh = container_of(rcu, struct mctp_neigh, rcu); - dev_put(neigh->dev->dev); + mctp_dev_put(neigh->dev); kfree(neigh); } diff --git a/net/mctp/route.c b/net/mctp/route.c index 5ca186d53cb0..82fb5ae524f6 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -11,6 +11,7 @@ */ #include <linux/idr.h> +#include <linux/kconfig.h> #include <linux/mctp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> @@ -23,7 +24,10 @@ #include <net/netlink.h> #include <net/sock.h> +#include <trace/events/mctp.h> + static const unsigned int mctp_message_maxlen = 64 * 1024; +static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ; /* route output callbacks */ static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) @@ -83,25 +87,43 @@ static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local, return true; } +/* returns a key (with key->lock held, and refcounted), or NULL if no such + * key exists. 
+ */ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb, - mctp_eid_t peer) + mctp_eid_t peer, + unsigned long *irqflags) + __acquires(&key->lock) { struct mctp_sk_key *key, *ret; + unsigned long flags; struct mctp_hdr *mh; u8 tag; - WARN_ON(!rcu_read_lock_held()); - mh = mctp_hdr(skb); tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); ret = NULL; + spin_lock_irqsave(&net->mctp.keys_lock, flags); + + hlist_for_each_entry(key, &net->mctp.keys, hlist) { + if (!mctp_key_match(key, mh->dest, peer, tag)) + continue; - hlist_for_each_entry_rcu(key, &net->mctp.keys, hlist) { - if (mctp_key_match(key, mh->dest, peer, tag)) { + spin_lock(&key->lock); + if (key->valid) { + refcount_inc(&key->refs); ret = key; break; } + spin_unlock(&key->lock); + } + + if (ret) { + spin_unlock(&net->mctp.keys_lock); + *irqflags = flags; + } else { + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); } return ret; @@ -121,11 +143,19 @@ static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, key->local_addr = local; key->tag = tag; key->sk = &msk->sk; - spin_lock_init(&key->reasm_lock); + key->valid = true; + spin_lock_init(&key->lock); + refcount_set(&key->refs, 1); return key; } +void mctp_key_unref(struct mctp_sk_key *key) +{ + if (refcount_dec_and_test(&key->refs)) + kfree(key); +} + static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) { struct net *net = sock_net(&msk->sk); @@ -138,12 +168,20 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { if (mctp_key_match(tmp, key->local_addr, key->peer_addr, key->tag)) { - rc = -EEXIST; - break; + spin_lock(&tmp->lock); + if (tmp->valid) + rc = -EEXIST; + spin_unlock(&tmp->lock); + if (rc) + break; } } if (!rc) { + refcount_inc(&key->refs); + key->expiry = jiffies + mctp_key_lifetime; + timer_reduce(&msk->key_expiry, key->expiry); + hlist_add_head(&key->hlist, &net->mctp.keys); hlist_add_head(&key->sklist, &msk->keys); } @@ -153,28 +191,35 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) return rc; } -/* Must be called with key->reasm_lock, which it will release. Will schedule - * the key for an RCU free. +/* We're done with the key; unset valid and remove from lists. There may still + * be outstanding refs on the key though... */ static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, unsigned long flags) - __releases(&key->reasm_lock) + __releases(&key->lock) { struct sk_buff *skb; skb = key->reasm_head; key->reasm_head = NULL; key->reasm_dead = true; - spin_unlock_irqrestore(&key->reasm_lock, flags); + key->valid = false; + spin_unlock_irqrestore(&key->lock, flags); spin_lock_irqsave(&net->mctp.keys_lock, flags); - hlist_del_rcu(&key->hlist); - hlist_del_rcu(&key->sklist); + hlist_del(&key->hlist); + hlist_del(&key->sklist); spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - kfree_rcu(key, rcu); + + /* one unref for the lists */ + mctp_key_unref(key); + + /* and one for the local reference */ + mctp_key_unref(key); if (skb) kfree_skb(skb); + } static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) @@ -248,8 +293,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) rcu_read_lock(); - /* lookup socket / reasm context, exactly matching (src,dest,tag) */ - key = mctp_lookup_key(net, skb, mh->src); + /* lookup socket / reasm context, exactly matching (src,dest,tag). + * we hold a ref on the key, and key->lock held. 
+ */ + key = mctp_lookup_key(net, skb, mh->src, &f); if (flags & MCTP_HDR_FLAG_SOM) { if (key) { @@ -260,10 +307,12 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * key for reassembly - we'll create a more specific * one for future packets if required (ie, !EOM). */ - key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY); + key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f); if (key) { msk = container_of(key->sk, struct mctp_sock, sk); + spin_unlock_irqrestore(&key->lock, f); + mctp_key_unref(key); key = NULL; } } @@ -282,11 +331,13 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (flags & MCTP_HDR_FLAG_EOM) { sock_queue_rcv_skb(&msk->sk, skb); if (key) { - spin_lock_irqsave(&key->reasm_lock, f); /* we've hit a pending reassembly; not much we * can do but drop it */ + trace_mctp_key_release(key, + MCTP_TRACE_KEY_REPLIED); __mctp_key_unlock_drop(key, net, f); + key = NULL; } rc = 0; goto out_unlock; @@ -303,7 +354,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) goto out_unlock; } - /* we can queue without the reasm lock here, as the + /* we can queue without the key lock here, as the * key isn't observable yet */ mctp_frag_queue(key, skb); @@ -318,17 +369,22 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (rc) kfree(key); - } else { - /* existing key: start reassembly */ - spin_lock_irqsave(&key->reasm_lock, f); + trace_mctp_key_acquire(key); + /* we don't need to release key->lock on exit */ + mctp_key_unref(key); + key = NULL; + + } else { if (key->reasm_head || key->reasm_dead) { /* duplicate start? drop everything */ + trace_mctp_key_release(key, + MCTP_TRACE_KEY_INVALIDATED); __mctp_key_unlock_drop(key, net, f); rc = -EEXIST; + key = NULL; } else { rc = mctp_frag_queue(key, skb); - spin_unlock_irqrestore(&key->reasm_lock, f); } } @@ -337,8 +393,6 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * using the message-specific key */ - spin_lock_irqsave(&key->reasm_lock, f); - /* we need to be continuing an existing reassembly... 
*/ if (!key->reasm_head) rc = -EINVAL; @@ -351,9 +405,9 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (!rc && flags & MCTP_HDR_FLAG_EOM) { sock_queue_rcv_skb(key->sk, key->reasm_head); key->reasm_head = NULL; + trace_mctp_key_release(key, MCTP_TRACE_KEY_REPLIED); __mctp_key_unlock_drop(key, net, f); - } else { - spin_unlock_irqrestore(&key->reasm_lock, f); + key = NULL; } } else { @@ -363,6 +417,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) out_unlock: rcu_read_unlock(); + if (key) { + spin_unlock_irqrestore(&key->lock, f); + mctp_key_unref(key); + } out: if (rc) kfree_skb(skb); @@ -412,7 +470,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) static void mctp_route_release(struct mctp_route *rt) { if (refcount_dec_and_test(&rt->refs)) { - dev_put(rt->dev->dev); + mctp_dev_put(rt->dev); kfree_rcu(rt, rcu); } } @@ -454,11 +512,15 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, lockdep_assert_held(&mns->keys_lock); + key->expiry = jiffies + mctp_key_lifetime; + timer_reduce(&msk->key_expiry, key->expiry); + /* we hold the net->key_lock here, allowing updates to both * then net and sk */ hlist_add_head_rcu(&key->hlist, &mns->keys); hlist_add_head_rcu(&key->sklist, &msk->keys); + refcount_inc(&key->refs); } /* Allocate a locally-owned tag value for (saddr, daddr), and reserve @@ -474,6 +536,10 @@ static int mctp_alloc_local_tag(struct mctp_sock *msk, int rc = -EAGAIN; u8 tagbits; + /* for NULL destination EIDs, we may get a response from any peer */ + if (daddr == MCTP_ADDR_NULL) + daddr = MCTP_ADDR_ANY; + /* be optimistic, alloc now */ key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL); if (!key) @@ -488,14 +554,26 @@ static int mctp_alloc_local_tag(struct mctp_sock *msk, * tags. If we find a conflict, clear that bit from tagbits */ hlist_for_each_entry(tmp, &mns->keys, hlist) { + /* We can check the lookup fields (*_addr, tag) without the + * lock held, they don't change over the lifetime of the key. + */ + /* if we don't own the tag, it can't conflict */ if (tmp->tag & MCTP_HDR_FLAG_TO) continue; - if ((tmp->peer_addr == daddr || - tmp->peer_addr == MCTP_ADDR_ANY) && - tmp->local_addr == saddr) + if (!((tmp->peer_addr == daddr || + tmp->peer_addr == MCTP_ADDR_ANY) && + tmp->local_addr == saddr)) + continue; + + spin_lock(&tmp->lock); + /* key must still be valid. If we find a match, clear the + * potential tag value + */ + if (tmp->valid) tagbits &= ~(1 << tmp->tag); + spin_unlock(&tmp->lock); if (!tagbits) break; @@ -504,7 +582,12 @@ static int mctp_alloc_local_tag(struct mctp_sock *msk, if (tagbits) { key->tag = __ffs(tagbits); mctp_reserve_tag(net, key, msk); + trace_mctp_key_acquire(key); + *tagp = key->tag; + /* done with the key in this scope */ + mctp_key_unref(key); + key = NULL; rc = 0; } @@ -552,6 +635,20 @@ struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, return rt; } +static struct mctp_route *mctp_route_lookup_null(struct net *net, + struct net_device *dev) +{ + struct mctp_route *rt; + + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (rt->dev->dev == dev && rt->type == RTN_LOCAL && + refcount_inc_not_zero(&rt->refs)) + return rt; + } + + return NULL; +} + /* sends a skb to rt and releases the route. 
*/ int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb) { @@ -741,7 +838,7 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, rt->max = daddr_start + daddr_extent; rt->mtu = mtu; rt->dev = mdev; - dev_hold(rt->dev->dev); + mctp_dev_hold(rt->dev); rt->type = type; rt->output = rtfn; @@ -821,13 +918,18 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, struct net_device *orig_dev) { struct net *net = dev_net(dev); + struct mctp_dev *mdev; struct mctp_skb_cb *cb; struct mctp_route *rt; struct mctp_hdr *mh; - /* basic non-data sanity checks */ - if (dev->type != ARPHRD_MCTP) + rcu_read_lock(); + mdev = __mctp_dev_get(dev); + rcu_read_unlock(); + if (!mdev) { + /* basic non-data sanity checks */ goto err_drop; + } if (!pskb_may_pull(skb, sizeof(struct mctp_hdr))) goto err_drop; @@ -841,11 +943,14 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, goto err_drop; cb = __mctp_cb(skb); - rcu_read_lock(); - cb->net = READ_ONCE(__mctp_dev_get(dev)->net); - rcu_read_unlock(); + cb->net = READ_ONCE(mdev->net); rt = mctp_route_lookup(net, cb->net, mh->dest); + + /* NULL EID, but addressed to our physical address */ + if (!rt && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST) + rt = mctp_route_lookup_null(net, dev); + if (!rt) goto err_drop; @@ -926,10 +1031,15 @@ static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = { + [RTAX_MTU] = { .type = NLA_U32 }, +}; + static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RTA_MAX + 1]; + struct nlattr *tbx[RTAX_MAX + 1]; mctp_eid_t daddr_start; struct mctp_dev *mdev; struct rtmsg *rtm; @@ -946,8 +1056,15 @@ static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } - /* TODO: parse mtu from nlparse */ mtu = 0; + if (tb[RTA_METRICS]) { + rc = nla_parse_nested(tbx, RTAX_MAX, tb[RTA_METRICS], + rta_metrics_policy, NULL); + if (rc < 0) + return rc; + if (tbx[RTAX_MTU]) + mtu = nla_get_u32(tbx[RTAX_MTU]); + } if (rtm->rtm_type != RTN_UNICAST) return -EINVAL; @@ -1116,3 +1233,7 @@ void __exit mctp_routes_exit(void) rtnl_unregister(PF_MCTP, RTM_GETROUTE); dev_remove_pack(&mctp_packet_type); } + +#if IS_ENABLED(CONFIG_MCTP_TEST) +#include "test/route-test.c" +#endif diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c new file mode 100644 index 000000000000..36fac3daf86a --- /dev/null +++ b/net/mctp/test/route-test.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <kunit/test.h> + +#include "utils.h" + +struct mctp_test_route { + struct mctp_route rt; + struct sk_buff_head pkts; +}; + +static int mctp_test_route_output(struct mctp_route *rt, struct sk_buff *skb) +{ + struct mctp_test_route *test_rt = container_of(rt, struct mctp_test_route, rt); + + skb_queue_tail(&test_rt->pkts, skb); + + return 0; +} + +/* local version of mctp_route_alloc() */ +static struct mctp_test_route *mctp_route_test_alloc(void) +{ + struct mctp_test_route *rt; + + rt = kzalloc(sizeof(*rt), GFP_KERNEL); + if (!rt) + return NULL; + + INIT_LIST_HEAD(&rt->rt.list); + refcount_set(&rt->rt.refs, 1); + rt->rt.output = mctp_test_route_output; + + skb_queue_head_init(&rt->pkts); + + return rt; +} + +static struct mctp_test_route *mctp_test_create_route(struct net *net, + struct mctp_dev *dev, + mctp_eid_t eid, + unsigned int mtu) +{ + struct mctp_test_route *rt; + + rt = 
mctp_route_test_alloc(); + if (!rt) + return NULL; + + rt->rt.min = eid; + rt->rt.max = eid; + rt->rt.mtu = mtu; + rt->rt.type = RTN_UNSPEC; + if (dev) + mctp_dev_hold(dev); + rt->rt.dev = dev; + + list_add_rcu(&rt->rt.list, &net->mctp.routes); + + return rt; +} + +static void mctp_test_route_destroy(struct kunit *test, + struct mctp_test_route *rt) +{ + unsigned int refs; + + rtnl_lock(); + list_del_rcu(&rt->rt.list); + rtnl_unlock(); + + skb_queue_purge(&rt->pkts); + if (rt->rt.dev) + mctp_dev_put(rt->rt.dev); + + refs = refcount_read(&rt->rt.refs); + KUNIT_ASSERT_EQ_MSG(test, refs, 1, "route ref imbalance"); + + kfree_rcu(&rt->rt, rcu); +} + +static struct sk_buff *mctp_test_create_skb(const struct mctp_hdr *hdr, + unsigned int data_len) +{ + size_t hdr_len = sizeof(*hdr); + struct sk_buff *skb; + unsigned int i; + u8 *buf; + + skb = alloc_skb(hdr_len + data_len, GFP_KERNEL); + if (!skb) + return NULL; + + memcpy(skb_put(skb, hdr_len), hdr, hdr_len); + + buf = skb_put(skb, data_len); + for (i = 0; i < data_len; i++) + buf[i] = i & 0xff; + + return skb; +} + +static struct sk_buff *__mctp_test_create_skb_data(const struct mctp_hdr *hdr, + const void *data, + size_t data_len) +{ + size_t hdr_len = sizeof(*hdr); + struct sk_buff *skb; + + skb = alloc_skb(hdr_len + data_len, GFP_KERNEL); + if (!skb) + return NULL; + + memcpy(skb_put(skb, hdr_len), hdr, hdr_len); + memcpy(skb_put(skb, data_len), data, data_len); + + return skb; +} + +#define mctp_test_create_skb_data(h, d) \ + __mctp_test_create_skb_data(h, d, sizeof(*d)) + +struct mctp_frag_test { + unsigned int mtu; + unsigned int msgsize; + unsigned int n_frags; +}; + +static void mctp_test_fragment(struct kunit *test) +{ + const struct mctp_frag_test *params; + int rc, i, n, mtu, msgsize; + struct mctp_test_route *rt; + struct sk_buff *skb; + struct mctp_hdr hdr; + u8 seq; + + params = test->param_value; + mtu = params->mtu; + msgsize = params->msgsize; + + hdr.ver = 1; + hdr.src = 8; + hdr.dest = 10; + hdr.flags_seq_tag = MCTP_HDR_FLAG_TO; + + skb = mctp_test_create_skb(&hdr, msgsize); + KUNIT_ASSERT_TRUE(test, skb); + + rt = mctp_test_create_route(&init_net, NULL, 10, mtu); + KUNIT_ASSERT_TRUE(test, rt); + + /* The refcount would usually be incremented as part of a route lookup, + * but we're setting the route directly here. 
+ */ + refcount_inc(&rt->rt.refs); + + rc = mctp_do_fragment_route(&rt->rt, skb, mtu, MCTP_TAG_OWNER); + KUNIT_EXPECT_FALSE(test, rc); + + n = rt->pkts.qlen; + + KUNIT_EXPECT_EQ(test, n, params->n_frags); + + for (i = 0;; i++) { + struct mctp_hdr *hdr2; + struct sk_buff *skb2; + u8 tag_mask, seq2; + bool first, last; + + first = i == 0; + last = i == (n - 1); + + skb2 = skb_dequeue(&rt->pkts); + + if (!skb2) + break; + + hdr2 = mctp_hdr(skb2); + + tag_mask = MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO; + + KUNIT_EXPECT_EQ(test, hdr2->ver, hdr.ver); + KUNIT_EXPECT_EQ(test, hdr2->src, hdr.src); + KUNIT_EXPECT_EQ(test, hdr2->dest, hdr.dest); + KUNIT_EXPECT_EQ(test, hdr2->flags_seq_tag & tag_mask, + hdr.flags_seq_tag & tag_mask); + + KUNIT_EXPECT_EQ(test, + !!(hdr2->flags_seq_tag & MCTP_HDR_FLAG_SOM), first); + KUNIT_EXPECT_EQ(test, + !!(hdr2->flags_seq_tag & MCTP_HDR_FLAG_EOM), last); + + seq2 = (hdr2->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT) & + MCTP_HDR_SEQ_MASK; + + if (first) { + seq = seq2; + } else { + seq++; + KUNIT_EXPECT_EQ(test, seq2, seq & MCTP_HDR_SEQ_MASK); + } + + if (!last) + KUNIT_EXPECT_EQ(test, skb2->len, mtu); + else + KUNIT_EXPECT_LE(test, skb2->len, mtu); + + kfree_skb(skb2); + } + + mctp_test_route_destroy(test, rt); +} + +static const struct mctp_frag_test mctp_frag_tests[] = { + {.mtu = 68, .msgsize = 63, .n_frags = 1}, + {.mtu = 68, .msgsize = 64, .n_frags = 1}, + {.mtu = 68, .msgsize = 65, .n_frags = 2}, + {.mtu = 68, .msgsize = 66, .n_frags = 2}, + {.mtu = 68, .msgsize = 127, .n_frags = 2}, + {.mtu = 68, .msgsize = 128, .n_frags = 2}, + {.mtu = 68, .msgsize = 129, .n_frags = 3}, + {.mtu = 68, .msgsize = 130, .n_frags = 3}, +}; + +static void mctp_frag_test_to_desc(const struct mctp_frag_test *t, char *desc) +{ + sprintf(desc, "mtu %d len %d -> %d frags", + t->msgsize, t->mtu, t->n_frags); +} + +KUNIT_ARRAY_PARAM(mctp_frag, mctp_frag_tests, mctp_frag_test_to_desc); + +struct mctp_rx_input_test { + struct mctp_hdr hdr; + bool input; +}; + +static void mctp_test_rx_input(struct kunit *test) +{ + const struct mctp_rx_input_test *params; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct sk_buff *skb; + + params = test->param_value; + + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + + skb = mctp_test_create_skb(¶ms->hdr, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + + __mctp_cb(skb); + + mctp_pkttype_receive(skb, dev->ndev, &mctp_packet_type, NULL); + + KUNIT_EXPECT_EQ(test, !!rt->pkts.qlen, params->input); + + mctp_test_route_destroy(test, rt); + mctp_test_destroy_dev(dev); +} + +#define RX_HDR(_ver, _src, _dest, _fst) \ + { .ver = _ver, .src = _src, .dest = _dest, .flags_seq_tag = _fst } + +/* we have a route for EID 8 only */ +static const struct mctp_rx_input_test mctp_rx_input_tests[] = { + { .hdr = RX_HDR(1, 10, 8, 0), .input = true }, + { .hdr = RX_HDR(1, 10, 9, 0), .input = false }, /* no input route */ + { .hdr = RX_HDR(2, 10, 8, 0), .input = false }, /* invalid version */ +}; + +static void mctp_rx_input_test_to_desc(const struct mctp_rx_input_test *t, + char *desc) +{ + sprintf(desc, "{%x,%x,%x,%x}", t->hdr.ver, t->hdr.src, t->hdr.dest, + t->hdr.flags_seq_tag); +} + +KUNIT_ARRAY_PARAM(mctp_rx_input, mctp_rx_input_tests, + mctp_rx_input_test_to_desc); + +/* set up a local dev, route on EID 8, and a socket listening on type 0 */ +static void __mctp_route_test_init(struct kunit *test, + struct mctp_test_dev **devp, + struct 
mctp_test_route **rtp, + struct socket **sockp) +{ + struct sockaddr_mctp addr; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + int rc; + + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + + rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); + KUNIT_ASSERT_EQ(test, rc, 0); + + addr.smctp_family = AF_MCTP; + addr.smctp_network = MCTP_NET_ANY; + addr.smctp_addr.s_addr = 8; + addr.smctp_type = 0; + rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr)); + KUNIT_ASSERT_EQ(test, rc, 0); + + *rtp = rt; + *devp = dev; + *sockp = sock; +} + +static void __mctp_route_test_fini(struct kunit *test, + struct mctp_test_dev *dev, + struct mctp_test_route *rt, + struct socket *sock) +{ + sock_release(sock); + mctp_test_route_destroy(test, rt); + mctp_test_destroy_dev(dev); +} + +struct mctp_route_input_sk_test { + struct mctp_hdr hdr; + u8 type; + bool deliver; +}; + +static void mctp_test_route_input_sk(struct kunit *test) +{ + const struct mctp_route_input_sk_test *params; + struct sk_buff *skb, *skb2; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + int rc; + + params = test->param_value; + + __mctp_route_test_init(test, &dev, &rt, &sock); + + skb = mctp_test_create_skb_data(¶ms->hdr, ¶ms->type); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + + skb->dev = dev->ndev; + __mctp_cb(skb); + + rc = mctp_route_input(&rt->rt, skb); + + if (params->deliver) { + KUNIT_EXPECT_EQ(test, rc, 0); + + skb2 = skb_recv_datagram(sock->sk, 0, 1, &rc); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, skb2); + KUNIT_EXPECT_EQ(test, skb->len, 1); + + skb_free_datagram(sock->sk, skb2); + + } else { + KUNIT_EXPECT_NE(test, rc, 0); + skb2 = skb_recv_datagram(sock->sk, 0, 1, &rc); + KUNIT_EXPECT_PTR_EQ(test, skb2, NULL); + } + + __mctp_route_test_fini(test, dev, rt, sock); +} + +#define FL_S (MCTP_HDR_FLAG_SOM) +#define FL_E (MCTP_HDR_FLAG_EOM) +#define FL_T (MCTP_HDR_FLAG_TO) + +static const struct mctp_route_input_sk_test mctp_route_input_sk_tests[] = { + { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T), .type = 0, .deliver = true }, + { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T), .type = 1, .deliver = false }, + { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E), .type = 0, .deliver = false }, + { .hdr = RX_HDR(1, 10, 8, FL_E | FL_T), .type = 0, .deliver = false }, + { .hdr = RX_HDR(1, 10, 8, FL_T), .type = 0, .deliver = false }, + { .hdr = RX_HDR(1, 10, 8, 0), .type = 0, .deliver = false }, +}; + +static void mctp_route_input_sk_to_desc(const struct mctp_route_input_sk_test *t, + char *desc) +{ + sprintf(desc, "{%x,%x,%x,%x} type %d", t->hdr.ver, t->hdr.src, + t->hdr.dest, t->hdr.flags_seq_tag, t->type); +} + +KUNIT_ARRAY_PARAM(mctp_route_input_sk, mctp_route_input_sk_tests, + mctp_route_input_sk_to_desc); + +struct mctp_route_input_sk_reasm_test { + const char *name; + struct mctp_hdr hdrs[4]; + int n_hdrs; + int rx_len; +}; + +static void mctp_test_route_input_sk_reasm(struct kunit *test) +{ + const struct mctp_route_input_sk_reasm_test *params; + struct sk_buff *skb, *skb2; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + int i, rc; + u8 c; + + params = test->param_value; + + __mctp_route_test_init(test, &dev, &rt, &sock); + + for (i = 0; i < params->n_hdrs; i++) { + c = i; + skb = mctp_test_create_skb_data(¶ms->hdrs[i], &c); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + + skb->dev = dev->ndev; + 
__mctp_cb(skb); + + rc = mctp_route_input(&rt->rt, skb); + } + + skb2 = skb_recv_datagram(sock->sk, 0, 1, &rc); + + if (params->rx_len) { + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, skb2); + KUNIT_EXPECT_EQ(test, skb2->len, params->rx_len); + skb_free_datagram(sock->sk, skb2); + + } else { + KUNIT_EXPECT_PTR_EQ(test, skb2, NULL); + } + + __mctp_route_test_fini(test, dev, rt, sock); +} + +#define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_T | (f) | ((s) << MCTP_HDR_SEQ_SHIFT)) + +static const struct mctp_route_input_sk_reasm_test mctp_route_input_sk_reasm_tests[] = { + { + .name = "single packet", + .hdrs = { + RX_FRAG(FL_S | FL_E, 0), + }, + .n_hdrs = 1, + .rx_len = 1, + }, + { + .name = "single packet, offset seq", + .hdrs = { + RX_FRAG(FL_S | FL_E, 1), + }, + .n_hdrs = 1, + .rx_len = 1, + }, + { + .name = "start & end packets", + .hdrs = { + RX_FRAG(FL_S, 0), + RX_FRAG(FL_E, 1), + }, + .n_hdrs = 2, + .rx_len = 2, + }, + { + .name = "start & end packets, offset seq", + .hdrs = { + RX_FRAG(FL_S, 1), + RX_FRAG(FL_E, 2), + }, + .n_hdrs = 2, + .rx_len = 2, + }, + { + .name = "start & end packets, out of order", + .hdrs = { + RX_FRAG(FL_E, 1), + RX_FRAG(FL_S, 0), + }, + .n_hdrs = 2, + .rx_len = 0, + }, + { + .name = "start, middle & end packets", + .hdrs = { + RX_FRAG(FL_S, 0), + RX_FRAG(0, 1), + RX_FRAG(FL_E, 2), + }, + .n_hdrs = 3, + .rx_len = 3, + }, + { + .name = "missing seq", + .hdrs = { + RX_FRAG(FL_S, 0), + RX_FRAG(FL_E, 2), + }, + .n_hdrs = 2, + .rx_len = 0, + }, + { + .name = "seq wrap", + .hdrs = { + RX_FRAG(FL_S, 3), + RX_FRAG(FL_E, 0), + }, + .n_hdrs = 2, + .rx_len = 2, + }, +}; + +static void mctp_route_input_sk_reasm_to_desc( + const struct mctp_route_input_sk_reasm_test *t, + char *desc) +{ + sprintf(desc, "%s", t->name); +} + +KUNIT_ARRAY_PARAM(mctp_route_input_sk_reasm, mctp_route_input_sk_reasm_tests, + mctp_route_input_sk_reasm_to_desc); + +static struct kunit_case mctp_test_cases[] = { + KUNIT_CASE_PARAM(mctp_test_fragment, mctp_frag_gen_params), + KUNIT_CASE_PARAM(mctp_test_rx_input, mctp_rx_input_gen_params), + KUNIT_CASE_PARAM(mctp_test_route_input_sk, mctp_route_input_sk_gen_params), + KUNIT_CASE_PARAM(mctp_test_route_input_sk_reasm, + mctp_route_input_sk_reasm_gen_params), + {} +}; + +static struct kunit_suite mctp_test_suite = { + .name = "mctp", + .test_cases = mctp_test_cases, +}; + +kunit_test_suite(mctp_test_suite); diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c new file mode 100644 index 000000000000..cc6b8803aa9d --- /dev/null +++ b/net/mctp/test/utils.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/netdevice.h> +#include <linux/mctp.h> +#include <linux/if_arp.h> + +#include <net/mctpdevice.h> +#include <net/pkt_sched.h> + +#include "utils.h" + +static netdev_tx_t mctp_test_dev_tx(struct sk_buff *skb, + struct net_device *ndev) +{ + kfree(skb); + return NETDEV_TX_OK; +} + +static const struct net_device_ops mctp_test_netdev_ops = { + .ndo_start_xmit = mctp_test_dev_tx, +}; + +static void mctp_test_dev_setup(struct net_device *ndev) +{ + ndev->type = ARPHRD_MCTP; + ndev->mtu = MCTP_DEV_TEST_MTU; + ndev->hard_header_len = 0; + ndev->addr_len = 0; + ndev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + ndev->flags = IFF_NOARP; + ndev->netdev_ops = &mctp_test_netdev_ops; + ndev->needs_free_netdev = true; +} + +struct mctp_test_dev *mctp_test_create_dev(void) +{ + struct mctp_test_dev *dev; + struct net_device *ndev; + int rc; + + ndev = alloc_netdev(sizeof(*dev), "mctptest%d", NET_NAME_ENUM, + mctp_test_dev_setup); + if (!ndev) + return NULL; + + dev 
= netdev_priv(ndev); + dev->ndev = ndev; + + rc = register_netdev(ndev); + if (rc) { + free_netdev(ndev); + return NULL; + } + + rcu_read_lock(); + dev->mdev = __mctp_dev_get(ndev); + mctp_dev_hold(dev->mdev); + rcu_read_unlock(); + + return dev; +} + +void mctp_test_destroy_dev(struct mctp_test_dev *dev) +{ + mctp_dev_put(dev->mdev); + unregister_netdev(dev->ndev); +} diff --git a/net/mctp/test/utils.h b/net/mctp/test/utils.h new file mode 100644 index 000000000000..df6aa1c03440 --- /dev/null +++ b/net/mctp/test/utils.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NET_MCTP_TEST_UTILS_H +#define __NET_MCTP_TEST_UTILS_H + +#include <kunit/test.h> + +#define MCTP_DEV_TEST_MTU 68 + +struct mctp_test_dev { + struct net_device *ndev; + struct mctp_dev *mdev; +}; + +struct mctp_test_dev; + +struct mctp_test_dev *mctp_test_create_dev(void); +void mctp_test_destroy_dev(struct mctp_test_dev *dev); + +#endif /* __NET_MCTP_TEST_UTILS_H */ diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index b21ff9be04c6..3240b72271a7 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -72,6 +72,7 @@ bool mptcp_mib_alloc(struct net *net) void mptcp_seq_show(struct seq_file *seq) { + unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1]; struct net *net = seq->private; int i; @@ -81,17 +82,13 @@ void mptcp_seq_show(struct seq_file *seq) seq_puts(seq, "\nMPTcpExt:"); - if (!net->mib.mptcp_statistics) { - for (i = 0; mptcp_snmp_list[i].name; i++) - seq_puts(seq, " 0"); - - seq_putc(seq, '\n'); - return; - } + memset(sum, 0, sizeof(sum)); + if (net->mib.mptcp_statistics) + snmp_get_cpu_field_batch(sum, mptcp_snmp_list, + net->mib.mptcp_statistics); for (i = 0; mptcp_snmp_list[i].name; i++) - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.mptcp_statistics, - mptcp_snmp_list[i].entry)); + seq_printf(seq, " %lu", sum[i]); + seq_putc(seq, '\n'); } diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index fdf0c1f15a65..f44125dd6697 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -36,7 +36,7 @@ static int mptcp_diag_dump_one(struct netlink_callback *cb, struct sock *sk; net = sock_net(in_skb->sk); - msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + msk = mptcp_token_get_sock(net, req->id.idiag_cookie[0]); if (!msk) goto out_nosk; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c41273cefc51..422f4acfb3e6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -748,9 +748,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, /* can't send MP_PRIO with MPC, as they share the same option space: * 'backup'. Also it makes no sense at all */ - if (!subflow->send_mp_prio || - ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | - OPTION_MPTCP_MPC_ACK) & opts->suboptions)) + if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ @@ -1019,11 +1017,9 @@ static void ack_update_msk(struct mptcp_sock *msk, old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); - /* ACK for data not even sent yet and even above recovery bound? Ignore.*/ - if (unlikely(after64(new_snd_una, snd_nxt))) { - if (!msk->recovery || after64(new_snd_una, msk->recovery_snd_nxt)) - new_snd_una = old_snd_una; - } + /* ACK for data not even sent yet? 
Ignore.*/ + if (unlikely(after64(new_snd_una, snd_nxt))) + new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; @@ -1329,8 +1325,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } } - } else if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | - OPTION_MPTCP_MPC_ACK) & opts->suboptions) { + } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index c4f9a5ce3815..7b96be1e9f14 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -654,9 +654,9 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) } } -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - u8 bkup) +static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + u8 bkup) { struct mptcp_subflow_context *subflow; @@ -1718,9 +1718,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) list_for_each_entry(entry, &pernet->local_addr_list, list) { if (addresses_equal(&entry->addr, &addr.addr, true)) { - ret = mptcp_nl_addr_backup(net, &entry->addr, bkup); - if (ret) - return ret; + mptcp_nl_addr_backup(net, &entry->addr, bkup); if (bkup) entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; @@ -2054,6 +2052,9 @@ static int __net_init pm_nl_init_net(struct net *net) struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + + /* Cit. 2 subflows ought to be enough for anybody. */ + pernet->subflows_max = 2; pernet->next_id = 1; pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2602f1386160..cd6b11c9b54d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -528,7 +528,6 @@ static bool mptcp_check_data_fin(struct sock *sk) sk->sk_shutdown |= RCV_SHUTDOWN; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); switch (sk->sk_state) { case TCP_ESTABLISHED: @@ -742,10 +741,9 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); - if (move_skbs_to_msk(msk, ssk)) { - set_bit(MPTCP_DATA_READY, &msk->flags); + if (move_skbs_to_msk(msk, ssk)) sk->sk_data_ready(sk); - } + mptcp_data_unlock(sk); } @@ -847,7 +845,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) sk->sk_shutdown |= RCV_SHUTDOWN; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); } @@ -956,9 +953,7 @@ static void __mptcp_update_wmem(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); -#ifdef CONFIG_LOCKDEP - WARN_ON_ONCE(!lockdep_is_held(&sk->sk_lock.slock)); -#endif + lockdep_assert_held_once(&sk->sk_lock.slock); if (!msk->wmem_reserved) return; @@ -1107,7 +1102,8 @@ out: if (cleaned && tcp_under_memory_pressure(sk)) __mptcp_mem_reclaim_partial(sk); - if (snd_una == READ_ONCE(msk->snd_nxt) && !msk->recovery) { + if (snd_una == READ_ONCE(msk->snd_nxt) && + snd_una == READ_ONCE(msk->write_seq)) { if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_timer(sk); } else { @@ -1117,9 +1113,8 @@ out: static void __mptcp_clean_una_wakeup(struct sock *sk) { -#ifdef CONFIG_LOCKDEP - WARN_ON_ONCE(!lockdep_is_held(&sk->sk_lock.slock)); -#endif + lockdep_assert_held_once(&sk->sk_lock.slock); + 
__mptcp_clean_una(sk); mptcp_write_space(sk); } @@ -1224,6 +1219,7 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) if (likely(__mptcp_add_ext(skb, gfp))) { skb_reserve(skb, MAX_TCP_HEADER); skb->reserved_tailroom = skb->end - skb->tail; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); @@ -1233,31 +1229,23 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) return NULL; } -static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) +static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { struct sk_buff *skb; - if (ssk->sk_tx_skb_cache) { - skb = ssk->sk_tx_skb_cache; - if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) && - !__mptcp_add_ext(skb, gfp))) - return false; - return true; - } - skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) - return false; + return NULL; if (likely(sk_wmem_schedule(ssk, skb->truesize))) { - ssk->sk_tx_skb_cache = skb; - return true; + tcp_skb_entail(ssk, skb); + return skb; } kfree_skb(skb); - return false; + return NULL; } -static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) +static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; @@ -1287,23 +1275,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { u64 data_seq = dfrag->data_seq + info->sent; + int offset = dfrag->offset + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; - struct sk_buff *skb, *tail; - bool must_collapse = false; - int size_bias = 0; - int avail_size; - size_t ret = 0; + bool can_coalesce = false; + bool reuse_skb = true; + struct sk_buff *skb; + size_t copy; + int i; pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); + if (WARN_ON_ONCE(info->sent > info->limit || + info->limit > dfrag->data_len)) + return 0; + /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); - avail_size = info->size_goal; + copy = info->size_goal; + skb = tcp_write_queue_tail(ssk); - if (skb) { + if (skb && copy > skb->len) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later @@ -1316,62 +1310,80 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, goto alloc_skb; } - must_collapse = (info->size_goal - skb->len > 0) && - (skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags); - if (must_collapse) { - size_bias = skb->len; - avail_size = info->size_goal - skb->len; + i = skb_shinfo(skb)->nr_frags; + can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); + if (!can_coalesce && i >= sysctl_max_skb_frags) { + tcp_mark_push(tcp_sk(ssk), skb); + goto alloc_skb; } - } + copy -= skb->len; + } else { alloc_skb: - if (!must_collapse && !ssk->sk_tx_skb_cache && - !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held)) - return 0; + skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); + if (!skb) + return -ENOMEM; + + i = skb_shinfo(skb)->nr_frags; + reuse_skb = false; + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + } /* Zero window and all data acked? Probe. 
*/ - avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); - if (avail_size == 0) { + copy = mptcp_check_allowed_size(msk, data_seq, copy); + if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); - if (skb || snd_una != msk->snd_nxt) + if (snd_una != msk->snd_nxt) { + tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk)); return 0; + } + zero_window_probe = true; data_seq = snd_una - 1; - avail_size = 1; - } + copy = 1; - if (WARN_ON_ONCE(info->sent > info->limit || - info->limit > dfrag->data_len)) - return 0; + /* all mptcp-level data is acked, no skbs should be present into the + * ssk write queue + */ + WARN_ON_ONCE(reuse_skb); + } - ret = info->limit - info->sent; - tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, - dfrag->page, dfrag->offset + info->sent, &ret); - if (!tail) { - tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); + copy = min_t(size_t, copy, info->limit - info->sent); + if (!sk_wmem_schedule(ssk, copy)) { + tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk)); return -ENOMEM; } - /* if the tail skb is still the cached one, collapsing really happened. - */ - if (skb == tail) { - TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; - mpext->data_len += ret; + if (can_coalesce) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + get_page(dfrag->page); + skb_fill_page_desc(skb, i, dfrag->page, offset, copy); + } + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk_wmem_queued_add(ssk, copy); + sk_mem_charge(ssk, copy); + skb->ip_summed = CHECKSUM_PARTIAL; + WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + + /* on skb reuse we just need to update the DSS len */ + if (reuse_skb) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; + mpext->data_len += copy; WARN_ON_ONCE(zero_window_probe); goto out; } - mpext = skb_ext_find(tail, SKB_EXT_MPTCP); - if (WARN_ON_ONCE(!mpext)) { - /* should never reach here, stream corrupted */ - return -EINVAL; - } - memset(mpext, 0, sizeof(*mpext)); mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; - mpext->data_len = ret; + mpext->data_len = copy; mpext->use_map = 1; mpext->dsn64 = 1; @@ -1380,18 +1392,18 @@ alloc_skb: mpext->dsn64); if (zero_window_probe) { - mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) - mptcp_update_data_checksum(tail, ret); + mptcp_update_data_checksum(skb, copy); tcp_push_pending_frames(ssk); return 0; } out: if (READ_ONCE(msk->csum_enabled)) - mptcp_update_data_checksum(tail, ret); - mptcp_subflow_ctx(ssk)->rel_write_seq += ret; - return ret; + mptcp_update_data_checksum(skb, copy); + mptcp_subflow_ctx(ssk)->rel_write_seq += copy; + return copy; } #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ @@ -1508,6 +1520,38 @@ static void mptcp_push_release(struct sock *sk, struct sock *ssk, release_sock(ssk); } +static void mptcp_update_post_push(struct mptcp_sock *msk, + struct mptcp_data_frag *dfrag, + u32 sent) +{ + u64 snd_nxt_new = dfrag->data_seq; + + dfrag->already_sent += sent; + + msk->snd_burst -= sent; + + snd_nxt_new += dfrag->already_sent; + + /* snd_nxt_new can be smaller than snd_nxt in case mptcp + * is recovering after a failover. In that event, this re-sends + * old segments. 
+ * + * Thus compute snd_nxt_new candidate based on + * the dfrag->data_seq that was sent and the data + * that has been handed to the subflow for transmission + * and skip update in case it was old dfrag. + */ + if (likely(after64(snd_nxt_new, msk->snd_nxt))) + msk->snd_nxt = snd_nxt_new; +} + +static void mptcp_check_and_set_pending(struct sock *sk) +{ + if (mptcp_send_head(sk) && + !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +} + void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; @@ -1551,12 +1595,10 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) } info.sent += ret; - dfrag->already_sent += ret; - msk->snd_nxt += ret; - msk->snd_burst -= ret; - msk->tx_pending_data -= ret; copied += ret; len -= ret; + + mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } @@ -1609,13 +1651,11 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) goto out; info.sent += ret; - dfrag->already_sent += ret; - msk->snd_nxt += ret; - msk->snd_burst -= ret; - msk->tx_pending_data -= ret; copied += ret; len -= ret; first = false; + + mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } @@ -1725,7 +1765,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); - msk->tx_pending_data += psize; /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk @@ -1759,21 +1798,6 @@ out: return copied ? : ret; } -static void mptcp_wait_data(struct sock *sk, long *timeo) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct mptcp_sock *msk = mptcp_sk(sk); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - - sk_wait_event(sk, timeo, - test_bit(MPTCP_DATA_READY, &msk->flags), &wait); - - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); -} - static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -2077,19 +2101,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld", timeo); - mptcp_wait_data(sk, &timeo); - } - - if (skb_queue_empty_lockless(&sk->sk_receive_queue) && - skb_queue_empty(&msk->receive_queue)) { - /* entire backlog drained, clear DATA_READY. */ - clear_bit(MPTCP_DATA_READY, &msk->flags); - - /* .. race-breaker: ssk might have gotten new data - * after last __mptcp_move_skbs() returned false. 
- */ - if (unlikely(__mptcp_move_skbs(msk))) - set_bit(MPTCP_DATA_READY, &msk->flags); + sk_wait_data(sk, &timeo, NULL); } out_err: @@ -2098,9 +2110,9 @@ out_err: tcp_recv_timestamp(msg, sk, &tss); } - pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", - msk, test_bit(MPTCP_DATA_READY, &msk->flags), - skb_queue_empty_lockless(&sk->sk_receive_queue), copied); + pr_debug("msk=%p rx queue empty=%d:%d copied=%d", + msk, skb_queue_empty_lockless(&sk->sk_receive_queue), + skb_queue_empty(&msk->receive_queue), copied); if (!(flags & MSG_PEEK)) mptcp_rcv_space_adjust(msk, copied); @@ -2213,15 +2225,11 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) return false; } - /* will accept ack for reijected data before re-sending them */ - if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt)) - msk->recovery_snd_nxt = msk->snd_nxt; + msk->recovery_snd_nxt = msk->snd_nxt; msk->recovery = true; mptcp_data_unlock(sk); msk->first_pending = rtx_head; - msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq; - msk->snd_nxt = rtx_head->data_seq; msk->snd_burst = 0; /* be sure to clear the "sent status" on all re-injected fragments */ @@ -2368,7 +2376,6 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) inet_sk_state_store(sk, TCP_CLOSE); sk->sk_shutdown = SHUTDOWN_MASK; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); mptcp_close_wake_up(sk); @@ -2384,6 +2391,9 @@ static void __mptcp_retrans(struct sock *sk) int ret; mptcp_clean_una_wakeup(sk); + + /* first check ssk: need to kick "stale" logic */ + ssk = mptcp_subflow_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { @@ -2396,10 +2406,12 @@ static void __mptcp_retrans(struct sock *sk) goto reset_timer; } - return; + if (!mptcp_send_head(sk)) + return; + + goto reset_timer; } - ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_timer; @@ -2426,6 +2438,8 @@ static void __mptcp_retrans(struct sock *sk) release_sock(ssk); reset_timer: + mptcp_check_and_set_pending(sk); + if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); } @@ -2492,7 +2506,6 @@ static int __mptcp_init_sock(struct sock *sk) msk->first_pending = NULL; msk->wmem_reserved = 0; WRITE_ONCE(msk->rmem_released, 0); - msk->tx_pending_data = 0; msk->timer_ival = TCP_RTO_MIN; msk->first = NULL; @@ -2735,7 +2748,7 @@ cleanup: inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; mptcp_for_each_subflow(mptcp_sk(sk), subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow = lock_sock_fast(ssk); + bool slow = lock_sock_fast_nested(ssk); sock_orphan(ssk); unlock_sock_fast(ssk, slow); @@ -3385,8 +3398,14 @@ unlock_fail: static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { - return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : - 0; + /* Concurrent splices from sk_receive_queue into receive_queue will + * always show at least one non-empty queue when checked in this order. 
+ */ + if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && + skb_queue_empty_lockless(&msk->receive_queue)) + return 0; + + return EPOLLIN | EPOLLRDNORM; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) @@ -3421,7 +3440,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) - return mptcp_check_readable(msk); + return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 0; if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d3e6fd1615f1..284fdcec067e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -254,7 +254,6 @@ struct mptcp_sock { struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; - int tx_pending_data; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; @@ -709,7 +708,7 @@ int mptcp_token_new_connect(struct sock *sk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); -struct mptcp_sock *mptcp_token_get_sock(u32 token); +struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token); struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num); void mptcp_token_destroy(struct mptcp_sock *msk); @@ -737,9 +736,6 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 8137cc3a4296..0f1e661c2032 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -861,6 +861,9 @@ static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addr } else if (sk->sk_family == AF_INET6) { const struct ipv6_pinfo *np = inet6_sk(sk); + if (WARN_ON_ONCE(!np)) + return; + a->sin6_local.sin6_family = AF_INET6; a->sin6_local.sin6_port = inet->inet_sport; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1de7ce883c37..6172f380dfb7 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -86,7 +86,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) struct mptcp_sock *msk; int local_id; - msk = mptcp_token_get_sock(subflow_req->token); + msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return NULL; diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c index 37127781aee9..7f22526346a7 100644 --- a/net/mptcp/syncookies.c +++ b/net/mptcp/syncookies.c @@ -108,18 +108,12 @@ bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subfl e->valid = 0; - msk = mptcp_token_get_sock(e->token); + msk = mptcp_token_get_sock(net, e->token); if (!msk) { spin_unlock_bh(&join_entry_locks[i]); return false; } - /* If this fails, the token got re-used in the mean time by another - * mptcp socket in a different netns, i.e. entry is outdated. 
- */ - if (!net_eq(sock_net((struct sock *)msk), net)) - goto err_put; - subflow_req->remote_nonce = e->remote_nonce; subflow_req->local_nonce = e->local_nonce; subflow_req->backup = e->backup; @@ -128,11 +122,6 @@ bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subfl subflow_req->msk = msk; spin_unlock_bh(&join_entry_locks[i]); return true; - -err_put: - spin_unlock_bh(&join_entry_locks[i]); - sock_put((struct sock *)msk); - return false; } void __init mptcp_join_cookie_init(void) diff --git a/net/mptcp/token.c b/net/mptcp/token.c index a98e554b034f..e581b341c5be 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -231,6 +231,7 @@ found: /** * mptcp_token_get_sock - retrieve mptcp connection sock using its token + * @net: restrict to this namespace * @token: token of the mptcp connection to retrieve * * This function returns the mptcp connection structure with the given token. @@ -238,7 +239,7 @@ found: * * returns NULL if no connection with the given token value exists. */ -struct mptcp_sock *mptcp_token_get_sock(u32 token) +struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token) { struct hlist_nulls_node *pos; struct token_bucket *bucket; @@ -251,11 +252,15 @@ struct mptcp_sock *mptcp_token_get_sock(u32 token) again: sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { msk = mptcp_sk(sk); - if (READ_ONCE(msk->token) != token) + if (READ_ONCE(msk->token) != token || + !net_eq(sock_net(sk), net)) continue; + if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto not_found; - if (READ_ONCE(msk->token) != token) { + + if (READ_ONCE(msk->token) != token || + !net_eq(sock_net(sk), net)) { sock_put(sk); goto again; } diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c index e1bd6f0a0676..5d984bec1cd8 100644 --- a/net/mptcp/token_test.c +++ b/net/mptcp/token_test.c @@ -11,6 +11,7 @@ static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test) GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req); mptcp_token_init_request((struct request_sock *)req); + sock_net_set((struct sock *)req, &init_net); return req; } @@ -22,7 +23,7 @@ static void mptcp_token_test_req_basic(struct kunit *test) KUNIT_ASSERT_EQ(test, 0, mptcp_token_new_request((struct request_sock *)req)); KUNIT_EXPECT_NE(test, 0, (int)req->token); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, req->token)); /* cleanup */ mptcp_token_destroy_request((struct request_sock *)req); @@ -55,6 +56,7 @@ static struct mptcp_sock *build_msk(struct kunit *test) msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); refcount_set(&((struct sock *)msk)->sk_refcnt, 1); + sock_net_set((struct sock *)msk, &init_net); return msk; } @@ -74,11 +76,11 @@ static void mptcp_token_test_msk_basic(struct kunit *test) mptcp_token_new_connect((struct sock *)icsk)); KUNIT_EXPECT_NE(test, 0, (int)ctx->token); KUNIT_EXPECT_EQ(test, ctx->token, msk->token); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, ctx->token)); KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt)); mptcp_token_destroy(msk); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, ctx->token)); } static void mptcp_token_test_accept(struct kunit *test) @@ -90,11 +92,11 @@ static void mptcp_token_test_accept(struct kunit 
*test) mptcp_token_new_request((struct request_sock *)req)); msk->token = req->token; mptcp_token_accept(req, msk); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token)); /* this is now a no-op */ mptcp_token_destroy_request((struct request_sock *)req); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token)); /* cleanup */ mptcp_token_destroy(msk); @@ -116,7 +118,7 @@ static void mptcp_token_test_destroyed(struct kunit *test) /* simulate race on removal */ refcount_set(&sk->sk_refcnt, 0); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, msk->token)); /* cleanup */ mptcp_token_destroy(msk); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 54395266339d..3646fc195e7d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -10,6 +10,17 @@ config NETFILTER_INGRESS This allows you to classify packets from ingress using the Netfilter infrastructure. +config NETFILTER_EGRESS + bool "Netfilter egress support" + default y + select NET_EGRESS + help + This allows you to classify packets before transmission using the + Netfilter infrastructure. + +config NETFILTER_SKIP_EGRESS + def_bool NETFILTER_EGRESS && (NET_CLS_ACT || IFB) + config NETFILTER_NETLINK tristate @@ -109,7 +120,7 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' depends on NETWORK_SECMARK - default m if NETFILTER_ADVANCED=n + default y if NETFILTER_ADVANCED=n help This option enables security markings to be applied to connections. Typically they are copied to connections from diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 63d032191e62..6dec9cd395f1 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -317,6 +317,12 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, return &dev->nf_hooks_ingress; } #endif +#ifdef CONFIG_NETFILTER_EGRESS + if (hooknum == NF_NETDEV_EGRESS) { + if (dev && dev_net(dev) == net) + return &dev->nf_hooks_egress; + } +#endif WARN_ON_ONCE(1); return NULL; } @@ -335,7 +341,8 @@ static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg, return 0; } -static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) +static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg, + int pf) { if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) || (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS)) @@ -344,6 +351,12 @@ static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) return false; } +static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg, + int pf) +{ + return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS; +} + static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL @@ -383,9 +396,18 @@ static int __nf_register_net_hook(struct net *net, int pf, switch (pf) { case NFPROTO_NETDEV: - err = nf_ingress_check(net, reg, NF_NETDEV_INGRESS); - if (err < 0) - return err; +#ifndef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) + return -EOPNOTSUPP; +#endif +#ifndef CONFIG_NETFILTER_EGRESS + if (reg->hooknum == NF_NETDEV_EGRESS) + return -EOPNOTSUPP; +#endif + if ((reg->hooknum != NF_NETDEV_INGRESS && + reg->hooknum != NF_NETDEV_EGRESS) || + !reg->dev || 
dev_net(reg->dev) != net) + return -EINVAL; break; case NFPROTO_INET: if (reg->hooknum != NF_INET_INGRESS) @@ -418,6 +440,10 @@ static int __nf_register_net_hook(struct net *net, int pf, if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); #endif +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_egress_hook(reg, pf)) + net_inc_egress_queue(); +#endif nf_static_key_inc(reg, pf); BUG_ON(p == new_hooks); @@ -475,6 +501,10 @@ static void __nf_unregister_net_hook(struct net *net, int pf, if (nf_ingress_hook(reg, pf)) net_dec_ingress_queue(); #endif +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_egress_hook(reg, pf)) + net_dec_egress_queue(); +#endif nf_static_key_dec(reg, pf); } else { WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 6186358eac7c..6e391308431d 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -130,11 +130,11 @@ htable_size(u8 hbits) { size_t hsize; - /* We must fit both into u32 in jhash and size_t */ + /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */ if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) + if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index c100c6b112c8..2c467c422dc6 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1468,6 +1468,10 @@ int __init ip_vs_conn_init(void) int idx; /* Compute size and mask */ + if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { + pr_info("conn_tab_bits not in [8, 20]. Using default value\n"); + ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; + } ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 128690c512df..e93c937a8bf0 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1330,12 +1330,15 @@ drop: * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; + int af = state->pf; struct sock *sk; EnterFunction(11); @@ -1468,56 +1471,6 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in return NF_ACCEPT; } -/* - * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, - * used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_reply4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -/* - * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. 
- */ -static unsigned int -ip_vs_local_reply4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -#ifdef CONFIG_IP_VS_IPV6 - -/* - * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, - * used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_reply6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -/* - * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_local_reply6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -#endif - static unsigned int ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, @@ -1957,8 +1910,10 @@ out: * and send it on its way... */ static unsigned int -ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; @@ -1966,6 +1921,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int int ret, pkts; int conn_reuse_mode; struct sock *sk; + int af = state->pf; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) @@ -2138,55 +2094,6 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int } /* - * AF_INET handler in NF_INET_LOCAL_IN chain - * Schedule and forward packets from remote clients - */ -static unsigned int -ip_vs_remote_request4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -/* - * AF_INET handler in NF_INET_LOCAL_OUT chain - * Schedule and forward packets from local clients - */ -static unsigned int -ip_vs_local_request4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -#ifdef CONFIG_IP_VS_IPV6 - -/* - * AF_INET6 handler in NF_INET_LOCAL_IN chain - * Schedule and forward packets from remote clients - */ -static unsigned int -ip_vs_remote_request6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -/* - * AF_INET6 handler in NF_INET_LOCAL_OUT chain - * Schedule and forward packets from local clients - */ -static unsigned int -ip_vs_local_request6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -#endif - - -/* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP * related packets destined for 0.0.0.0/0. * When fwmark-based virtual service is used, such as transparent @@ -2199,45 +2106,36 @@ static unsigned int ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - int r; struct netns_ipvs *ipvs = net_ipvs(state->net); - - if (ip_hdr(skb)->protocol != IPPROTO_ICMP) - return NF_ACCEPT; + int r; /* ipvs enabled in this netns ? 
*/ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp(ipvs, skb, &r, state->hook); -} - + if (state->pf == NFPROTO_IPV4) { + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; #ifdef CONFIG_IP_VS_IPV6 -static unsigned int -ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - int r; - struct netns_ipvs *ipvs = net_ipvs(state->net); - struct ip_vs_iphdr iphdr; + } else { + struct ip_vs_iphdr iphdr; - ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); - if (iphdr.protocol != IPPROTO_ICMPV6) - return NF_ACCEPT; + ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); - /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) - return NF_ACCEPT; + if (iphdr.protocol != IPPROTO_ICMPV6) + return NF_ACCEPT; - return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); -} + return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); #endif + } + return ip_vs_in_icmp(ipvs, skb, &r, state->hook); +} static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, @@ -2246,21 +2144,21 @@ static const struct nf_hook_ops ip_vs_ops4[] = { * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { - .hook = ip_vs_remote_request4, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_local_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { - .hook = ip_vs_local_request4, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, @@ -2275,7 +2173,7 @@ static const struct nf_hook_ops ip_vs_ops4[] = { }, /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, @@ -2286,7 +2184,7 @@ static const struct nf_hook_ops ip_vs_ops4[] = { static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, @@ -2295,21 +2193,21 @@ static const struct nf_hook_ops ip_vs_ops6[] = { * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. 
*/ { - .hook = ip_vs_remote_request6, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_local_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { - .hook = ip_vs_local_request6, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, @@ -2317,14 +2215,14 @@ static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { - .hook = ip_vs_forward_icmp_v6, + .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c25097092a06..e62b40bd349e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2017,6 +2017,12 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "run_estimation", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -4090,6 +4096,13 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; + ipvs->sysctl_run_estimation = 1; + tbl[idx++].data = &ipvs->sysctl_run_estimation; +#ifdef CONFIG_IP_VS_DEBUG + /* Global sysctls must be ro in non-init netns */ + if (!net_eq(net, &init_net)) + tbl[idx++].mode = 0444; +#endif ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 05b8112ffb37..9a1a7af6a186 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -100,6 +100,9 @@ static void estimation_timer(struct timer_list *t) u64 rate; struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); + if (!sysctl_run_estimation(ipvs)) + goto skip; + spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); @@ -131,6 +134,8 @@ static void estimation_timer(struct timer_list *t) spin_unlock(&s->lock); } spin_unlock(&ipvs->est_lock); + +skip: mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 94e18fb9690d..770a63103c7a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -74,10 +74,14 @@ static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; +/* serialize hash resizes and nf_ct_iterate_cleanup */ +static DEFINE_MUTEX(nf_conntrack_mutex); + #define GC_SCAN_INTERVAL (120u * HZ) #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) -#define MAX_CHAINLEN 64u +#define MIN_CHAINLEN 8u +#define MAX_CHAINLEN (32u - MIN_CHAINLEN) static struct conntrack_gc_work conntrack_gc_work; @@ -188,11 +192,13 @@ seqcount_spinlock_t 
nf_conntrack_generation __read_mostly; static siphash_key_t nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, const struct net *net) { struct { struct nf_conntrack_man src; union nf_inet_addr dst_addr; + unsigned int zone; u32 net_mix; u16 dport; u16 proto; @@ -205,6 +211,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, /* The direction must be ignored, so handle usable members manually. */ combined.src = tuple->src; combined.dst_addr = tuple->dst.u3; + combined.zone = zoneid; combined.net_mix = net_hash_mix(net); combined.dport = (__force __u16)tuple->dst.u.all; combined.proto = tuple->dst.protonum; @@ -219,15 +226,17 @@ static u32 scale_hash(u32 hash) static u32 __hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, unsigned int size) { - return reciprocal_scale(hash_conntrack_raw(tuple, net), size); + return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); } static u32 hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, + unsigned int zoneid) { - return scale_hash(hash_conntrack_raw(tuple, net)); + return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); } static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, @@ -650,9 +659,11 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); clean_from_lists(ct); @@ -819,8 +830,20 @@ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, net)); + unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + struct nf_conntrack_tuple_hash *thash; + + thash = __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, zone_id, net)); + + if (thash) + return thash; + + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (rid != zone_id) + return __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, rid, net)); + return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -842,6 +865,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int max_chainlen; unsigned int chainlen = 0; unsigned int sequence; int err = -EEXIST; @@ -852,18 +876,22 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); + /* See if there's 
one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) + if (chainlen++ > max_chainlen) goto chaintoolong; } @@ -873,7 +901,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) + if (chainlen++ > max_chainlen) goto chaintoolong; } @@ -1103,8 +1131,8 @@ drop: int __nf_conntrack_confirm(struct sk_buff *skb) { + unsigned int chainlen = 0, sequence, max_chainlen; const struct nf_conntrack_zone *zone; - unsigned int chainlen = 0, sequence; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -1133,8 +1161,8 @@ __nf_conntrack_confirm(struct sk_buff *skb) hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = scale_hash(hash); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related @@ -1168,6 +1196,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto dying; } + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ @@ -1175,7 +1204,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) + if (chainlen++ > max_chainlen) goto chaintoolong; } @@ -1184,7 +1213,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; - if (chainlen++ > MAX_CHAINLEN) { + if (chainlen++ > max_chainlen) { chaintoolong: nf_ct_add_to_dying_list(ct); NF_CT_STAT_INC(net, chaintoolong); @@ -1246,7 +1275,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, rcu_read_lock(); begin: nf_conntrack_get_ht(&ct_hash, &hsize); - hash = __hash_conntrack(net, tuple, hsize); + hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); @@ -1687,8 +1716,8 @@ resolve_normal_ct(struct nf_conn *tmpl, struct nf_conntrack_tuple_hash *h; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; + u32 hash, zone_id, rid; struct nf_conn *ct; - u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, @@ -1699,8 +1728,20 @@ resolve_normal_ct(struct nf_conn *tmpl, /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple, state->net); + + zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + hash = hash_conntrack_raw(&tuple, zone_id, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); + + if (!h) { + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (zone_id != rid) { + u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); + + h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); + } + } + if (!h) { h = init_conntrack(state->net, tmpl, &tuple, skb, dataoff, hash); @@ -2225,28 +2266,31 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), 
spinlock_t *lockp; for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { + struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; + + if (hlist_nulls_empty(hslot)) + continue; + lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < nf_conntrack_htable_size) { - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { - if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) - continue; - /* All nf_conn objects are added to hash table twice, one - * for original direction tuple, once for the reply tuple. - * - * Exception: In the IPS_NAT_CLASH case, only the reply - * tuple is added (the original tuple already existed for - * a different object). - * - * We only need to call the iterator once for each - * conntrack, so we just use the 'reply' direction - * tuple while iterating. - */ - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - goto found; - } + hlist_nulls_for_each_entry(h, n, hslot, hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) + continue; + /* All nf_conn objects are added to hash table twice, one + * for original direction tuple, once for the reply tuple. + * + * Exception: In the IPS_NAT_CLASH case, only the reply + * tuple is added (the original tuple already existed for + * a different object). + * + * We only need to call the iterator once for each + * conntrack, so we just use the 'reply' direction + * tuple while iterating. + */ + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; } spin_unlock(lockp); local_bh_enable(); @@ -2264,26 +2308,20 @@ found: static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { - unsigned int bucket = 0, sequence; + unsigned int bucket = 0; struct nf_conn *ct; might_sleep(); - for (;;) { - sequence = read_seqcount_begin(&nf_conntrack_generation); - - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ + mutex_lock(&nf_conntrack_mutex); + while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + /* Time to push up daises... 
*/ - nf_ct_delete(ct, portid, report); - nf_ct_put(ct); - cond_resched(); - } - - if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) - break; - bucket = 0; + nf_ct_delete(ct, portid, report); + nf_ct_put(ct); + cond_resched(); } + mutex_unlock(&nf_conntrack_mutex); } struct iter_data { @@ -2519,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize) if (!hash) return -ENOMEM; + mutex_lock(&nf_conntrack_mutex); old_size = nf_conntrack_htable_size; if (old_size == hashsize) { + mutex_unlock(&nf_conntrack_mutex); kvfree(hash); return 0; } @@ -2537,12 +2577,16 @@ int nf_conntrack_hash_resize(unsigned int hashsize) for (i = 0; i < nf_conntrack_htable_size; i++) { while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + unsigned int zone_id; + h = hlist_nulls_entry(nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); + + zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); bucket = __hash_conntrack(nf_ct_net(ct), - &h->tuple, hashsize); + &h->tuple, zone_id, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } @@ -2556,6 +2600,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize) nf_conntrack_all_unlock(); local_bh_enable(); + mutex_unlock(&nf_conntrack_mutex); + synchronize_net(); kvfree(old_hash); return 0; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7008961f5cb0..273117683922 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -150,13 +150,16 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int -hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, + const struct nf_conntrack_zone *zone, + const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; + u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); @@ -165,9 +168,13 @@ hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) /* Original src, to ensure we map it consistently if poss. 
*/ combined.src = tuple->src; - combined.net_mix = net_hash_mix(n); + combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; + /* Zone ID can be used provided its valid for both directions */ + if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) + combined.zone = zone->id; + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); @@ -272,7 +279,7 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { - unsigned int h = hash_by_src(net, tuple); + unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { @@ -619,7 +626,7 @@ nf_nat_setup_info(struct nf_conn *ct, unsigned int srchash; spinlock_t *lock; - srchash = hash_by_src(net, + srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); @@ -788,7 +795,7 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; - h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345..acd73f717a08 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -9,8 +9,19 @@ #include <net/netfilter/nf_nat_masquerade.h> +struct masq_dev_work { + struct work_struct work; + struct net *net; + union nf_inet_addr addr; + int ifindex; + int (*iter)(struct nf_conn *i, void *data); +}; + +#define MAX_MASQ_WORKER_COUNT 16 + static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; +static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, @@ -63,13 +74,71 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); -static int device_cmp(struct nf_conn *i, void *ifindex) +static void iterate_cleanup_work(struct work_struct *work) +{ + struct masq_dev_work *w; + + w = container_of(work, struct masq_dev_work, work); + + nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0); + + put_net(w->net); + kfree(w); + atomic_dec(&masq_worker_count); + module_put(THIS_MODULE); +} + +/* Iterate conntrack table in the background and remove conntrack entries + * that use the device/address being removed. + * + * In case too many work items have been queued already or memory allocation + * fails iteration is skipped, conntrack entries will time out eventually. 
+ */ +static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, + int ifindex, + int (*iter)(struct nf_conn *i, void *data), + gfp_t gfp_flags) +{ + struct masq_dev_work *w; + + if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) + return; + + net = maybe_get_net(net); + if (!net) + return; + + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kzalloc(sizeof(*w), gfp_flags); + if (w) { + /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ + atomic_inc(&masq_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = ifindex; + w->net = net; + w->iter = iter; + if (addr) + w->addr = *addr; + schedule_work(&w->work); + return; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); +} + +static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); + const struct masq_dev_work *w = arg; if (!nat) return 0; - return nat->masq_index == (int)(long)ifindex; + return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, @@ -85,8 +154,8 @@ static int masq_device_event(struct notifier_block *this, * and forget them. */ - nf_ct_iterate_cleanup_net(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); + nf_nat_masq_schedule(net, NULL, dev->ifindex, + device_cmp, GFP_KERNEL); } return NOTIFY_DONE; @@ -94,35 +163,45 @@ static int masq_device_event(struct notifier_block *this, static int inet_cmp(struct nf_conn *ct, void *ptr) { - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; - struct net_device *dev = ifa->ifa_dev->dev; struct nf_conntrack_tuple *tuple; + struct masq_dev_work *w = ptr; - if (!device_cmp(ct, (void *)(long)dev->ifindex)) + if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - return ifa->ifa_address == tuple->dst.u3.ip; + return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct net *net = dev_net(idev->dev); + const struct in_ifaddr *ifa = ptr; + const struct in_device *idev; + const struct net_device *dev; + union nf_inet_addr addr; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. 
*/ + idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); + memset(&addr, 0, sizeof(addr)); + + addr.ip = ifa->ifa_address; + + dev = idev->dev; + nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, + inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } @@ -136,8 +215,6 @@ static struct notifier_block masq_inet_notifier = { }; #if IS_ENABLED(CONFIG_IPV6) -static atomic_t v6_worker_count __read_mostly; - static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, @@ -187,40 +264,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); -struct masq_dev_work { - struct work_struct work; - struct net *net; - struct in6_addr addr; - int ifindex; -}; - -static int inet6_cmp(struct nf_conn *ct, void *work) -{ - struct masq_dev_work *w = (struct masq_dev_work *)work; - struct nf_conntrack_tuple *tuple; - - if (!device_cmp(ct, (void *)(long)w->ifindex)) - return 0; - - tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - - return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6); -} - -static void iterate_cleanup_work(struct work_struct *work) -{ - struct masq_dev_work *w; - - w = container_of(work, struct masq_dev_work, work); - - nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0); - - put_net(w->net); - kfree(w); - atomic_dec(&v6_worker_count); - module_put(THIS_MODULE); -} - /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. @@ -233,36 +276,19 @@ static int masq_inet6_event(struct notifier_block *this, { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; - struct masq_dev_work *w; - struct net *net; + union nf_inet_addr addr; - if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16) + if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; - net = maybe_get_net(dev_net(dev)); - if (!net) - return NOTIFY_DONE; - if (!try_module_get(THIS_MODULE)) - goto err_module; + memset(&addr, 0, sizeof(addr)); - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (w) { - atomic_inc(&v6_worker_count); - - INIT_WORK(&w->work, iterate_cleanup_work); - w->ifindex = dev->ifindex; - w->net = net; - w->addr = ifa->addr; - schedule_work(&w->work); + addr.in6 = ifa->addr; - return NOTIFY_DONE; - } - - module_put(THIS_MODULE); - err_module: - put_net(net); + nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, + GFP_ATOMIC); return NOTIFY_DONE; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 081437dd75b7..c0851fec11d4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -780,6 +780,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -790,8 +791,11 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table); + event, flags, ctx->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -1563,6 +1567,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; 
int err; if (!ctx->report && @@ -1573,8 +1578,11 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, + event, flags, ctx->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -2866,8 +2874,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule, - const struct nft_rule *prule) + const struct nft_rule *rule, u64 handle) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; @@ -2887,9 +2894,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, NFTA_RULE_PAD)) goto nla_put_failure; - if (event != NFT_MSG_DELRULE && prule) { - if (nla_put_be64(skb, NFTA_RULE_POSITION, - cpu_to_be64(prule->handle), + if (event != NFT_MSG_DELRULE && handle) { + if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle), NFTA_RULE_PAD)) goto nla_put_failure; } @@ -2925,7 +2931,10 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); + const struct nft_rule *prule; struct sk_buff *skb; + u64 handle = 0; + u16 flags = 0; int err; if (!ctx->report && @@ -2936,9 +2945,20 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (event == NFT_MSG_NEWRULE && + !list_is_first(&rule->list, &ctx->chain->rules) && + !list_is_last(&rule->list, &ctx->chain->rules)) { + prule = list_prev_entry(rule, list); + handle = prule->handle; + } + if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE)) + flags |= NLM_F_APPEND; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, - ctx->chain, rule, NULL); + event, flags, ctx->family, ctx->table, + ctx->chain, rule, handle); if (err < 0) { kfree_skb(skb); goto err; @@ -2964,6 +2984,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; unsigned int s_idx = cb->args[0]; + u64 handle; prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { @@ -2975,12 +2996,17 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); } + if (prule) + handle = prule->handle; + else + handle = 0; + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, prule) < 0) + table, chain, rule, handle) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -3143,7 +3169,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule, NULL); + family, table, chain, rule, 0); if (err < 0) goto err_fill_rule_info; @@ -3403,17 +3429,15 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { + err = nft_delrule(&ctx, old_rule); + if (err < 0) + goto err_destroy_flow_rule; + trans = 
nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; goto err_destroy_flow_rule; } - err = nft_delrule(&ctx, old_rule); - if (err < 0) { - nft_trans_destroy(trans); - goto err_destroy_flow_rule; - } - list_add_tail_rcu(&rule->list, &old_rule->list); } else { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); @@ -3943,8 +3967,9 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, gfp_t gfp_flags) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); - struct sk_buff *skb; u32 portid = ctx->portid; + struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -3955,7 +3980,10 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_set(skb, ctx, set, event, 0); + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + + err = nf_tables_fill_set(skb, ctx, set, event, flags); if (err < 0) { kfree_skb(skb); goto err; @@ -4336,7 +4364,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (ops->privsize != NULL) size = ops->privsize(nla, &desc); alloc_size = sizeof(*set) + size + udlen; - if (alloc_size < size) + if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; set = kvzalloc(alloc_size, GFP_KERNEL); if (!set) @@ -5231,12 +5259,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb, static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_set_elem *elem, - int event, u16 flags) + int event) { struct nftables_pernet *nft_net; struct net *net = ctx->net; u32 portid = ctx->portid; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) @@ -5246,6 +5275,9 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, set, elem); if (err < 0) { @@ -6921,7 +6953,7 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, - int family, int report, gfp_t gfp) + u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; @@ -6946,8 +6978,9 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, if (skb == NULL) goto err; - err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family, - table, obj, false); + err = nf_tables_fill_obj_info(skb, net, portid, seq, event, + flags & (NLM_F_CREATE | NLM_F_EXCL), + family, table, obj, false); if (err < 0) { kfree_skb(skb); goto err; @@ -6964,7 +6997,7 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, - ctx->family, ctx->report, GFP_KERNEL); + ctx->flags, ctx->family, ctx->report, GFP_KERNEL); } /* @@ -7745,6 +7778,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -7755,8 +7789,11 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | 
NLM_F_EXCL); + err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, - ctx->seq, event, 0, + ctx->seq, event, flags, ctx->family, flowtable, hook_list); if (err < 0) { kfree_skb(skb); @@ -8634,7 +8671,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_setelem_activate(net, te->set, &te->elem); nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_NEWSETELEM, 0); + NFT_MSG_NEWSETELEM); nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: @@ -8642,7 +8679,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_DELSETELEM, 0); + NFT_MSG_DELSETELEM); nft_setelem_remove(net, te->set, &te->elem); if (!nft_setelem_is_catchall(te->set, &te->elem)) { atomic_dec(&te->set->nelems); @@ -9599,7 +9636,6 @@ static void __nft_release_table(struct net *net, struct nft_table *table) table->use--; nf_tables_chain_destroy(&ctx); } - list_del(&table->list); nf_tables_table_destroy(&ctx); } @@ -9612,6 +9648,8 @@ static void __nft_release_tables(struct net *net) if (nft_table_has_owner(table)) continue; + list_del(&table->list); + __nft_release_table(net, table); } } @@ -9619,31 +9657,38 @@ static void __nft_release_tables(struct net *net) static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct nft_table *table, *to_delete[8]; struct nftables_pernet *nft_net; struct netlink_notify *n = ptr; - struct nft_table *table, *nt; struct net *net = n->net; - bool release = false; + unsigned int deleted; + bool restart = false; if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; nft_net = nft_pernet(net); + deleted = 0; mutex_lock(&nft_net->commit_mutex); +again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { __nft_release_hook(net, table); - release = true; + list_del_rcu(&table->list); + to_delete[deleted++] = table; + if (deleted >= ARRAY_SIZE(to_delete)) + break; } } - if (release) { + if (deleted) { + restart = deleted >= ARRAY_SIZE(to_delete); synchronize_rcu(); - list_for_each_entry_safe(table, nt, &nft_net->tables, list) { - if (nft_table_has_owner(table) && - n->portid == table->nlpid) - __nft_release_table(net, table); - } + while (deleted) + __nft_release_table(net, to_delete[--deleted]); + + if (restart) + goto again; } mutex_unlock(&nft_net->commit_mutex); diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index f554e2ea32ee..d5c719c9e36c 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -185,7 +185,7 @@ static const struct nf_hook_entries * nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev) { const struct nf_hook_entries *hook_head = NULL; -#ifdef CONFIG_NETFILTER_INGRESS +#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) struct net_device *netdev; #endif @@ -221,9 +221,9 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); break; #endif -#ifdef CONFIG_NETFILTER_INGRESS +#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) case NFPROTO_NETDEV: - if (hook != NF_NETDEV_INGRESS) + if (hook >= NF_NETDEV_NUMHOOKS) return ERR_PTR(-EOPNOTSUPP); if (!dev) @@ -233,7 +233,15 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de if (!netdev) return ERR_PTR(-ENODEV); - return 
rcu_dereference(netdev->nf_hooks_ingress); +#ifdef CONFIG_NETFILTER_INGRESS + if (hook == NF_NETDEV_INGRESS) + return rcu_dereference(netdev->nf_hooks_ingress); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + if (hook == NF_NETDEV_EGRESS) + return rcu_dereference(netdev->nf_hooks_egress); +#endif + fallthrough; #endif default: return ERR_PTR(-EPROTONOSUPPORT); diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 5b02408a920b..c3563f0be269 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -310,9 +310,11 @@ static const struct nft_chain_type nft_chain_filter_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, - .hook_mask = (1 << NF_NETDEV_INGRESS), + .hook_mask = (1 << NF_NETDEV_INGRESS) | + (1 << NF_NETDEV_EGRESS), .hooks = { [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + [NF_NETDEV_EGRESS] = nft_do_chain_netdev, }, }; @@ -342,12 +344,6 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, return; } - /* UNREGISTER events are also happening on netns exit. - * - * Although nf_tables core releases all tables/chains, only this event - * handler provides guarantee that hook->ops.dev is still accessible, - * so we cannot skip exiting net namespaces. - */ __nft_release_basechain(ctx); } @@ -366,6 +362,9 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; + if (!check_net(ctx.net)) + return NOTIFY_DONE; + nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 272bcdb1392d..f69cc73c5813 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -19,6 +19,7 @@ #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_arp/arp_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> /* Used for matches where *info is larger than X byte */ #define NFT_MATCH_LARGE_THRESH 192 @@ -257,8 +258,22 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_compat_wait_for_destructors(); ret = xt_check_target(&par, size, proto, inv); - if (ret < 0) + if (ret < 0) { + if (ret == -ENOENT) { + const char *modname = NULL; + + if (strcmp(target->name, "LOG") == 0) + modname = "nf_log_syslog"; + else if (strcmp(target->name, "NFLOG") == 0) + modname = "nfnetlink_log"; + + if (modname && + nft_request_module(ctx->net, "%s", modname) == -EAGAIN) + return -EAGAIN; + } + return ret; + } /* The standard target cannot be used */ if (!target->target) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 6ba3256fa844..87f3af4645d9 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -198,17 +198,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return -EBUSY; priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); - switch (priv->op) { - case NFT_DYNSET_OP_ADD: - case NFT_DYNSET_OP_DELETE: - break; - case NFT_DYNSET_OP_UPDATE: - if (!(set->flags & NFT_SET_TIMEOUT)) - return -EOPNOTSUPP; - break; - default: + if (priv->op > NFT_DYNSET_OP_DELETE) return -EOPNOTSUPP; - } timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 0363f533a42b..c4d1389f7185 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -60,7 +60,7 @@ static void nft_quota_obj_eval(struct nft_object *obj, if (overquota && 
!test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, - NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC); + NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); } static int nft_quota_do_init(const struct nlattr * const tb[], diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 7b2f359bfce4..2f7cf5ecebf4 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -137,7 +137,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; - info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index 2ff75f7637b0..f39244f9c0ed 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -44,6 +44,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) static int log_tg_check(const struct xt_tgchk_param *par) { const struct xt_log_info *loginfo = par->targinfo; + int ret; if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) return -EINVAL; @@ -58,7 +59,14 @@ static int log_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", "nf_log_syslog"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + } + + return ret; } static void log_tg_destroy(const struct xt_tgdtor_param *par) diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index fb5793208059..e660c3710a10 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -42,13 +42,21 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) static int nflog_tg_check(const struct xt_tgchk_param *par) { const struct xt_nflog_info *info = par->targinfo; + int ret; if (info->flags & ~XT_NFLOG_MASK) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; - return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", "nfnetlink_log"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + } + + return ret; } static void nflog_tg_destroy(const struct xt_tgdtor_param *par) diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 0d5c422f8745..8aec1b529364 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -94,11 +94,11 @@ static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; - struct gnet_stats_basic_packed *stats = &info->est->bstats; + struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); - stats->bytes += skb->len; - stats->packets++; + u64_stats_add(&stats->bytes, skb->len); + u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; @@ -143,6 +143,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) if (!est) goto err1; + gnet_stats_basic_sync_init(&est->bstats); strlcpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 24b7cf447bc5..4c575324a985 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -594,7 +594,10 @@ static 
int netlink_insert(struct sock *sk, u32 portid) /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); - nlk_sk(sk)->bound = portid; + /* Paired with lockless reads from netlink_bind(), + * netlink_connect() and netlink_sendmsg(). + */ + WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); @@ -1012,7 +1015,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; - bound = nlk->bound; + /* Paired with WRITE_ONCE() in netlink_insert() */ + bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); @@ -1098,8 +1102,9 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, /* No need for barriers here as we return to user-space without * using any of the bound attributes. + * Paired with WRITE_ONCE() in netlink_insert(). */ - if (!nlk->bound) + if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { @@ -1407,8 +1412,6 @@ struct netlink_broadcast_data { int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; - int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); - void *tx_data; }; static void do_one_broadcast(struct sock *sk, @@ -1462,11 +1465,6 @@ static void do_one_broadcast(struct sock *sk, p->delivery_failure = 1; goto out; } - if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { - kfree_skb(p->skb2); - p->skb2 = NULL; - goto out; - } if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; @@ -1489,10 +1487,8 @@ out: sock_put(sk); } -int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, - u32 group, gfp_t allocation, - int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data), - void *filter_data) +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, + u32 group, gfp_t allocation) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; @@ -1511,8 +1507,6 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid info.allocation = allocation; info.skb = skb; info.skb2 = NULL; - info.tx_filter = filter; - info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ @@ -1538,14 +1532,6 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid } return -ESRCH; } -EXPORT_SYMBOL(netlink_broadcast_filtered); - -int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, - u32 group, gfp_t allocation) -{ - return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, - NULL, NULL); -} EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { @@ -1888,7 +1874,8 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->bound) { + /* Paired with WRITE_ONCE() in netlink_insert() */ + if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 6d16e1ab1a8a..775064cdd0ee 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -633,7 +633,7 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; - ax25_address *source = NULL; + const ax25_address *source = NULL; ax25_uid_assoc *user; struct net_device *dev; int err = 0; @@ -673,7 +673,7 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, err = 
-ENETUNREACH; goto out_release; } - source = (ax25_address *)dev->dev_addr; + source = (const ax25_address *)dev->dev_addr; user = ax25_findbyuid(current_euid()); if (user) { diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 29e418c8c6c3..3aaac4a22b38 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -108,10 +108,10 @@ static int __must_check nr_set_mac_address(struct net_device *dev, void *addr) if (err) return err; - ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); } - memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + dev_addr_set(dev, sa->sa_data); return 0; } @@ -120,7 +120,7 @@ static int nr_open(struct net_device *dev) { int err; - err = ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + err = ax25_listen_register((const ax25_address *)dev->dev_addr, NULL); if (err) return err; @@ -131,7 +131,7 @@ static int nr_open(struct net_device *dev) static int nr_close(struct net_device *dev) { - ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); netif_stop_queue(dev); return 0; } diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index ddd5cbd455e3..baea3cbd76ca 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -598,7 +598,7 @@ struct net_device *nr_dev_get(ax25_address *addr) rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && - ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) { + ax25cmp(addr, (const ax25_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } @@ -825,7 +825,7 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) ax25s = nr_neigh->ax25; nr_neigh->ax25 = ax25_send_frame(skb, 256, - (ax25_address *)dev->dev_addr, + (const ax25_address *)dev->dev_addr, &nr_neigh->callsign, nr_neigh->digipeat, nr_neigh->dev); if (ax25s) diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 6024fad905ff..dda323e0a473 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -60,6 +60,9 @@ int nfc_proto_register(const struct nfc_protocol *nfc_proto) proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); + if (rc) + proto_unregister(nfc_proto->proto); + return rc; } EXPORT_SYMBOL(nfc_proto_register); diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index fefc03674f4f..d63d2e5dc60c 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -277,6 +277,7 @@ int digital_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param) static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) { struct digital_tg_mdaa_params *params; + int rc; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) @@ -291,8 +292,12 @@ static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) get_random_bytes(params->nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2); params->sc = DIGITAL_SENSF_FELICA_SC; - return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, - 500, digital_tg_recv_atr_req, NULL); + rc = digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, + 500, digital_tg_recv_atr_req, NULL); + if (rc) + kfree(params); + + return rc; } static int digital_tg_listen_md(struct nfc_digital_dev *ddev, u8 rf_tech) diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index 84d2345c75a3..3adf4589852a 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -465,8 +465,12 @@ static int digital_in_send_sdd_req(struct 
nfc_digital_dev *ddev, skb_put_u8(skb, sel_cmd); skb_put_u8(skb, DIGITAL_SDD_REQ_SEL_PAR); - return digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, - target); + rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, + target); + if (rc) + kfree_skb(skb); + + return rc; } static void digital_in_recv_sens_res(struct nfc_digital_dev *ddev, void *arg, diff --git a/net/nfc/hci/command.c b/net/nfc/hci/command.c index 3a89bd9b89fc..af6bacb3ba98 100644 --- a/net/nfc/hci/command.c +++ b/net/nfc/hci/command.c @@ -114,8 +114,6 @@ int nfc_hci_send_cmd(struct nfc_hci_dev *hdev, u8 gate, u8 cmd, { u8 pipe; - pr_debug("\n"); - pipe = hdev->gate2pipe[gate]; if (pipe == NFC_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; @@ -130,8 +128,6 @@ int nfc_hci_send_cmd_async(struct nfc_hci_dev *hdev, u8 gate, u8 cmd, { u8 pipe; - pr_debug("\n"); - pipe = hdev->gate2pipe[gate]; if (pipe == NFC_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; @@ -205,8 +201,6 @@ static int nfc_hci_open_pipe(struct nfc_hci_dev *hdev, u8 pipe) static int nfc_hci_close_pipe(struct nfc_hci_dev *hdev, u8 pipe) { - pr_debug("\n"); - return nfc_hci_execute_cmd(hdev, pipe, NFC_HCI_ANY_CLOSE_PIPE, NULL, 0, NULL); } @@ -242,8 +236,6 @@ static u8 nfc_hci_create_pipe(struct nfc_hci_dev *hdev, u8 dest_host, static int nfc_hci_delete_pipe(struct nfc_hci_dev *hdev, u8 pipe) { - pr_debug("\n"); - return nfc_hci_execute_cmd(hdev, NFC_HCI_ADMIN_PIPE, NFC_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL); } @@ -256,8 +248,6 @@ static int nfc_hci_clear_all_pipes(struct nfc_hci_dev *hdev) /* TODO: Find out what the identity reference data is * and fill param with it. HCI spec 6.1.3.5 */ - pr_debug("\n"); - if (test_bit(NFC_HCI_QUIRK_SHORT_CLEAR, &hdev->quirks)) param_len = 0; @@ -271,8 +261,6 @@ int nfc_hci_disconnect_gate(struct nfc_hci_dev *hdev, u8 gate) int r; u8 pipe = hdev->gate2pipe[gate]; - pr_debug("\n"); - if (pipe == NFC_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; @@ -296,8 +284,6 @@ int nfc_hci_disconnect_all_gates(struct nfc_hci_dev *hdev) { int r; - pr_debug("\n"); - r = nfc_hci_clear_all_pipes(hdev); if (r < 0) return r; @@ -314,8 +300,6 @@ int nfc_hci_connect_gate(struct nfc_hci_dev *hdev, u8 dest_host, u8 dest_gate, bool pipe_created = false; int r; - pr_debug("\n"); - if (pipe == NFC_HCI_DO_NOT_CREATE_PIPE) return 0; diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 71e10347e6a9..e90f70385813 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -363,8 +363,6 @@ static int llc_shdlc_connect_initiate(const struct llc_shdlc *shdlc) { struct sk_buff *skb; - pr_debug("\n"); - skb = llc_shdlc_alloc_skb(shdlc, 2); if (skb == NULL) return -ENOMEM; @@ -379,8 +377,6 @@ static int llc_shdlc_connect_send_ua(const struct llc_shdlc *shdlc) { struct sk_buff *skb; - pr_debug("\n"); - skb = llc_shdlc_alloc_skb(shdlc, 0); if (skb == NULL) return -ENOMEM; @@ -570,8 +566,6 @@ static void llc_shdlc_connect_timeout(struct timer_list *t) { struct llc_shdlc *shdlc = from_timer(shdlc, t, connect_timer); - pr_debug("\n"); - schedule_work(&shdlc->sm_work); } @@ -598,8 +592,6 @@ static void llc_shdlc_sm_work(struct work_struct *work) struct llc_shdlc *shdlc = container_of(work, struct llc_shdlc, sm_work); int r; - pr_debug("\n"); - mutex_lock(&shdlc->state_mutex); switch (shdlc->state) { @@ -681,8 +673,6 @@ static int llc_shdlc_connect(struct llc_shdlc *shdlc) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(connect_wq); - pr_debug("\n"); - mutex_lock(&shdlc->state_mutex); shdlc->state = SHDLC_CONNECTING; @@ -701,8 +691,6 @@ static int 
llc_shdlc_connect(struct llc_shdlc *shdlc) static void llc_shdlc_disconnect(struct llc_shdlc *shdlc) { - pr_debug("\n"); - mutex_lock(&shdlc->state_mutex); shdlc->state = SHDLC_DISCONNECTED; diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 3c4172a5aeb5..41e3a20c8935 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -337,8 +337,6 @@ int nfc_llcp_send_disconnect(struct nfc_llcp_sock *sock) struct nfc_dev *dev; struct nfc_llcp_local *local; - pr_debug("Sending DISC\n"); - local = sock->local; if (local == NULL) return -ENODEV; @@ -362,8 +360,6 @@ int nfc_llcp_send_symm(struct nfc_dev *dev) struct nfc_llcp_local *local; u16 size = 0; - pr_debug("Sending SYMM\n"); - local = nfc_llcp_find_local(dev); if (local == NULL) return -ENODEV; @@ -399,8 +395,6 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) u16 size = 0; __be16 miux; - pr_debug("Sending CONNECT\n"); - local = sock->local; if (local == NULL) return -ENODEV; @@ -475,8 +469,6 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) u16 size = 0; __be16 miux; - pr_debug("Sending CC\n"); - local = sock->local; if (local == NULL) return -ENODEV; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index eaeb2b1cfa6a..5ad5157aa9c5 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -45,8 +45,6 @@ static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock) struct nfc_llcp_local *local = sock->local; struct sk_buff *s, *tmp; - pr_debug("%p\n", &sock->sk); - skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); @@ -1505,9 +1503,8 @@ void nfc_llcp_recv(void *data, struct sk_buff *skb, int err) { struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; - pr_debug("Received an LLCP PDU\n"); if (err < 0) { - pr_err("err %d\n", err); + pr_err("LLCP PDU receive err %d\n", err); return; } diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 82ab39d80726..6fd873aa86be 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -930,8 +930,6 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev, struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); unsigned long nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE; - pr_debug("entry\n"); - if (!ndev->target_active_prot) { pr_err("unable to deactivate target, no active target\n"); return; @@ -977,8 +975,6 @@ static int nci_dep_link_down(struct nfc_dev *nfc_dev) struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; - pr_debug("entry\n"); - if (nfc_dev->rf_mode == NFC_RF_INITIATOR) { nci_deactivate_target(nfc_dev, NULL, NCI_DEACTIVATE_TYPE_IDLE_MODE); } else { diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index e199912ee1e5..19703a649b5a 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -432,8 +432,6 @@ void nci_hci_data_received_cb(void *context, struct sk_buff *frag_skb; int msg_len; - pr_debug("\n"); - if (err) { nci_req_complete(ndev, err); return; @@ -547,8 +545,6 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) { - pr_debug("\n"); - return nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL); } diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index c5eacaac41ae..282c51051dcc 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -738,8 +738,6 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, const struct nci_nfcee_discover_ntf *nfcee_ntf = (struct nci_nfcee_discover_ntf *)skb->data; - pr_debug("\n"); - /* NFCForum NCI 9.2.1 HCI Network Specific Handling * If the NFCC 
supports the HCI Network, it SHALL return one, * and only one, NFCEE_DISCOVER_NTF with a Protocol type of @@ -751,12 +749,6 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, nci_req_complete(ndev, status); } -static void nci_nfcee_action_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) -{ - pr_debug("\n"); -} - void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { __u16 ntf_opcode = nci_opcode(skb->data); @@ -813,7 +805,6 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) break; case NCI_OP_RF_NFCEE_ACTION_NTF: - nci_nfcee_action_ntf_packet(ndev, skb); break; default: diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index a2e72c003805..b911ab78bed9 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -334,6 +334,8 @@ static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, ndev->cur_conn_id); if (conn_info) { list_del(&conn_info->list); + if (conn_info == ndev->rf_conn_info) + ndev->rf_conn_info = NULL; devm_kfree(&ndev->nfc_dev->dev, conn_info); } } diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 502e7a3f8948..57500c262cc3 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2015, Marvell International Ltd. * - * This software file (the "File") is distributed by Marvell International - * Ltd. under the terms of the GNU General Public License Version 2, June 1991 - * (the "License"). You may use, redistribute and/or modify this File in - * accordance with the terms and conditions of the License, a copy of which - * is available on the worldwide web at - * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt. - * - * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE - * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE - * ARE EXPRESSLY DISCLAIMED. The License provides additional details about - * this warranty disclaimer. - */ - -/* Inspired (hugely) by HCI LDISC implementation in Bluetooth. + * Inspired (hugely) by HCI LDISC implementation in Bluetooth. 
* * Copyright (C) 2000-2001 Qualcomm Incorporated * Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2a2bc64f75cf..46943a18a10d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -91,6 +91,7 @@ #endif #include <linux/bpf.h> #include <net/compat.h> +#include <linux/netfilter_netdev.h> #include "internal.h" @@ -241,8 +242,42 @@ struct packet_skb_cb { static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); +#ifdef CONFIG_NETFILTER_EGRESS +static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) +{ + struct sk_buff *next, *head = NULL, *tail; + int rc; + + rcu_read_lock(); + for (; skb != NULL; skb = next) { + next = skb->next; + skb_mark_not_on_list(skb); + + if (!nf_hook_egress(skb, &rc, skb->dev)) + continue; + + if (!head) + head = skb; + else + tail->next = skb; + + tail = skb; + } + rcu_read_unlock(); + + return head; +} +#endif + static int packet_direct_xmit(struct sk_buff *skb) { +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_hook_egress_active()) { + skb = nf_hook_direct_egress(skb); + if (!skb) + return NET_XMIT_DROP; + } +#endif return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile index 1b1411d158a7..8e0605f88a73 100644 --- a/net/qrtr/Makefile +++ b/net/qrtr/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_QRTR) := qrtr.o ns.o +obj-$(CONFIG_QRTR) += qrtr.o +qrtr-y := af_qrtr.o ns.o obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o qrtr-smd-y := smd.o diff --git a/net/qrtr/qrtr.c b/net/qrtr/af_qrtr.c index ec2322529727..ec2322529727 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/af_qrtr.c diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index cf7d974e0f61..30a1cf4c16c6 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -109,7 +109,7 @@ char *rose2asc(char *buf, const rose_address *addr) /* * Compare two ROSE addresses, 0 == equal. */ -int rosecmp(rose_address *addr1, rose_address *addr2) +int rosecmp(const rose_address *addr1, const rose_address *addr2) { int i; @@ -123,7 +123,8 @@ int rosecmp(rose_address *addr1, rose_address *addr2) /* * Compare two ROSE addresses for only mask digits, 0 == equal. 
*/ -int rosecmpm(rose_address *addr1, rose_address *addr2, unsigned short mask) +int rosecmpm(const rose_address *addr1, const rose_address *addr2, + unsigned short mask) { unsigned int i, j; diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 051804fbee17..f1a76a5820f1 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -66,10 +66,10 @@ static int rose_set_mac_address(struct net_device *dev, void *addr) if (err) return err; - rose_del_loopback_node((rose_address *)dev->dev_addr); + rose_del_loopback_node((const rose_address *)dev->dev_addr); } - memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + dev_addr_set(dev, sa->sa_data); return 0; } @@ -78,7 +78,7 @@ static int rose_open(struct net_device *dev) { int err; - err = rose_add_loopback_node((rose_address *)dev->dev_addr); + err = rose_add_loopback_node((const rose_address *)dev->dev_addr); if (err) return err; @@ -90,7 +90,7 @@ static int rose_open(struct net_device *dev) static int rose_close(struct net_device *dev) { netif_stop_queue(dev); - rose_del_loopback_node((rose_address *)dev->dev_addr); + rose_del_loopback_node((const rose_address *)dev->dev_addr); return 0; } diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index f6102e6f5161..8b96a56d3a49 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -94,11 +94,11 @@ static void rose_t0timer_expiry(struct timer_list *t) */ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) { - ax25_address *rose_call; + const ax25_address *rose_call; ax25_cb *ax25s; if (ax25cmp(&rose_callsign, &null_ax25_address) == 0) - rose_call = (ax25_address *)neigh->dev->dev_addr; + rose_call = (const ax25_address *)neigh->dev->dev_addr; else rose_call = &rose_callsign; @@ -117,11 +117,11 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) */ static int rose_link_up(struct rose_neigh *neigh) { - ax25_address *rose_call; + const ax25_address *rose_call; ax25_cb *ax25s; if (ax25cmp(&rose_callsign, &null_ax25_address) == 0) - rose_call = (ax25_address *)neigh->dev->dev_addr; + rose_call = (const ax25_address *)neigh->dev->dev_addr; else rose_call = &rose_callsign; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index c0e04c261a15..e2e6b6b78578 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -401,7 +401,7 @@ void rose_add_loopback_neigh(void) /* * Add a loopback node. */ -int rose_add_loopback_node(rose_address *address) +int rose_add_loopback_node(const rose_address *address) { struct rose_node *rose_node; int err = 0; @@ -446,7 +446,7 @@ out: /* * Delete a loopback node. 
*/ -void rose_del_loopback_node(rose_address *address) +void rose_del_loopback_node(const rose_address *address) { struct rose_node *rose_node; @@ -629,7 +629,8 @@ struct net_device *rose_dev_get(rose_address *addr) rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { - if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && + rosecmp(addr, (const rose_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } @@ -646,7 +647,8 @@ static int rose_dev_exists(rose_address *addr) rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { - if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && + rosecmp(addr, (const rose_address *)dev->dev_addr) == 0) goto out; } dev = NULL; diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index 4e565eeab426..be61d6f5be8d 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -22,7 +22,7 @@ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) { - return _usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); + return usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); } static u32 rxrpc_bound_rto(u32 rto) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 7dd3a2dc5fa4..3258da3d5bed 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -480,16 +480,18 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { - p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats) goto err1; - p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats_hw) goto err2; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) goto err3; } + gnet_stats_basic_sync_init(&p->tcfa_bstats); + gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; p->tcfa_tm.install = jiffies; @@ -499,7 +501,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, - &p->tcfa_lock, NULL, est); + &p->tcfa_lock, false, est); if (err) goto err4; } @@ -1126,13 +1128,13 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, u64 drops, bool hw) { if (a->cpu_bstats) { - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); this_cpu_ptr(a->cpu_qstats)->drops += drops; if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); return; } @@ -1171,9 +1173,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (err < 0) goto errout; - if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || - gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw, - &p->tcfa_bstats_hw) < 0 || + if (gnet_stats_copy_basic(&d, p->cpu_bstats, + &p->tcfa_bstats, false) < 0 || + gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, + &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, 
&p->tcfa_qstats, diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 5c36013339e1..f2bf896331a5 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -41,7 +41,7 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, int action, filter_res; tcf_lastuse_update(&prog->tcf_tm); - bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb); filter = rcu_dereference(prog->filter); if (at_ingress) { diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index ad9df0cb4b98..90866ae45573 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -960,6 +960,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tmpl = p->tmpl; tcf_lastuse_update(&c->tcf_tm); + tcf_action_update_bstats(&c->common, skb); if (clear) { qdisc_skb_cb(skb)->post_ct = false; @@ -1049,7 +1050,6 @@ out_push: qdisc_skb_cb(skb)->post_ct = true; out_clear: - tcf_action_update_bstats(&c->common, skb); if (defrag) qdisc_skb_cb(skb)->pkt_len = skb->len; return retval; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 7064a365a1a9..b757f90a2d58 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -718,7 +718,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, u8 *tlv_data; u16 metalen; - bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (skb_at_tc_ingress(skb)) @@ -806,7 +806,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, exceed_mtu = true; } - bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index e4529b428cf4..8faa4c58305e 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -59,7 +59,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, int ret, mac_len; tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); /* Ensure 'data' points at mac_header prior calling mpls manipulating * functions. 
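The act_* and sch_* hunks above and below all follow the same counter conversion: the old gnet_stats_basic_packed / gnet_stats_basic_cpu pair is replaced by gnet_stats_basic_sync, which has to be seeded with gnet_stats_basic_sync_init(), is bumped through bstats_update() / _bstats_update(), and is read back with u64_stats_read(). A minimal sketch of that usage pattern, using only the helpers visible in these hunks (the my_class_* names are hypothetical and this is an illustration of the pattern, not a buildable module on its own):

#include <net/sch_generic.h>	/* assumed home of the bstats helpers */

struct my_class {
	struct gnet_stats_basic_sync bstats;	/* was struct gnet_stats_basic_packed */
};

static void my_class_init(struct my_class *cl)
{
	/* the u64_stats/seqcount state must be initialised before first use */
	gnet_stats_basic_sync_init(&cl->bstats);
}

static void my_class_count_skb(struct my_class *cl, const struct sk_buff *skb)
{
	/* per-skb update; the separate bstats_cpu_update() variant is gone */
	bstats_update(&cl->bstats, skb);
}

static void my_class_count_raw(struct my_class *cl, u64 bytes, u64 packets)
{
	/* explicit byte/packet increments, as in the cbq_update() and qfq_enqueue() hunks */
	_bstats_update(&cl->bstats, bytes, packets);
}

static u64 my_class_packets(struct my_class *cl)
{
	/* counters are u64_stats_t now, so reads go through u64_stats_read() */
	return u64_stats_read(&cl->bstats.packets);
}

Dump paths then hand the structure to gnet_stats_copy_basic(&d, cpu_bstats, &bstats, running) with the new boolean "running" argument in place of the old seqcount pointer, as the act_api.c and sch_api.c hunks show.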
diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 832157a840fc..9e77ba8401e5 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -125,7 +125,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, police->common.cpu_bstats, &police->tcf_rate_est, &police->tcf_lock, - NULL, est); + false, est); if (err) goto failure; } else if (tb[TCA_POLICE_AVRATE] && @@ -248,7 +248,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, int ret; tcf_lastuse_update(&police->tcf_tm); - bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 230501eb9e06..ce859b0e0deb 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -163,7 +163,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, int retval; tcf_lastuse_update(&s->tcf_tm); - bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); psample_group = rcu_dereference_bh(s->psample_group); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index cbbe1861d3a2..e617ab4505ca 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -36,7 +36,8 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, * then it would look like "hello_3" (without quotes) */ pr_info("simple: %s_%llu\n", - (char *)d->tcfd_defdata, d->tcf_bstats.packets); + (char *)d->tcfd_defdata, + u64_stats_read(&d->tcf_bstats.packets)); spin_unlock(&d->tcf_lock); return d->tcf_action; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 605418538347..d30ecbfc8f84 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -31,7 +31,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, int action; tcf_lastuse_update(&d->tcf_tm); - bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ecb9ee666095..9b6b52c5e24e 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -31,7 +31,7 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, u64 flags; tcf_lastuse_update(&d->tcf_tm); - bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); action = READ_ONCE(d->tcf_action); if (unlikely(action == TC_ACT_SHOT)) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 23b21253b3c3..eb6345a027e1 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2188,18 +2188,24 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg, arg->count = arg->skip; + rcu_read_lock(); idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) { /* don't return filters that are being deleted */ if (!refcount_inc_not_zero(&f->refcnt)) continue; + rcu_read_unlock(); + if (arg->fn(tp, f, arg) < 0) { __fl_put(f); arg->stop = 1; + rcu_read_lock(); break; } __fl_put(f); arg->count++; + rcu_read_lock(); } + rcu_read_unlock(); arg->cookie = id; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 5e90e9b160e3..efcd0b5e9a32 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -507,20 
+507,27 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, list_for_each_entry(stab, &qdisc_stab_list, list) { if (memcmp(&stab->szopts, s, sizeof(*s))) continue; - if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16))) + if (tsize > 0 && + memcmp(stab->data, tab, flex_array_size(stab, data, tsize))) continue; stab->refcnt++; return stab; } - stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); + if (s->size_log > STAB_SIZE_LOG_MAX || + s->cell_log > STAB_SIZE_LOG_MAX) { + NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); + return ERR_PTR(-EINVAL); + } + + stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL); if (!stab) return ERR_PTR(-ENOMEM); stab->refcnt = 1; stab->szopts = *s; if (tsize > 0) - memcpy(stab->data, tab, tsize * sizeof(u16)); + memcpy(stab->data, tab, flex_array_size(stab, data, tsize)); list_add_tail(&stab->list, &qdisc_stab_list); @@ -878,7 +885,7 @@ static void qdisc_offload_graft_root(struct net_device *dev, static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -936,8 +943,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, cpu_qstats = q->cpu_qstats; } - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), - &d, cpu_bstats, &q->bstats) < 0 || + if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; @@ -1258,26 +1264,17 @@ static struct Qdisc *qdisc_create(struct net_device *dev, rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { - seqcount_t *running; - err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); goto err_out4; } - if (sch->parent != TC_H_ROOT && - !(sch->flags & TCQ_F_INGRESS) && - (!p || !(p->flags & TCQ_F_MQROOT))) - running = qdisc_root_sleeping_running(sch); - else - running = &sch->running; - err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, NULL, - running, + true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); @@ -1353,7 +1350,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, sch->cpu_bstats, &sch->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); } out: diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 7d8518176b45..4c8e994cf0a5 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -52,7 +52,7 @@ struct atm_flow_data { struct atm_qdisc_data *parent; /* parent qdisc */ struct socket *sock; /* for closing */ int ref; /* reference count */ - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct list_head list; struct atm_flow_data *excess; /* flow for excess traffic; @@ -548,6 +548,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); INIT_LIST_HEAD(&p->flows); INIT_LIST_HEAD(&p->link.list); + gnet_stats_basic_sync_init(&p->link.bstats); list_add(&p->link.list, &p->flows); p->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, extack); @@ -652,8 +653,7 @@ 
atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &flow->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &flow->bstats, true) < 0 || gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index e0da15530f0e..02d9f0dfe356 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -116,7 +116,7 @@ struct cbq_class { long avgidle; long deficit; /* Saved deficit for WRR */ psched_time_t penalized; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tc_cbq_xstats xstats; @@ -565,8 +565,7 @@ cbq_update(struct cbq_sched_data *q) long avgidle = cl->avgidle; long idle; - cl->bstats.packets++; - cl->bstats.bytes += len; + _bstats_update(&cl->bstats, len, 1); /* * (now - last) is total time between packet right edges. @@ -1384,8 +1383,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @@ -1519,7 +1517,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); @@ -1611,6 +1609,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; + gnet_stats_basic_sync_init(&cl->bstats); err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1619,9 +1618,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); tcf_block_put(cl->block); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 642cd179b7a7..18e4f7a0b291 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -19,7 +19,7 @@ struct drr_class { struct Qdisc_class_common common; unsigned int filter_cnt; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct list_head alist; @@ -85,8 +85,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); @@ -106,6 +105,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @@ -118,9 +118,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if 
(tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); qdisc_put(cl->qdisc); @@ -267,8 +265,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (qlen) xstats.deficit = cl->deficit; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) return -1; diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 1f857ffd1ac2..0eae9ff5edf6 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -41,7 +41,7 @@ struct ets_class { struct Qdisc *qdisc; u32 quantum; u32 deficit; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; }; @@ -325,8 +325,7 @@ static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, struct ets_class *cl = ets_class_from_arg(sch, arg); struct Qdisc *cl_q = cl->qdisc; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; @@ -661,7 +660,6 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, q->nbands = nbands; for (i = nstrict; i < q->nstrict; i++) { - INIT_LIST_HEAD(&q->classes[i].alist); if (q->classes[i].qdisc->q.qlen) { list_add_tail(&q->classes[i].alist, &q->active); q->classes[i].deficit = quanta[i]; @@ -687,7 +685,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, ets_offload_change(sch); for (i = q->nbands; i < oldbands; i++) { qdisc_put(q->classes[i].qdisc); - memset(&q->classes[i], 0, sizeof(q->classes[i])); + q->classes[i].qdisc = NULL; + q->classes[i].quantum = 0; + q->classes[i].deficit = 0; + gnet_stats_basic_sync_init(&q->classes[i].bstats); + memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); } return 0; } @@ -696,7 +698,7 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct ets_sched *q = qdisc_priv(sch); - int err; + int err, i; if (!opt) return -EINVAL; @@ -706,6 +708,9 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, return err; INIT_LIST_HEAD(&q->active); + for (i = 0; i < TCQ_ETS_MAX_BANDS; i++) + INIT_LIST_HEAD(&q->classes[i].alist); + return ets_qdisc_change(sch, opt, extack); } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index a579a4131d22..e1040421b797 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -233,6 +233,9 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit) if (strncmp(q->ops->id + 1, "fifo", 4) != 0) return 0; + if (!q->ops->change) + return 0; + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (nla) { nla->nla_type = RTM_NEWQDISC; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index bb0cd6d3d2c2..839e1235db05 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -362,6 +362,8 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 }, [TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR] = { .type = NLA_U8 }, + 
[TCA_FQ_CODEL_CE_THRESHOLD_MASK] = { .type = NLA_U8 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, @@ -408,6 +410,11 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]) + q->cparams.ce_threshold_selector = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]); + if (tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]) + q->cparams.ce_threshold_mask = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]); + if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); @@ -544,10 +551,15 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->flows_cnt)) goto nla_put_failure; - if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD && - nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, - codel_time_to_us(q->cparams.ce_threshold))) - goto nla_put_failure; + if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD) { + if (nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, + codel_time_to_us(q->cparams.ce_threshold))) + goto nla_put_failure; + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, q->cparams.ce_threshold_selector)) + goto nla_put_failure; + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK, q->cparams.ce_threshold_mask)) + goto nla_put_failure; + } return nla_nest_end(skb, opts); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 8c64a552a64f..b0ff0dff2773 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -304,8 +304,8 @@ trace: /* * Transmit possibly several skbs, and handle the return status as - * required. Owning running seqcount bit guarantees that - * only one CPU can execute this function. + * required. Owning qdisc running bit guarantees that only one CPU + * can execute this function. 
* * Returns to the caller: * false - hardware queue frozen backoff @@ -606,7 +606,6 @@ struct Qdisc noop_qdisc = { .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, - .running = SEQCNT_ZERO(noop_qdisc.running), .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, @@ -867,7 +866,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; -static struct lock_class_key qdisc_running_key; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, @@ -892,11 +890,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); qdisc_skb_head_init(&sch->q); + gnet_stats_basic_sync_init(&sch->bstats); spin_lock_init(&sch->q.lock); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = - netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!sch->cpu_bstats) goto errout1; @@ -916,10 +915,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - seqcount_init(&sch->running); - lockdep_set_class(&sch->running, - dev->qdisc_running_key ?: &qdisc_running_key); - sch->ops = ops; sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 621dc6afde8f..72de08ef8335 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -353,6 +353,7 @@ static int gred_offload_dump_stats(struct Qdisc *sch) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt_offload *hw_stats; + u64 bytes = 0, packets = 0; unsigned int i; int ret; @@ -364,9 +365,11 @@ static int gred_offload_dump_stats(struct Qdisc *sch) hw_stats->handle = sch->handle; hw_stats->parent = sch->parent; - for (i = 0; i < MAX_DPs; i++) + for (i = 0; i < MAX_DPs; i++) { + gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]); if (table->tab[i]) hw_stats->stats.xstats[i] = &table->tab[i]->stats; + } ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); /* Even if driver returns failure adjust the stats - in case offload @@ -375,19 +378,19 @@ static int gred_offload_dump_stats(struct Qdisc *sch) for (i = 0; i < MAX_DPs; i++) { if (!table->tab[i]) continue; - table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; - table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; + table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets); + table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes); table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; - _bstats_update(&sch->bstats, - hw_stats->stats.bstats[i].bytes, - hw_stats->stats.bstats[i].packets); + bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes); + packets += u64_stats_read(&hw_stats->stats.bstats[i].packets); sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; sch->qstats.drops += hw_stats->stats.qstats[i].drops; sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; } + _bstats_update(&sch->bstats, bytes, packets); kfree(hw_stats); return ret; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b7ac30cca035..d3979a6000e7 100644 --- a/net/sched/sch_hfsc.c +++ 
b/net/sched/sch_hfsc.c @@ -111,7 +111,7 @@ enum hfsc_class_flags { struct hfsc_class { struct Qdisc_class_common cl_common; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ @@ -965,7 +965,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; @@ -1033,9 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { tcf_block_put(cl->block); kfree(cl); @@ -1328,7 +1326,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @@ -1406,6 +1404,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt, if (err) return err; + gnet_stats_basic_sync_init(&q->root.bstats); q->root.cl_common.classid = sch->handle; q->root.sched = q; q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 5067a6e5d4fd..cf1d45db4e84 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -113,8 +113,8 @@ struct htb_class { /* * Written often fields */ - struct gnet_stats_basic_packed bstats; - struct gnet_stats_basic_packed bstats_bias; + struct gnet_stats_basic_sync bstats; + struct gnet_stats_basic_sync bstats_bias; struct tc_htb_xstats xstats; /* our special stats */ /* token bucket parameters */ @@ -1308,10 +1308,11 @@ nla_put_failure: static void htb_offload_aggregate_stats(struct htb_sched *q, struct htb_class *cl) { + u64 bytes = 0, packets = 0; struct htb_class *c; unsigned int i; - memset(&cl->bstats, 0, sizeof(cl->bstats)); + gnet_stats_basic_sync_init(&cl->bstats); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) { @@ -1323,14 +1324,15 @@ static void htb_offload_aggregate_stats(struct htb_sched *q, if (p != cl) continue; - cl->bstats.bytes += c->bstats_bias.bytes; - cl->bstats.packets += c->bstats_bias.packets; + bytes += u64_stats_read(&c->bstats_bias.bytes); + packets += u64_stats_read(&c->bstats_bias.packets); if (c->level == 0) { - cl->bstats.bytes += c->leaf.q->bstats.bytes; - cl->bstats.packets += c->leaf.q->bstats.packets; + bytes += u64_stats_read(&c->leaf.q->bstats.bytes); + packets += u64_stats_read(&c->leaf.q->bstats.packets); } } } + _bstats_update(&cl->bstats, bytes, packets); } static int @@ -1357,16 +1359,16 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (cl->leaf.q) cl->bstats = cl->leaf.q->bstats; else - memset(&cl->bstats, 0, sizeof(cl->bstats)); - cl->bstats.bytes += cl->bstats_bias.bytes; - cl->bstats.packets += cl->bstats_bias.packets; + gnet_stats_basic_sync_init(&cl->bstats); + _bstats_update(&cl->bstats, + u64_stats_read(&cl->bstats_bias.bytes), + u64_stats_read(&cl->bstats_bias.packets)); } else { htb_offload_aggregate_stats(q, cl); } } - if 
(gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) return -1; @@ -1578,8 +1580,9 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, WARN_ON(old != q); if (cl->parent) { - cl->parent->bstats_bias.bytes += q->bstats.bytes; - cl->parent->bstats_bias.packets += q->bstats.packets; + _bstats_update(&cl->parent->bstats_bias, + u64_stats_read(&q->bstats.bytes), + u64_stats_read(&q->bstats.packets)); } offload_opt = (struct tc_htb_qopt_offload) { @@ -1849,6 +1852,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; + gnet_stats_basic_sync_init(&cl->bstats); + gnet_stats_basic_sync_init(&cl->bstats_bias); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1858,7 +1864,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE] ? : &est.nla); if (err) goto err_block_put; @@ -1922,8 +1928,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, htb_graft_helper(dev_queue, old_q); goto err_kill_estimator; } - parent->bstats_bias.bytes += old_q->bstats.bytes; - parent->bstats_bias.packets += old_q->bstats.packets; + _bstats_update(&parent->bstats_bias, + u64_stats_read(&old_q->bstats.bytes), + u64_stats_read(&old_q->bstats.packets)); qdisc_put(old_q); } new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, @@ -1983,7 +1990,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index e04f1a87642b..83d2e54bf303 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -130,10 +130,9 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; - __u32 qlen = 0; sch->q.qlen = 0; - memset(&sch->bstats, 0, sizeof(sch->bstats)); + gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. 
However, statistics accounting needs @@ -145,25 +144,11 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - qlen = qdisc_qlen_sum(qdisc); - __gnet_stats_copy_basic(NULL, &sch->bstats, - qdisc->cpu_bstats, - &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - qdisc->cpu_qstats, - &qdisc->qstats, qlen); - sch->q.qlen += qlen; - } else { - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; - } + gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } @@ -246,8 +231,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, - &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 0bc10234e306..b29f3453c6ea 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -390,7 +390,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) unsigned int ntx, tc; sch->q.qlen = 0; - memset(&sch->bstats, 0, sizeof(sch->bstats)); + gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. 
However, statistics accounting needs @@ -402,25 +402,11 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - __u32 qlen = qdisc_qlen_sum(qdisc); - - __gnet_stats_copy_basic(NULL, &sch->bstats, - qdisc->cpu_bstats, - &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - qdisc->cpu_qstats, - &qdisc->qstats, qlen); - sch->q.qlen += qlen; - } else { - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; - } + gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } @@ -512,12 +498,13 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, { if (cl >= TC_H_MIN_PRIORITY) { int i; - __u32 qlen = 0; + __u32 qlen; struct gnet_stats_queue qstats = {0}; - struct gnet_stats_basic_packed bstats = {0}; + struct gnet_stats_basic_sync bstats; struct net_device *dev = qdisc_dev(sch); struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; + gnet_stats_basic_sync_init(&bstats); /* Drop lock here it will be reclaimed before touching * statistics this is required because the d->lock we * hold here is the look on dev_queue->qdisc_sleeping @@ -529,37 +516,31 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); struct Qdisc *qdisc = rtnl_dereference(q->qdisc); - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; - struct gnet_stats_queue __percpu *cpu_qstats = NULL; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - cpu_bstats = qdisc->cpu_bstats; - cpu_qstats = qdisc->cpu_qstats; - } - qlen = qdisc_qlen_sum(qdisc); - __gnet_stats_copy_basic(NULL, &sch->bstats, - cpu_bstats, &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - cpu_qstats, - &qdisc->qstats, - qlen); + gnet_stats_add_basic(&bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); + spin_unlock_bh(qdisc_lock(qdisc)); } + qlen = qdisc_qlen(sch) + qstats.qlen; /* Reclaim root sleeping lock before completing stats */ if (d->lock) spin_lock_bh(d->lock); - if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 || gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, - sch->cpu_bstats, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, sch->cpu_bstats, + &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index e282e7382117..cd8ab90c4765 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -338,8 +338,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if 
(gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0c345e43a09a..ecbb10db1111 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -785,7 +785,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, if (!n || n > NETEM_DIST_MAX) return -EINVAL; - d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); + d = kvmalloc(struct_size(d, table, n), GFP_KERNEL); if (!d) return -ENOMEM; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 03fdf31ccb6a..3b8d7197c06b 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -361,8 +361,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, + &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 58a9d42b52b8..0b7f9ba28deb 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -131,7 +131,7 @@ struct qfq_class { unsigned int filter_cnt; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct Qdisc *qdisc; @@ -451,7 +451,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; @@ -465,6 +465,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->deficit = lmax; @@ -477,7 +478,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) goto destroy_class; @@ -639,8 +640,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || qdisc_qstats_copy(d, cl->qdisc) < 0) return -1; @@ -1234,8 +1234,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - cl->bstats.bytes += len; - cl->bstats.packets += gso_segs; + _bstats_update(&cl->bstats, len, gso_segs); sch->qstats.backlog += len; ++sch->q.qlen; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 1ab2fc933a21..9ab068fa2672 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1641,6 +1641,10 @@ static void taprio_destroy(struct Qdisc *sch) list_del(&q->taprio_list); spin_unlock(&taprio_list_lock); + /* Note that taprio_reset() might not be called if an error + * happens in qdisc_create(), after taprio_init() has been called. 
+ */ + hrtimer_cancel(&q->advance_timer); taprio_disable_offload(dev, q, NULL); @@ -1973,7 +1977,7 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 78e79029dc63..72102277449e 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -184,6 +184,20 @@ static int tbf_offload_dump(struct Qdisc *sch) return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt); } +static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new, + struct Qdisc *old, struct netlink_ext_ack *extack) +{ + struct tc_tbf_qopt_offload graft_offload = { + .handle = sch->handle, + .parent = sch->parent, + .child_handle = new->handle, + .command = TC_TBF_GRAFT, + }; + + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old, + TC_SETUP_QDISC_TBF, &graft_offload, extack); +} + /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ @@ -547,6 +561,8 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); + + tbf_offload_graft(sch, new, *old, extack); return 0; } diff --git a/net/sctp/input.c b/net/sctp/input.c index 5ef86fdb1176..1f1786021d9c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -702,7 +702,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. */ - if (ntohs(ch->length) < sizeof(_ch)) + if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index b8fa8f1a7277..c7503fd64915 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3697,7 +3697,7 @@ struct sctp_chunk *sctp_make_strreset_req( outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; - retval = sctp_make_reconf(asoc, outlen + inlen); + retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f69ef3f2019f..5e50e007a7da 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -439,6 +439,47 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) return 0; } +static bool smc_isascii(char *hostname) +{ + int i; + + for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) + if (!isascii(hostname[i])) + return false; + return true; +} + +static void smc_conn_save_peer_info_fce(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)clc; + struct smc_clc_first_contact_ext *fce; + int clc_v2_len; + + if (clc->hdr.version == SMC_V1 || + !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return; + + if (smc->conn.lgr->is_smcd) { + memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid, + SMC_MAX_EID_LEN); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, + d1); + } else { + memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid, + SMC_MAX_EID_LEN); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, + r1); + } + fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len); + 
smc->conn.lgr->peer_os = fce->os_type; + smc->conn.lgr->peer_smc_release = fce->release; + if (smc_isascii(fce->hostname)) + memcpy(smc->conn.lgr->peer_hostname, fce->hostname, + SMC_MAX_HOSTNAME_LEN); +} + static void smcr_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { @@ -451,16 +492,6 @@ static void smcr_conn_save_peer_info(struct smc_sock *smc, smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } -static bool smc_isascii(char *hostname) -{ - int i; - - for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) - if (!isascii(hostname[i])) - return false; - return true; -} - static void smcd_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { @@ -472,22 +503,6 @@ static void smcd_conn_save_peer_info(struct smc_sock *smc, smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; - if (clc->hdr.version > SMC_V1 && - (clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)clc; - struct smc_clc_first_contact_ext *fce = - (struct smc_clc_first_contact_ext *) - (((u8 *)clc_v2) + sizeof(*clc_v2)); - - memcpy(smc->conn.lgr->negotiated_eid, clc_v2->eid, - SMC_MAX_EID_LEN); - smc->conn.lgr->peer_os = fce->os_type; - smc->conn.lgr->peer_smc_release = fce->release; - if (smc_isascii(fce->hostname)) - memcpy(smc->conn.lgr->peer_hostname, fce->hostname, - SMC_MAX_HOSTNAME_LEN); - } } static void smc_conn_save_peer_info(struct smc_sock *smc, @@ -497,14 +512,16 @@ static void smc_conn_save_peer_info(struct smc_sock *smc, smcd_conn_save_peer_info(smc, clc); else smcr_conn_save_peer_info(smc, clc); + smc_conn_save_peer_info_fce(smc, clc); } static void smc_link_save_peer_info(struct smc_link *link, - struct smc_clc_msg_accept_confirm *clc) + struct smc_clc_msg_accept_confirm *clc, + struct smc_init_info *ini) { link->peer_qpn = ntoh24(clc->r0.qpn); - memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE); - memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac)); + memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; } @@ -608,7 +625,9 @@ static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) * used for the internal TCP socket */ smc_pnet_find_roce_resource(smc->clcsock->sk, ini); - if (!ini->ib_dev) + if (!ini->check_smcrv2 && !ini->ib_dev) + return SMC_CLC_DECL_NOSMCRDEV; + if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2) return SMC_CLC_DECL_NOSMCRDEV; return 0; } @@ -692,27 +711,42 @@ static int smc_find_proposal_devices(struct smc_sock *smc, int rc = 0; /* check if there is an ism device available */ - if (ini->smcd_version & SMC_V1) { - if (smc_find_ism_device(smc, ini) || - smc_connect_ism_vlan_setup(smc, ini)) { - if (ini->smc_type_v1 == SMC_TYPE_B) - ini->smc_type_v1 = SMC_TYPE_R; - else - ini->smc_type_v1 = SMC_TYPE_N; - } /* else ISM V1 is supported for this connection */ - if (smc_find_rdma_device(smc, ini)) { - if (ini->smc_type_v1 == SMC_TYPE_B) - ini->smc_type_v1 = SMC_TYPE_D; - else - ini->smc_type_v1 = SMC_TYPE_N; - } /* else RDMA is supported for this connection */ - } - if (smc_ism_is_v2_capable() && smc_find_ism_v2_device_clnt(smc, ini)) - ini->smc_type_v2 = SMC_TYPE_N; + if (!(ini->smcd_version & SMC_V1) || + smc_find_ism_device(smc, ini) || + 
smc_connect_ism_vlan_setup(smc, ini)) + ini->smcd_version &= ~SMC_V1; + /* else ISM V1 is supported for this connection */ + + /* check if there is an rdma device available */ + if (!(ini->smcr_version & SMC_V1) || + smc_find_rdma_device(smc, ini)) + ini->smcr_version &= ~SMC_V1; + /* else RDMA is supported for this connection */ + + ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1, + ini->smcr_version & SMC_V1); + + /* check if there is an ism v2 device available */ + if (!(ini->smcd_version & SMC_V2) || + !smc_ism_is_v2_capable() || + smc_find_ism_v2_device_clnt(smc, ini)) + ini->smcd_version &= ~SMC_V2; + + /* check if there is an rdma v2 device available */ + ini->check_smcrv2 = true; + ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; + if (!(ini->smcr_version & SMC_V2) || + smc->clcsock->sk->sk_family != AF_INET || + !smc_clc_ueid_count() || + smc_find_rdma_device(smc, ini)) + ini->smcr_version &= ~SMC_V2; + ini->check_smcrv2 = false; + + ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2, + ini->smcr_version & SMC_V2); /* if neither ISM nor RDMA are supported, fallback */ - if (!smcr_indicated(ini->smc_type_v1) && - ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) + if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) rc = SMC_CLC_DECL_NOSMCDEV; return rc; @@ -752,6 +786,64 @@ static int smc_connect_clc(struct smc_sock *smc, SMC_CLC_ACCEPT, CLC_WAIT_TIME); } +void smc_fill_gid_list(struct smc_link_group *lgr, + struct smc_gidlist *gidlist, + struct smc_ib_device *known_dev, u8 *known_gid) +{ + struct smc_init_info *alt_ini = NULL; + + memset(gidlist, 0, sizeof(*gidlist)); + memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE); + + alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL); + if (!alt_ini) + goto out; + + alt_ini->vlan_id = lgr->vlan_id; + alt_ini->check_smcrv2 = true; + alt_ini->smcrv2.saddr = lgr->saddr; + smc_pnet_find_alt_roce(lgr, alt_ini, known_dev); + + if (!alt_ini->smcrv2.ib_dev_v2) + goto out; + + memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2, + SMC_GID_SIZE); + +out: + kfree(alt_ini); +} + +static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + struct smc_clc_first_contact_ext *fce = + (struct smc_clc_first_contact_ext *) + (((u8 *)clc_v2) + sizeof(*clc_v2)); + + if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) + return 0; + + if (fce->v2_direct) { + memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); + ini->smcrv2.uses_gateway = false; + } else { + if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr, + smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), + ini->smcrv2.nexthop_mac, + &ini->smcrv2.uses_gateway)) + return SMC_CLC_DECL_NOROUTE; + if (!ini->smcrv2.uses_gateway) { + /* mismatch: peer claims indirect, but its direct */ + return SMC_CLC_DECL_NOINDIRECT; + } + } + return 0; +} + /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, @@ -759,11 +851,18 @@ static int smc_connect_rdma(struct smc_sock *smc, { int i, reason_code = 0; struct smc_link *link; + u8 *eid = NULL; ini->is_smcd = false; - ini->ib_lcl = &aclc->r0.lcl; ini->ib_clcqpn = ntoh24(aclc->r0.qpn); ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; + memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); + 
memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); + memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); + + reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); + if (reason_code) + return reason_code; mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); @@ -785,8 +884,9 @@ static int smc_connect_rdma(struct smc_sock *smc, if (l->peer_qpn == ntoh24(aclc->r0.qpn) && !memcmp(l->peer_gid, &aclc->r0.lcl.gid, SMC_GID_SIZE) && - !memcmp(l->peer_mac, &aclc->r0.lcl.mac, - sizeof(l->peer_mac))) { + (aclc->hdr.version > SMC_V1 || + !memcmp(l->peer_mac, &aclc->r0.lcl.mac, + sizeof(l->peer_mac)))) { link = l; break; } @@ -805,7 +905,7 @@ static int smc_connect_rdma(struct smc_sock *smc, } if (ini->first_contact_local) - smc_link_save_peer_info(link, aclc); + smc_link_save_peer_info(link, aclc, ini); if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; @@ -828,8 +928,18 @@ static int smc_connect_rdma(struct smc_sock *smc, } smc_rmb_sync_sg_for_device(&smc->conn); + if (aclc->hdr.version > SMC_V1) { + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + eid = clc_v2->r1.eid; + if (ini->first_contact_local) + smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, + link->smcibdev, link->gid); + } + reason_code = smc_clc_send_confirm(smc, ini->first_contact_local, - SMC_V1, NULL); + aclc->hdr.version, eid, ini); if (reason_code) goto connect_abort; @@ -869,7 +979,7 @@ smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc, int i; for (i = 0; i < ini->ism_offered_cnt + 1; i++) { - if (ini->ism_chid[i] == ntohs(aclc->chid)) { + if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { ini->ism_selected = i; return 0; } @@ -923,11 +1033,11 @@ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm_v2 *clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)aclc; - eid = clc_v2->eid; + eid = clc_v2->d1.eid; } rc = smc_clc_send_confirm(smc, ini->first_contact_local, - aclc->hdr.version, eid); + aclc->hdr.version, eid, NULL); if (rc) goto connect_abort; mutex_unlock(&smc_server_lgr_pending); @@ -950,17 +1060,24 @@ connect_abort: static int smc_connect_check_aclc(struct smc_init_info *ini, struct smc_clc_msg_accept_confirm *aclc) { - if ((aclc->hdr.typev1 == SMC_TYPE_R && - !smcr_indicated(ini->smc_type_v1)) || - (aclc->hdr.typev1 == SMC_TYPE_D && - ((!smcd_indicated(ini->smc_type_v1) && - !smcd_indicated(ini->smc_type_v2)) || - (aclc->hdr.version == SMC_V1 && - !smcd_indicated(ini->smc_type_v1)) || - (aclc->hdr.version == SMC_V2 && - !smcd_indicated(ini->smc_type_v2))))) + if (aclc->hdr.typev1 != SMC_TYPE_R && + aclc->hdr.typev1 != SMC_TYPE_D) return SMC_CLC_DECL_MODEUNSUPP; + if (aclc->hdr.version >= SMC_V2) { + if ((aclc->hdr.typev1 == SMC_TYPE_R && + !smcr_indicated(ini->smc_type_v2)) || + (aclc->hdr.typev1 == SMC_TYPE_D && + !smcd_indicated(ini->smc_type_v2))) + return SMC_CLC_DECL_MODEUNSUPP; + } else { + if ((aclc->hdr.typev1 == SMC_TYPE_R && + !smcr_indicated(ini->smc_type_v1)) || + (aclc->hdr.typev1 == SMC_TYPE_D && + !smcd_indicated(ini->smc_type_v1))) + return SMC_CLC_DECL_MODEUNSUPP; + } + return 0; } @@ -991,14 +1108,15 @@ static int __smc_connect(struct smc_sock *smc) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM, version); - ini->smcd_version = SMC_V1; - ini->smcd_version |= smc_ism_is_v2_capable() ? 
SMC_V2 : 0; + ini->smcd_version = SMC_V1 | SMC_V2; + ini->smcr_version = SMC_V1 | SMC_V2; ini->smc_type_v1 = SMC_TYPE_B; - ini->smc_type_v2 = smc_ism_is_v2_capable() ? SMC_TYPE_D : SMC_TYPE_N; + ini->smc_type_v2 = SMC_TYPE_B; /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { ini->smcd_version &= ~SMC_V1; + ini->smcr_version = 0; ini->smc_type_v1 = SMC_TYPE_N; if (!ini->smcd_version) { rc = SMC_CLC_DECL_GETVLANERR; @@ -1026,15 +1144,17 @@ static int __smc_connect(struct smc_sock *smc) /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2; - ini->smcd_version = version; if (rc) goto vlan_cleanup; /* depending on previous steps, connect using rdma or ism */ - if (aclc->hdr.typev1 == SMC_TYPE_R) + if (aclc->hdr.typev1 == SMC_TYPE_R) { + ini->smcr_version = version; rc = smc_connect_rdma(smc, aclc, ini); - else if (aclc->hdr.typev1 == SMC_TYPE_D) + } else if (aclc->hdr.typev1 == SMC_TYPE_D) { + ini->smcd_version = version; rc = smc_connect_ism(smc, aclc, ini); + } if (rc) goto vlan_cleanup; @@ -1315,7 +1435,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link); + smc_llc_srv_add_link(link, NULL); return 0; } @@ -1395,33 +1515,48 @@ static int smc_listen_v2_check(struct smc_sock *new_smc, ini->smc_type_v1 = pclc->hdr.typev1; ini->smc_type_v2 = pclc->hdr.typev2; - ini->smcd_version = ini->smc_type_v1 != SMC_TYPE_N ? SMC_V1 : 0; - if (pclc->hdr.version > SMC_V1) - ini->smcd_version |= - ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0; - if (!(ini->smcd_version & SMC_V2)) { + ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0; + ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? 
SMC_V1 : 0; + if (pclc->hdr.version > SMC_V1) { + if (smcd_indicated(ini->smc_type_v2)) + ini->smcd_version |= SMC_V2; + if (smcr_indicated(ini->smc_type_v2)) + ini->smcr_version |= SMC_V2; + } + if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) { rc = SMC_CLC_DECL_PEERNOSMC; goto out; } - if (!smc_ism_is_v2_capable()) { - ini->smcd_version &= ~SMC_V2; - rc = SMC_CLC_DECL_NOISM2SUPP; - goto out; - } pclc_v2_ext = smc_get_clc_v2_ext(pclc); if (!pclc_v2_ext) { ini->smcd_version &= ~SMC_V2; + ini->smcr_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOV2EXT; goto out; } pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); - if (!pclc_smcd_v2_ext) { - ini->smcd_version &= ~SMC_V2; - rc = SMC_CLC_DECL_NOV2DEXT; + if (ini->smcd_version & SMC_V2) { + if (!smc_ism_is_v2_capable()) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOISM2SUPP; + } else if (!pclc_smcd_v2_ext) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2DEXT; + } else if (!pclc_v2_ext->hdr.eid_cnt && + !pclc_v2_ext->hdr.flag.seid) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOUEID; + } + } + if (ini->smcr_version & SMC_V2) { + if (!pclc_v2_ext->hdr.eid_cnt) { + ini->smcr_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOUEID; + } } out: - if (!ini->smcd_version) + if (!ini->smcd_version && !ini->smcr_version) return rc; return 0; @@ -1541,10 +1676,6 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); - if (!smcd_v2_ext) { - smc_find_ism_store_rc(SMC_CLC_DECL_NOV2DEXT, ini); - goto not_found; - } mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) @@ -1562,8 +1693,10 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, } mutex_unlock(&smcd_dev_list.mutex); - if (!ini->ism_dev[0]) + if (!ini->ism_dev[0]) { + smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); goto not_found; + } smc_ism_get_system_eid(&eid); if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, @@ -1616,6 +1749,7 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, not_found: smc_find_ism_store_rc(rc, ini); + ini->smcd_version &= ~SMC_V1; ini->ism_dev[0] = NULL; ini->is_smcd = false; } @@ -1634,24 +1768,69 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) return 0; } +static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_v2_extension *smc_v2_ext; + u8 smcr_version; + int rc; + + if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) + goto not_found; + + smc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + goto not_found; + + /* prepare RDMA check */ + memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE); + memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); + ini->check_smcrv2 = true; + ini->smcrv2.clc_sk = new_smc->clcsock->sk; + ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); + rc = smc_find_rdma_device(new_smc, ini); + if (rc) { + smc_find_ism_store_rc(rc, ini); + goto not_found; + } + if (!ini->smcrv2.uses_gateway) + memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); + + smcr_version = ini->smcr_version; + ini->smcr_version = SMC_V2; + rc = smc_listen_rdma_init(new_smc, ini); + if (!rc) + rc = 
smc_listen_rdma_reg(new_smc, ini->first_contact_local); + if (!rc) + return; + ini->smcr_version = smcr_version; + smc_find_ism_store_rc(rc, ini); + +not_found: + ini->smcr_version &= ~SMC_V2; + ini->check_smcrv2 = false; +} + static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { int rc; - if (!smcr_indicated(ini->smc_type_v1)) + if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1)) return SMC_CLC_DECL_NOSMCDEV; /* prepare RDMA check */ - ini->ib_lcl = &pclc->lcl; + memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE); + memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); rc = smc_find_rdma_device(new_smc, ini); if (rc) { /* no RDMA device found */ - if (ini->smc_type_v1 == SMC_TYPE_B) - /* neither ISM nor RDMA device found */ - rc = SMC_CLC_DECL_NOSMCDEV; - return rc; + return SMC_CLC_DECL_NOSMCDEV; } rc = smc_listen_rdma_init(new_smc, ini); if (rc) @@ -1664,51 +1843,60 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { - int rc; + int prfx_rc; /* check for ISM device matching V2 proposed device */ smc_find_ism_v2_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; - if (!(ini->smcd_version & SMC_V1)) - return ini->rc ?: SMC_CLC_DECL_NOSMCD2DEV; - - /* check for matching IP prefix and subnet length */ - rc = smc_listen_prfx_check(new_smc, pclc); - if (rc) - return ini->rc ?: rc; + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) return ini->rc ?: SMC_CLC_DECL_GETVLANERR; /* check for ISM device matching V1 proposed device */ - smc_find_ism_v1_device_serv(new_smc, pclc, ini); + if (!prfx_rc) + smc_find_ism_v1_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; - if (pclc->hdr.typev1 == SMC_TYPE_D) + if (!smcr_indicated(pclc->hdr.typev1) && + !smcr_indicated(pclc->hdr.typev2)) /* skip RDMA and decline */ return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; - /* check if RDMA is available */ - rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); - smc_find_ism_store_rc(rc, ini); + /* check if RDMA V2 is available */ + smc_find_rdma_v2_device_serv(new_smc, pclc, ini); + if (ini->smcrv2.ib_dev_v2) + return 0; - return (!rc) ? 0 : ini->rc; + /* check if RDMA V1 is available */ + if (!prfx_rc) { + int rc; + + rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); + smc_find_ism_store_rc(rc, ini); + return (!rc) ? 0 : ini->rc; + } + return SMC_CLC_DECL_NOSMCDEV; } /* listen worker: finish RDMA setup */ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, - bool local_first) + bool local_first, + struct smc_init_info *ini) { struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_first) - smc_link_save_peer_info(link, cclc); + smc_link_save_peer_info(link, cclc, ini); if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) return SMC_CLC_DECL_ERR_RTOK; @@ -1729,12 +1917,13 @@ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); - u8 version = smc_ism_is_v2_capable() ? 
SMC_V2 : SMC_V1; struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm *cclc; struct smc_clc_msg_proposal_area *buf; struct smc_clc_msg_proposal *pclc; struct smc_init_info *ini = NULL; + u8 proposal_version = SMC_V1; + u8 accept_version; int rc = 0; if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) @@ -1765,7 +1954,9 @@ static void smc_listen_work(struct work_struct *work) SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (rc) goto out_decl; - version = pclc->hdr.version == SMC_V1 ? SMC_V1 : version; + + if (pclc->hdr.version > SMC_V1) + proposal_version = SMC_V2; /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(new_smc)) { @@ -1795,9 +1986,9 @@ static void smc_listen_work(struct work_struct *work) goto out_unlock; /* send SMC Accept CLC message */ + accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; rc = smc_clc_send_accept(new_smc, ini->first_contact_local, - ini->smcd_version == SMC_V2 ? SMC_V2 : SMC_V1, - ini->negotiated_eid); + accept_version, ini->negotiated_eid); if (rc) goto out_unlock; @@ -1819,7 +2010,7 @@ static void smc_listen_work(struct work_struct *work) /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, - ini->first_contact_local); + ini->first_contact_local, ini); if (rc) goto out_unlock; mutex_unlock(&smc_server_lgr_pending); @@ -1833,7 +2024,7 @@ out_unlock: mutex_unlock(&smc_server_lgr_pending); out_decl: smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, - version); + proposal_version); out_free: kfree(ini); kfree(buf); diff --git a/net/smc/smc.h b/net/smc/smc.h index 5e7def3ab730..f4286ca1f228 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -56,7 +56,20 @@ enum smc_state { /* possible states of an SMC socket */ struct smc_link_group; struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ - u8 type; + union { + u8 type; +#if defined(__BIG_ENDIAN_BITFIELD) + struct { + u8 llc_version:4, + llc_type:4; + }; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + struct { + u8 llc_type:4, + llc_version:4; + }; +#endif + }; } __aligned(1); struct smc_cdc_conn_state_flags { @@ -286,7 +299,12 @@ static inline bool using_ipsec(struct smc_sock *smc) } #endif +struct smc_gidlist; + struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); void smc_close_non_accepted(struct sock *sk); +void smc_fill_gid_list(struct smc_link_group *lgr, + struct smc_gidlist *gidlist, + struct smc_ib_device *known_dev, u8 *known_gid); #endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index f23f558054a7..99acd337ba90 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -150,9 +150,11 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) again: link = conn->lnk; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); if (rc) - return rc; + goto put_out; spin_lock_bh(&conn->send_lock); if (link != conn->lnk) { @@ -160,6 +162,7 @@ again: spin_unlock_bh(&conn->send_lock); smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); + smc_wr_tx_link_put(link); if (again) return -ENOLINK; again = true; @@ -167,6 +170,8 @@ again: } rc = smc_cdc_msg_send(conn, wr_buf, pend); spin_unlock_bh(&conn->send_lock); +put_out: + smc_wr_tx_link_put(link); return rc; } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 4afd9e71e5c2..8409ab71a5e4 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -31,6 +31,7 @@ #define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 #define 
SMCD_CLC_ACCEPT_CONFIRM_LEN 48 #define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78 +#define SMCR_CLC_ACCEPT_CONFIRM_LEN_V2 108 #define SMC_CLC_RECV_BUF_LEN 100 /* eye catcher "SMCR" EBCDIC for CLC messages */ @@ -114,6 +115,17 @@ err_out: return rc; } +int smc_clc_ueid_count(void) +{ + int count; + + read_lock(&smc_clc_eid_table.lock); + count = smc_clc_eid_table.ueid_cnt; + read_unlock(&smc_clc_eid_table.lock); + + return count; +} + int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info) { struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY]; @@ -298,7 +310,8 @@ bool smc_clc_match_eid(u8 *negotiated_eid, negotiated_eid[0] = 0; read_lock(&smc_clc_eid_table.lock); - if (smc_clc_eid_table.seid_enabled && + if (peer_eid && local_eid && + smc_clc_eid_table.seid_enabled && smc_v2_ext->hdr.flag.seid && !memcmp(peer_eid, local_eid, SMC_MAX_EID_LEN)) { memcpy(negotiated_eid, peer_eid, SMC_MAX_EID_LEN); @@ -380,6 +393,27 @@ smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 + sizeof(struct smc_clc_first_contact_ext))) return false; + if (hdr->typev1 == SMC_TYPE_R && + ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) + return false; + } + return true; +} + +/* check arriving CLC decline */ +static bool +smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) +{ + struct smc_clc_msg_hdr *hdr = &dclc->hdr; + + if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) + return false; + if (hdr->version == SMC_V1) { + if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline)) + return false; + } else { + if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline_v2)) + return false; } return true; } @@ -425,9 +459,9 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; - if (ntohs(dclc->hdr.length) != sizeof(*dclc)) + if (!smc_clc_msg_decl_valid(dclc)) return false; - trl = &dclc->trl; + check_trl = false; break; default: return false; @@ -510,7 +544,8 @@ static int smc_clc_prfx_set(struct socket *clcsock, goto out_rel; } /* get address to which the internal TCP socket is bound */ - kernel_getsockname(clcsock, (struct sockaddr *)&addrs); + if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) + goto out_rel; /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; rcu_read_lock(); @@ -725,15 +760,16 @@ out: /* send CLC DECLINE message across internal TCP socket */ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) { - struct smc_clc_msg_decline dclc; + struct smc_clc_msg_decline *dclc_v1; + struct smc_clc_msg_decline_v2 dclc; struct msghdr msg; + int len, send_len; struct kvec vec; - int len; + dclc_v1 = (struct smc_clc_msg_decline *)&dclc; memset(&dclc, 0, sizeof(dclc)); memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); dclc.hdr.type = SMC_CLC_DECLINE; - dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = version; dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 
@@ -743,14 +779,22 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); - memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + if (version == SMC_V1) { + memcpy(dclc_v1->trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + send_len = sizeof(*dclc_v1); + } else { + memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + send_len = sizeof(dclc); + } + dclc.hdr.length = htons(send_len); memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; - vec.iov_len = sizeof(struct smc_clc_msg_decline); - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, - sizeof(struct smc_clc_msg_decline)); - if (len < 0 || len < sizeof(struct smc_clc_msg_decline)) + vec.iov_len = send_len; + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); + if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 0 : len; } @@ -832,8 +876,8 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) } else { struct smc_clc_eid_entry *ueident; u16 v2_ext_offset; - u8 *eid = NULL; + v2_ext->hdr.flag.release = SMC_RELEASE; v2_ext_offset = sizeof(*pclc_smcd) - offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); if (ini->smc_type_v1 != SMC_TYPE_N) @@ -841,6 +885,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0]); pclc_smcd->v2_ext_offset = htons(v2_ext_offset); + plen += sizeof(*v2_ext); read_lock(&smc_clc_eid_table.lock); v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt; @@ -850,10 +895,13 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) memcpy(v2_ext->user_eids[i++], ueident->eid, sizeof(ueident->eid)); } - v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled; read_unlock(&smc_clc_eid_table.lock); + } + if (smcd_indicated(ini->smc_type_v2)) { + u8 *eid = NULL; + + v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled; v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt; - v2_ext->hdr.flag.release = SMC_RELEASE; v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) - offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) + @@ -861,7 +909,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) smc_ism_get_system_eid(&eid); if (eid && v2_ext->hdr.flag.seid) memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN); - plen += sizeof(*v2_ext) + sizeof(*smcd_v2_ext); + plen += sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { gidchids[i - 1].gid = @@ -873,6 +921,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) sizeof(struct smc_clc_smcd_gid_chid); } } + if (smcr_indicated(ini->smc_type_v2)) + memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); + pclc_base->hdr.length = htons(plen); memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -896,12 +947,14 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) vec[i].iov_base = v2_ext; vec[i++].iov_len = sizeof(*v2_ext) + (v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); - vec[i].iov_base = smcd_v2_ext; - vec[i++].iov_len = sizeof(*smcd_v2_ext); - if (ini->ism_offered_cnt) { - vec[i].iov_base = gidchids; - vec[i++].iov_len = ini->ism_offered_cnt * + if (smcd_indicated(ini->smc_type_v2)) { + vec[i].iov_base = smcd_v2_ext; + vec[i++].iov_len = sizeof(*smcd_v2_ext); + if (ini->ism_offered_cnt) { + vec[i].iov_base = gidchids; + vec[i++].iov_len 
= ini->ism_offered_cnt * sizeof(struct smc_clc_smcd_gid_chid); + } } } vec[i].iov_base = trl; @@ -924,13 +977,14 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) static int smc_clc_send_confirm_accept(struct smc_sock *smc, struct smc_clc_msg_accept_confirm_v2 *clc_v2, int first_contact, u8 version, - u8 *eid) + u8 *eid, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; struct smc_clc_msg_accept_confirm *clc; struct smc_clc_first_contact_ext fce; + struct smc_clc_fce_gid_ext gle; struct smc_clc_msg_trail trl; - struct kvec vec[3]; + struct kvec vec[5]; struct msghdr msg; int i, len; @@ -952,9 +1006,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, if (version == SMC_V1) { clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); } else { - clc_v2->chid = htons(smc_ism_get_chid(conn->lgr->smcd)); - if (eid[0]) - memcpy(clc_v2->eid, eid, SMC_MAX_EID_LEN); + clc_v2->d1.chid = + htons(smc_ism_get_chid(conn->lgr->smcd)); + if (eid && eid[0]) + memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN); len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) smc_clc_fill_fce(&fce, &len); @@ -993,6 +1048,26 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(clc->r0.psn, link->psn_initial); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + } else { + if (eid && eid[0]) + memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN); + len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) { + smc_clc_fill_fce(&fce, &len); + fce.v2_direct = !link->lgr->uses_gateway; + memset(&gle, 0, sizeof(gle)); + if (ini && clc->hdr.type == SMC_CLC_CONFIRM) { + gle.gid_cnt = ini->smcrv2.gidlist.len; + len += sizeof(gle); + len += gle.gid_cnt * sizeof(gle.gid[0]); + } else { + len += sizeof(gle.reserved); + } + } + clc_v2->hdr.length = htons(len); + } memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); } @@ -1000,7 +1075,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, i = 0; vec[i].iov_base = clc_v2; if (version > SMC_V1) - vec[i++].iov_len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 - sizeof(trl); + vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? + SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 : + SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) - + sizeof(trl); else vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? 
SMCD_CLC_ACCEPT_CONFIRM_LEN : @@ -1009,6 +1087,18 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, if (version > SMC_V1 && first_contact) { vec[i].iov_base = &fce; vec[i++].iov_len = sizeof(fce); + if (!conn->lgr->is_smcd) { + if (clc->hdr.type == SMC_CLC_CONFIRM) { + vec[i].iov_base = &gle; + vec[i++].iov_len = sizeof(gle); + vec[i].iov_base = &ini->smcrv2.gidlist.list; + vec[i++].iov_len = gle.gid_cnt * + sizeof(gle.gid[0]); + } else { + vec[i].iov_base = &gle.reserved; + vec[i++].iov_len = sizeof(gle.reserved); + } + } } vec[i].iov_base = &trl; vec[i++].iov_len = sizeof(trl); @@ -1018,7 +1108,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, /* send CLC CONFIRM message across internal TCP socket */ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, - u8 version, u8 *eid) + u8 version, u8 *eid, struct smc_init_info *ini) { struct smc_clc_msg_accept_confirm_v2 cclc_v2; int reason_code = 0; @@ -1028,7 +1118,7 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, memset(&cclc_v2, 0, sizeof(cclc_v2)); cclc_v2.hdr.type = SMC_CLC_CONFIRM; len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact, - version, eid); + version, eid, ini); if (len < ntohs(cclc_v2.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; @@ -1051,7 +1141,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, memset(&aclc_v2, 0, sizeof(aclc_v2)); aclc_v2.hdr.type = SMC_CLC_ACCEPT; len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact, - version, negotiated_eid); + version, negotiated_eid, NULL); if (len < ntohs(aclc_v2.hdr.length)) len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 974d01d16bb5..83f02f131fc0 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -44,6 +44,7 @@ #define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */ #define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ #define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ +#define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -54,6 +55,8 @@ #define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */ #define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */ +#define SMC_CLC_DECL_NOROUTE 0x030e0000 /* SMC-Rv2 conn. no route to peer */ +#define SMC_CLC_DECL_NOINDIRECT 0x030f0000 /* SMC-Rv2 conn. 
indirect mismatch*/ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ @@ -213,11 +216,14 @@ struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */ #define SMC_CLC_OS_AIX 3 struct smc_clc_first_contact_ext { - u8 reserved1; #if defined(__BIG_ENDIAN_BITFIELD) + u8 v2_direct : 1, + reserved : 7; u8 os_type : 4, release : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 7, + v2_direct : 1; u8 release : 4, os_type : 4; #endif @@ -225,6 +231,13 @@ struct smc_clc_first_contact_ext { u8 hostname[SMC_MAX_HOSTNAME_LEN]; }; +struct smc_clc_fce_gid_ext { + u8 reserved[16]; + u8 gid_cnt; + u8 reserved2[3]; + u8 gid[][SMC_GID_SIZE]; +}; + struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; union { @@ -239,13 +252,17 @@ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; union { - struct smcr_clc_msg_accept_confirm r0; /* SMC-R */ + struct { /* SMC-R */ + struct smcr_clc_msg_accept_confirm r0; + u8 eid[SMC_MAX_EID_LEN]; + u8 reserved6[8]; + } r1; struct { /* SMC-D */ struct smcd_clc_msg_accept_confirm_common d0; __be16 chid; u8 eid[SMC_MAX_EID_LEN]; u8 reserved5[8]; - }; + } d1; }; }; @@ -264,6 +281,24 @@ struct smc_clc_msg_decline { /* clc decline message */ struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ } __aligned(4); +#define SMC_DECL_DIAG_COUNT_V2 4 /* no. of additional peer diagnosis codes */ + +struct smc_clc_msg_decline_v2 { /* clc decline message */ + struct smc_clc_msg_hdr hdr; + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ + __be32 peer_diagnosis; /* diagnosis information */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 os_type : 4, + reserved : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 4, + os_type : 4; +#endif + u8 reserved2[3]; + __be32 peer_diagnosis_v2[SMC_DECL_DIAG_COUNT_V2]; + struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ +} __aligned(4); + /* determine start of the prefix area within the proposal message */ static inline struct smc_clc_msg_proposal_prefix * smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) @@ -282,6 +317,17 @@ static inline bool smcd_indicated(int smc_type) return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B; } +static inline u8 smc_indicated_type(int is_smcd, int is_smcr) +{ + if (is_smcd && is_smcr) + return SMC_TYPE_B; + if (is_smcd) + return SMC_TYPE_D; + if (is_smcr) + return SMC_TYPE_R; + return SMC_TYPE_N; +} + /* get SMC-D info from proposal message */ static inline struct smc_clc_msg_smcd * smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) @@ -334,7 +380,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version); int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, - u8 version, u8 *eid); + u8 version, u8 *eid, struct smc_init_info *ini); int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, u8 version, u8 *negotiated_eid); void smc_clc_init(void) __init; @@ -343,6 +389,7 @@ void smc_clc_get_hostname(u8 **host); bool smc_clc_match_eid(u8 *negotiated_eid, struct smc_clc_v2_extension *smc_v2_ext, u8 *peer_eid, u8 *local_eid); 
+int smc_clc_ueid_count(void); int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb); int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info); int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4d0fcd8f8eba..8e642f8f334f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -244,6 +244,8 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable())) goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true)) + goto errattr; smc_clc_get_hostname(&host); if (host) { memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); @@ -271,12 +273,65 @@ errmsg: return skb->len; } +/* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes */ +static int smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + struct nlattr *v2_attrs) +{ + char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_eid[SMC_MAX_EID_LEN + 1]; + + if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) + goto errv2attr; + memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); + smc_host[SMC_MAX_HOSTNAME_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) + goto errv2attr; + memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); + smc_eid[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); + return -EMSGSIZE; +} + +static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *v2_attrs; + + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2); + if (!v2_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); +errattr: + return -EMSGSIZE; +} + static int smc_nl_fill_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { char smc_target[SMC_MAX_PNETID_LEN + 1]; - struct nlattr *attrs; + struct nlattr *attrs, *v2_attrs; attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR); if (!attrs) @@ -296,6 +351,15 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) goto errattr; + if (lgr->smc_version > SMC_V1) { + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON); + if (!v2_attrs) + goto errattr; + if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) + goto errattr; + if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb)) + goto errattr; + } nla_nest_end(skb, attrs); return 0; @@ -428,10 +492,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { - char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; char smc_pnet[SMC_MAX_PNETID_LEN + 1]; - char smc_eid[SMC_MAX_EID_LEN + 1]; - struct nlattr *v2_attrs; struct nlattr *attrs; void *nlh; @@ -463,32 +524,19 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) goto errattr; + if (lgr->smc_version > SMC_V1) { + struct nlattr *v2_attrs; - 
v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_V2); - if (!v2_attrs) - goto errattr; - if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) - goto errv2attr; - if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) - goto errv2attr; - if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) - goto errv2attr; - memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); - smc_host[SMC_MAX_HOSTNAME_LEN] = 0; - if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) - goto errv2attr; - memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); - smc_eid[SMC_MAX_EID_LEN] = 0; - if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) - goto errv2attr; - - nla_nest_end(skb, v2_attrs); + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON); + if (!v2_attrs) + goto errattr; + if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) + goto errattr; + } nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; -errv2attr: - nla_nest_cancel(skb, v2_attrs); errattr: nla_nest_cancel(skb, attrs); errout: @@ -684,24 +732,30 @@ static void smcr_copy_dev_info_to_link(struct smc_link *link) int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini) { + struct smc_ib_device *smcibdev; u8 rndvec[3]; int rc; - get_device(&ini->ib_dev->ibdev->dev); - atomic_inc(&ini->ib_dev->lnk_cnt); + if (lgr->smc_version == SMC_V2) { + lnk->smcibdev = ini->smcrv2.ib_dev_v2; + lnk->ibport = ini->smcrv2.ib_port_v2; + } else { + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + } + get_device(&lnk->smcibdev->ibdev->dev); + atomic_inc(&lnk->smcibdev->lnk_cnt); + lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; lnk->link_idx = link_idx; - lnk->smcibdev = ini->ib_dev; - lnk->ibport = ini->ib_port; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); - lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; atomic_set(&lnk->conn_cnt, 0); smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); - if (!ini->ib_dev->initialized) { - rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); + if (!lnk->smcibdev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev); if (rc) goto out; } @@ -709,7 +763,9 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - ini->vlan_id, lnk->gid, &lnk->sgid_index); + ini->vlan_id, lnk->gid, &lnk->sgid_index, + lgr->smc_version == SMC_V2 ? + &ini->smcrv2 : NULL); if (rc) goto out; rc = smc_llc_link_init(lnk); @@ -740,11 +796,12 @@ clear_llc_lnk: smc_llc_link_clear(lnk, false); out: smc_ibdev_cnt_dec(lnk); - put_device(&ini->ib_dev->ibdev->dev); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; - if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) - wake_up(&ini->ib_dev->lnks_deleted); + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); return rc; } @@ -808,18 +865,37 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); } else { /* SMC-R specific settings */ + struct smc_ib_device *ibdev; + int ibport; + lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; - memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, + lgr->smc_version = ini->smcr_version; + memcpy(lgr->peer_systemid, ini->peer_systemid, SMC_SYSTEMID_LEN); - memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1], + if (lgr->smc_version == SMC_V2) { + ibdev = ini->smcrv2.ib_dev_v2; + ibport = ini->smcrv2.ib_port_v2; + lgr->saddr = ini->smcrv2.saddr; + lgr->uses_gateway = ini->smcrv2.uses_gateway; + memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, + ETH_ALEN); + } else { + ibdev = ini->ib_dev; + ibport = ini->ib_port; + } + memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); + if (smc_wr_alloc_lgr_mem(lgr)) + goto free_wq; smc_llc_lgr_init(lgr, smc); link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); - if (rc) + if (rc) { + smc_wr_free_lgr_mem(lgr); goto free_wq; + } lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); @@ -943,7 +1019,7 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, to_lnk = &lgr->lnk[i]; break; } - if (!to_lnk) { + if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) { smc_lgr_terminate_sched(lgr); return NULL; } @@ -975,24 +1051,26 @@ again: read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); - if (rc) { - smcr_link_down_cond_sched(to_lnk); - return NULL; - } + if (rc) + goto err_out; /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); - if (rc) { - smcr_link_down_cond_sched(to_lnk); - return NULL; - } + if (rc) + goto err_out; goto again; } read_unlock_bh(&lgr->conns_lock); + smc_wr_tx_link_put(to_lnk); return to_lnk; + +err_out: + smcr_link_down_cond_sched(to_lnk); + smc_wr_tx_link_put(to_lnk); + return NULL; } static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, @@ -1224,6 +1302,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { + smc_wr_free_lgr_mem(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } @@ -1468,7 +1547,9 @@ static void smc_conn_abort_work(struct work_struct *work) abort_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + lock_sock(&smc->sk); smc_conn_kill(conn, true); + release_sock(&smc->sk); sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ } @@ -1632,13 +1713,15 @@ out: return rc; } -static bool smcr_lgr_match(struct smc_link_group *lgr, - struct smc_clc_msg_local *lcl, +static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, + u8 peer_systemid[], + u8 peer_gid[], + u8 peer_mac_v1[], enum smc_lgr_role role, u32 clcqpn) { int i; - if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) || + if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || lgr->role != role) return false; @@ -1646,8 +1729,9 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, if (!smc_link_active(&lgr->lnk[i])) continue; if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && - !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) && - !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac))) + !memcmp(lgr->lnk[i].peer_gid, peer_gid, SMC_GID_SIZE) && + (smcr_version == SMC_V2 || + !memcmp(lgr->lnk[i].peer_mac, peer_mac_v1, ETH_ALEN))) return true; } return false; @@ 
-1686,7 +1770,10 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected], ini->ism_peer_gid[ini->ism_selected]) : - smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && + smcr_lgr_match(lgr, ini->smcr_version, + ini->peer_systemid, + ini->peer_gid, ini->peer_mac, role, + ini->ib_clcqpn)) && !lgr->sync_err && (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 83d30b06016f..59cef3b830d8 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -42,11 +42,16 @@ enum smc_link_state { /* possible states of a link */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ +#define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ struct smc_wr_buf { u8 raw[SMC_WR_BUF_SIZE]; }; +struct smc_wr_v2_buf { + u8 raw[SMC_WR_BUF_V2_SIZE]; +}; + #define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */ enum smc_wr_reg_state { @@ -92,7 +97,11 @@ struct smc_link { struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ + struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */ + struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/ + struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ + dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/ atomic_long_t wr_tx_id; /* seq # of last sent WR */ unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ @@ -104,6 +113,7 @@ struct smc_link { struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ + dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ @@ -208,6 +218,7 @@ enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, SMC_LLC_FLOW_DEL_LINK = 4, + SMC_LLC_FLOW_REQ_ADD_LINK = 5, SMC_LLC_FLOW_RKEY = 6, }; @@ -250,6 +261,10 @@ struct smc_link_group { /* client or server */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ + struct smc_wr_v2_buf *wr_rx_buf_v2; + /* WR v2 recv payload buffer */ + struct smc_wr_v2_buf *wr_tx_buf_v2; + /* WR v2 send payload buffer */ char peer_systemid[SMC_SYSTEMID_LEN]; /* unique system_id of peer */ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] @@ -288,6 +303,9 @@ struct smc_link_group { /* link keep alive time */ u32 llc_termination_rsn; /* rsn code for termination */ + u8 nexthop_mac[ETH_ALEN]; + u8 uses_gateway; + __be32 saddr; }; struct { /* SMC-D */ u64 peer_gid; @@ -302,6 +320,31 @@ struct smc_link_group { struct smc_clc_msg_local; +#define GID_LIST_SIZE 2 + +struct smc_gidlist { + u8 len; + u8 list[GID_LIST_SIZE][SMC_GID_SIZE]; +}; + +struct smc_init_info_smcrv2 { + /* Input fields */ + __be32 saddr; + struct sock *clc_sk; + __be32 daddr; + + /* Output fields when saddr is set */ + struct smc_ib_device *ib_dev_v2; + u8 ib_port_v2; + u8 ib_gid_v2[SMC_GID_SIZE]; + + /* Additional output fields when clc_sk and daddr is set as well */ + u8 uses_gateway; + u8 nexthop_mac[ETH_ALEN]; + + struct smc_gidlist gidlist; +}; + struct 
smc_init_info { u8 is_smcd; u8 smc_type_v1; @@ -312,11 +355,16 @@ struct smc_init_info { u32 rc; u8 negotiated_eid[SMC_MAX_EID_LEN]; /* SMC-R */ - struct smc_clc_msg_local *ib_lcl; + u8 smcr_version; + u8 check_smcrv2; + u8 peer_gid[SMC_GID_SIZE]; + u8 peer_mac[ETH_ALEN]; + u8 peer_systemid[SMC_SYSTEMID_LEN]; struct smc_ib_device *ib_dev; u8 ib_gid[SMC_GID_SIZE]; u8 ib_port; u32 ib_clcqpn; + struct smc_init_info_smcrv2 smcrv2; /* SMC-D */ u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1]; struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1]; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index a8845343d183..d93055ec17ae 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -17,6 +17,7 @@ #include <linux/scatterlist.h> #include <linux/wait.h> #include <linux/mutex.h> +#include <linux/inetdevice.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> @@ -62,16 +63,23 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk) IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; struct ib_qp_attr qp_attr; + u8 hop_lim = 1; memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.qp_state = IB_QPS_RTR; qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); - rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0); + if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) + hop_lim = IPV6_DEFAULT_HOPLIMIT; + rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0); rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); - memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, - sizeof(lnk->peer_mac)); + if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) + memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac, + sizeof(lnk->lgr->nexthop_mac)); + else + memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, + sizeof(lnk->peer_mac)); qp_attr.dest_qp_num = lnk->peer_qpn; qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */ qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming @@ -183,9 +191,81 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; } +int smc_ib_find_route(__be32 saddr, __be32 daddr, + u8 nexthop_mac[], u8 *uses_gateway) +{ + struct neighbour *neigh = NULL; + struct rtable *rt = NULL; + struct flowi4 fl4 = { + .saddr = saddr, + .daddr = daddr + }; + + if (daddr == cpu_to_be32(INADDR_NONE)) + goto out; + rt = ip_route_output_flow(&init_net, &fl4, NULL); + if (IS_ERR(rt)) + goto out; + if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET) + goto out; + neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr); + if (neigh) { + memcpy(nexthop_mac, neigh->ha, ETH_ALEN); + *uses_gateway = rt->rt_uses_gateway; + return 0; + } +out: + return -ENOENT; +} + +static int smc_ib_determine_gid_rcu(const struct net_device *ndev, + const struct ib_gid_attr *attr, + u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2) +{ + if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) { + if (gid) + memcpy(gid, &attr->gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = attr->index; + return 0; + } + if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) { + struct in_device *in_dev = __in_dev_get_rcu(ndev); + const struct in_ifaddr *ifa; + bool subnet_match = false; + + if (!in_dev) + goto out; + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if 
(!inet_ifa_match(smcrv2->saddr, ifa)) + continue; + subnet_match = true; + break; + } + if (!subnet_match) + goto out; + if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr, + smcrv2->daddr, + smcrv2->nexthop_mac, + &smcrv2->uses_gateway)) + goto out; + + if (gid) + memcpy(gid, &attr->gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = attr->index; + return 0; + } +out: + return -ENODEV; +} + /* determine the gid for an ib-device port and vlan id */ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, - unsigned short vlan_id, u8 gid[], u8 *sgid_index) + unsigned short vlan_id, u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2) { const struct ib_gid_attr *attr; const struct net_device *ndev; @@ -201,15 +281,13 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, if (!IS_ERR(ndev) && ((!vlan_id && !is_vlan_dev(ndev)) || (vlan_id && is_vlan_dev(ndev) && - vlan_dev_vlan_id(ndev) == vlan_id)) && - attr->gid_type == IB_GID_TYPE_ROCE) { - rcu_read_unlock(); - if (gid) - memcpy(gid, &attr->gid, SMC_GID_SIZE); - if (sgid_index) - *sgid_index = attr->index; - rdma_put_gid_attr(attr); - return 0; + vlan_dev_vlan_id(ndev) == vlan_id))) { + if (!smc_ib_determine_gid_rcu(ndev, attr, gid, + sgid_index, smcrv2)) { + rcu_read_unlock(); + rdma_put_gid_attr(attr); + return 0; + } } rcu_read_unlock(); rdma_put_gid_attr(attr); @@ -217,6 +295,58 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, return -ENODEV; } +/* check if gid is still defined on smcibdev */ +static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2, + struct smc_ib_device *smcibdev, u8 ibport) +{ + const struct ib_gid_attr *attr; + bool rc = false; + int i; + + for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { + attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i); + if (IS_ERR(attr)) + continue; + + rcu_read_lock(); + if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) || + (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + !(ipv6_addr_type((const struct in6_addr *)&attr->gid) + & IPV6_ADDR_LINKLOCAL))) + if (!memcmp(gid, &attr->gid, SMC_GID_SIZE)) + rc = true; + rcu_read_unlock(); + rdma_put_gid_attr(attr); + } + return rc; +} + +/* check all links if the gid is still defined on smcibdev */ +static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr; + int i; + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry(lgr, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED || + lgr->lnk[i].smcibdev != smcibdev) + continue; + if (!smc_ib_check_link_gid(lgr->lnk[i].gid, + lgr->smc_version == SMC_V2, + smcibdev, ibport)) + smcr_port_err(smcibdev, ibport); + } + } + spin_unlock_bh(&smc_lgr_list.lock); +} + static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) { int rc; @@ -255,6 +385,7 @@ static void smc_ib_port_event_work(struct work_struct *work) } else { clear_bit(port_idx, smcibdev->ports_going_away); smcr_port_add(smcibdev, port_idx + 1); + smc_ib_gid_check(smcibdev, port_idx + 1); } } } @@ -523,6 +654,7 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, @@ -536,7 +668,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_send_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, - .max_recv_sge = 1, + .max_recv_sge = sges_per_buf, }, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 3085f5180da7..07585937370e 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -59,6 +59,17 @@ struct smc_ib_device { /* ib-device infos for smc */ int ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */ }; +static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE]) +{ + struct in6_addr *addr6 = (struct in6_addr *)gid; + + if (ipv6_addr_v4mapped(addr6) || + !(addr6->s6_addr32[0] | addr6->s6_addr32[1] | addr6->s6_addr32[2])) + return addr6->s6_addr32[3]; + return cpu_to_be32(INADDR_NONE); +} + +struct smc_init_info_smcrv2; struct smc_buf_desc; struct smc_link; @@ -90,7 +101,10 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, - unsigned short vlan_id, u8 gid[], u8 *sgid_index); + unsigned short vlan_id, u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2); +int smc_ib_find_route(__be32 saddr, __be32 daddr, + u8 nexthop_mac[], u8 *uses_gateway); bool smc_ib_is_valid_local_systemid(void); int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); #endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 2e7560eba981..a9623c952007 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -23,16 +23,24 @@ struct smc_llc_hdr { struct smc_wr_rx_hdr common; - u8 length; /* 44 */ -#if defined(__BIG_ENDIAN_BITFIELD) - u8 reserved:4, - add_link_rej_rsn:4; + union { + struct { + u8 length; /* 44 */ + #if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:4, + add_link_rej_rsn:4; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 add_link_rej_rsn:4, - reserved:4; + u8 add_link_rej_rsn:4, + reserved:4; #endif + }; + u16 length_v2; /* 44 - 8192*/ + }; u8 flags; -}; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ #define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03 @@ -76,6 +84,32 @@ struct smc_llc_msg_add_link_cont_rt { __be64 rmb_vaddr_new; }; +struct smc_llc_msg_add_link_v2_ext { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 v2_direct : 1, + reserved : 7; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 7, + v2_direct : 1; +#endif + u8 reserved2; + u8 client_target_gid[SMC_GID_SIZE]; + u8 reserved3[8]; + u16 num_rkeys; + struct smc_llc_msg_add_link_cont_rt rt[]; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ + +struct smc_llc_msg_req_add_link_v2 { + struct smc_llc_hdr hd; + u8 reserved[20]; + u8 gid_cnt; + u8 reserved2[3]; + u8 gid[][SMC_GID_SIZE]; +}; + #define SMC_LLC_RKEYS_PER_CONT_MSG 2 struct smc_llc_msg_add_link_cont { /* type 0x03 */ @@ -114,7 +148,8 @@ struct smc_rmb_rtoken { __be64 rmb_vaddr; } __packed; /* format defined in RFC7609 */ -#define SMC_LLC_RKEYS_PER_MSG 3 +#define SMC_LLC_RKEYS_PER_MSG 3 +#define SMC_LLC_RKEYS_PER_MSG_V2 255 struct smc_llc_msg_confirm_rkey { /* type 0x06 */ struct smc_llc_hdr hd; @@ -135,9 +170,18 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; +struct 
smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ + struct smc_llc_hdr hd; + u8 num_rkeys; + u8 num_inval_rkeys; + u8 reserved[2]; + __be32 rkey[]; +}; + union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_req_add_link_v2 req_add_link; struct smc_llc_msg_add_link_cont add_link_cont; struct smc_llc_msg_del_link delete_link; @@ -189,7 +233,7 @@ static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type, struct smc_llc_qentry *qentry) { - u8 msg_type = qentry->msg.raw.hdr.common.type; + u8 msg_type = qentry->msg.raw.hdr.common.llc_type; if ((msg_type == SMC_LLC_ADD_LINK || msg_type == SMC_LLC_DELETE_LINK) && flow_type != msg_type && !lgr->delayed_event) { @@ -219,7 +263,7 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow, spin_unlock_bh(&lgr->llc_flow_lock); return false; } - switch (qentry->msg.raw.hdr.common.type) { + switch (qentry->msg.raw.hdr.common.llc_type) { case SMC_LLC_ADD_LINK: flow->type = SMC_LLC_FLOW_ADD_LINK; break; @@ -306,7 +350,7 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, smc_llc_flow_qentry_del(flow); goto out; } - rcv_msg = flow->qentry->msg.raw.hdr.common.type; + rcv_msg = flow->qentry->msg.raw.hdr.common.llc_type; if (exp_msg && rcv_msg != exp_msg) { if (exp_msg == SMC_LLC_ADD_LINK && rcv_msg == SMC_LLC_DELETE_LINK) { @@ -374,6 +418,30 @@ static int smc_llc_add_pending_send(struct smc_link *link, return 0; } +static int smc_llc_add_pending_send_v2(struct smc_link *link, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **pend) +{ + int rc; + + rc = smc_wr_tx_get_v2_slot(link, smc_llc_tx_handler, wr_buf, pend); + if (rc < 0) + return rc; + return 0; +} + +static void smc_llc_init_msg_hdr(struct smc_llc_hdr *hdr, + struct smc_link_group *lgr, size_t len) +{ + if (lgr->smc_version == SMC_V2) { + hdr->common.llc_version = SMC_V2; + hdr->length_v2 = len; + } else { + hdr->common.llc_version = 0; + hdr->length = len; + } +} + /* high-level API to send LLC confirm link */ int smc_llc_send_confirm_link(struct smc_link *link, enum smc_llc_reqresp reqresp) @@ -383,13 +451,15 @@ int smc_llc_send_confirm_link(struct smc_link *link, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; confllc = (struct smc_llc_msg_confirm_link *)wr_buf; memset(confllc, 0, sizeof(*confllc)); - confllc->hd.common.type = SMC_LLC_CONFIRM_LINK; - confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link); + confllc->hd.common.llc_type = SMC_LLC_CONFIRM_LINK; + smc_llc_init_msg_hdr(&confllc->hd, link->lgr, sizeof(*confllc)); confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC; if (reqresp == SMC_LLC_RESP) confllc->hd.flags |= SMC_LLC_FLAG_RESP; @@ -402,6 +472,8 @@ int smc_llc_send_confirm_link(struct smc_link *link, confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -415,13 +487,15 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_link *link; int i, rc, rtok_ix; + if (!smc_wr_tx_link_hold(send_link)) + return -ENOLINK; rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); - rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; - 
rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + rkeyllc->hd.common.llc_type = SMC_LLC_CONFIRM_RKEY; + smc_llc_init_msg_hdr(&rkeyllc->hd, send_link->lgr, sizeof(*rkeyllc)); rtok_ix = 1; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { @@ -444,6 +518,8 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(send_link, pend); +put_out: + smc_wr_tx_link_put(send_link); return rc; } @@ -456,38 +532,134 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); - rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; - rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); + rkeyllc->hd.common.llc_type = SMC_LLC_DELETE_RKEY; + smc_llc_init_msg_hdr(&rkeyllc->hd, link->lgr, sizeof(*rkeyllc)); rkeyllc->num_rkeys = 1; rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } +/* return first buffer from any of the next buf lists */ +static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + struct smc_buf_desc *buf_pos; + + while (*buf_lst < SMC_RMBE_SIZES) { + buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], + struct smc_buf_desc, list); + if (buf_pos) + return buf_pos; + (*buf_lst)++; + } + return NULL; +} + +/* return next rmb from buffer lists */ +static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst, + struct smc_buf_desc *buf_pos) +{ + struct smc_buf_desc *buf_next; + + if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + (*buf_lst)++; + return _smc_llc_get_next_rmb(lgr, buf_lst); + } + buf_next = list_next_entry(buf_pos, list); + return buf_next; +} + +static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + *buf_lst = 0; + return smc_llc_get_next_rmb(lgr, buf_lst, NULL); +} + +static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, + struct smc_link *link, struct smc_link *link_new) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_buf_desc *buf_pos; + int prim_lnk_idx, lnk_idx, i; + struct smc_buf_desc *rmb; + int len = sizeof(*ext); + int buf_lst; + + ext->v2_direct = !lgr->uses_gateway; + memcpy(ext->client_target_gid, link_new->gid, SMC_GID_SIZE); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + mutex_lock(&lgr->rmbs_lock); + ext->num_rkeys = lgr->conns_num; + if (!ext->num_rkeys) + goto out; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + for (i = 0; i < ext->num_rkeys; i++) { + if (!buf_pos) + break; + rmb = buf_pos; + ext->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); + ext->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); + ext->rt[i].rmb_vaddr_new = + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); + while (buf_pos && !(buf_pos)->used) + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); + } + len += i * sizeof(ext->rt[0]); +out: + mutex_unlock(&lgr->rmbs_lock); + return len; +} + /* send ADD LINK request or response */ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], struct smc_link 
*link_new, enum smc_llc_reqresp reqresp) { + struct smc_llc_msg_add_link_v2_ext *ext = NULL; struct smc_llc_msg_add_link *addllc; struct smc_wr_tx_pend_priv *pend; - struct smc_wr_buf *wr_buf; + int len = sizeof(*addllc); int rc; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); - if (rc) - return rc; - addllc = (struct smc_llc_msg_add_link *)wr_buf; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + if (link->lgr->smc_version == SMC_V2) { + struct smc_wr_v2_buf *wr_buf; + + rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend); + if (rc) + goto put_out; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + ext = (struct smc_llc_msg_add_link_v2_ext *) + &wr_buf->raw[sizeof(*addllc)]; + memset(ext, 0, SMC_WR_TX_SIZE); + } else { + struct smc_wr_buf *wr_buf; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + } memset(addllc, 0, sizeof(*addllc)); - addllc->hd.common.type = SMC_LLC_ADD_LINK; - addllc->hd.length = sizeof(struct smc_llc_msg_add_link); + addllc->hd.common.llc_type = SMC_LLC_ADD_LINK; if (reqresp == SMC_LLC_RESP) addllc->hd.flags |= SMC_LLC_FLAG_RESP; memcpy(addllc->sender_mac, mac, ETH_ALEN); @@ -502,8 +674,16 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], addllc->qp_mtu = min(link_new->path_mtu, link_new->peer_mtu); } + if (ext && link_new) + len += smc_llc_fill_ext_v2(ext, link, link_new); + smc_llc_init_msg_hdr(&addllc->hd, link->lgr, len); /* send llc message */ - rc = smc_wr_tx_send(link, pend); + if (link->lgr->smc_version == SMC_V2) + rc = smc_wr_tx_v2_send(link, pend, len); + else + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -517,14 +697,16 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; delllc = (struct smc_llc_msg_del_link *)wr_buf; memset(delllc, 0, sizeof(*delllc)); - delllc->hd.common.type = SMC_LLC_DELETE_LINK; - delllc->hd.length = sizeof(struct smc_llc_msg_del_link); + delllc->hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&delllc->hd, link->lgr, sizeof(*delllc)); if (reqresp == SMC_LLC_RESP) delllc->hd.flags |= SMC_LLC_FLAG_RESP; if (orderly) @@ -536,6 +718,8 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, delllc->reason = htonl(reason); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -547,16 +731,20 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; testllc = (struct smc_llc_msg_test_link *)wr_buf; memset(testllc, 0, sizeof(*testllc)); - testllc->hd.common.type = SMC_LLC_TEST_LINK; - testllc->hd.length = sizeof(struct smc_llc_msg_test_link); + testllc->hd.common.llc_type = SMC_LLC_TEST_LINK; + smc_llc_init_msg_hdr(&testllc->hd, link->lgr, sizeof(*testllc)); memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -567,13 +755,16 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(link)) + if 
(!smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); - return smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } /* schedule an llc send on link, may wait for buffers, @@ -586,13 +777,16 @@ static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(link)) + if (!smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); - return smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); + rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); +put_out: + smc_wr_tx_link_put(link); + return rc; } /********************************* receive ***********************************/ @@ -621,44 +815,6 @@ static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, return -EMLINK; } -/* return first buffer from any of the next buf lists */ -static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, - int *buf_lst) -{ - struct smc_buf_desc *buf_pos; - - while (*buf_lst < SMC_RMBE_SIZES) { - buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], - struct smc_buf_desc, list); - if (buf_pos) - return buf_pos; - (*buf_lst)++; - } - return NULL; -} - -/* return next rmb from buffer lists */ -static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, - int *buf_lst, - struct smc_buf_desc *buf_pos) -{ - struct smc_buf_desc *buf_next; - - if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { - (*buf_lst)++; - return _smc_llc_get_next_rmb(lgr, buf_lst); - } - buf_next = list_next_entry(buf_pos, list); - return buf_next; -} - -static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, - int *buf_lst) -{ - *buf_lst = 0; - return smc_llc_get_next_rmb(lgr, buf_lst, NULL); -} - /* send one add_link_continue msg */ static int smc_llc_add_link_cont(struct smc_link *link, struct smc_link *link_new, u8 *num_rkeys_todo, @@ -672,9 +828,11 @@ static int smc_llc_add_link_cont(struct smc_link *link, struct smc_buf_desc *rmb; u8 n; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; memset(addc_llc, 0, sizeof(*addc_llc)); @@ -702,11 +860,14 @@ static int smc_llc_add_link_cont(struct smc_link *link, while (*buf_pos && !(*buf_pos)->used) *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); } - addc_llc->hd.common.type = SMC_LLC_ADD_LINK_CONT; + addc_llc->hd.common.llc_type = SMC_LLC_ADD_LINK_CONT; addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); if (lgr->role == SMC_CLNT) addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; - return smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } static int smc_llc_cli_rkey_exchange(struct smc_link *link, @@ -758,6 +919,8 @@ static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr, + sizeof(qentry->msg)); return smc_llc_send_message(qentry->link, &qentry->msg); } @@ -778,7 
+941,7 @@ static int smc_llc_cli_conf_link(struct smc_link *link, SMC_LLC_DEL_LOST_PATH); return -ENOLINK; } - if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { + if (qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) { /* received DELETE_LINK instead */ qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, &qentry->msg); @@ -819,6 +982,26 @@ static int smc_llc_cli_conf_link(struct smc_link *link, return 0; } +static void smc_llc_save_add_link_rkeys(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_v2_ext *ext; + struct smc_link_group *lgr = link->lgr; + int max, i; + + ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + + SMC_WR_TX_SIZE); + max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); + mutex_lock(&lgr->rmbs_lock); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + ext->rt[i].rmb_key, + ext->rt[i].rmb_vaddr_new, + ext->rt[i].rmb_key_new); + } + mutex_unlock(&lgr->rmbs_lock); +} + static void smc_llc_save_add_link_info(struct smc_link *link, struct smc_llc_msg_add_link *add_llc) { @@ -835,31 +1018,47 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) struct smc_llc_msg_add_link *llc = &qentry->msg.add_link; enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_init_info *ini = NULL; struct smc_link *lnk_new = NULL; - struct smc_init_info ini; int lnk_idx, rc = 0; if (!llc->qp_mtu) goto out_reject; - ini.vlan_id = lgr->vlan_id; - smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = -ENOMEM; + goto out_reject; + } + + ini->vlan_id = lgr->vlan_id; + if (lgr->smc_version == SMC_V2) { + ini->check_smcrv2 = true; + ini->smcrv2.saddr = lgr->saddr; + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(llc->sender_gid); + } + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && - !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) { - if (!ini.ib_dev) + (lgr->smc_version == SMC_V2 || + !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN))) { + if (!ini->ib_dev && !ini->smcrv2.ib_dev_v2) goto out_reject; lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; } - if (!ini.ib_dev) { + if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) { lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; - ini.ib_dev = link->smcibdev; - ini.ib_port = link->ibport; + ini->smcrv2.ib_dev_v2 = link->smcibdev; + ini->smcrv2.ib_port_v2 = link->ibport; + } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->ib_dev = link->smcibdev; + ini->ib_port = link->ibport; } lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); if (lnk_idx < 0) goto out_reject; lnk_new = &lgr->lnk[lnk_idx]; - rc = smcr_link_init(lgr, lnk_new, lnk_idx, &ini); + rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini); if (rc) goto out_reject; smc_llc_save_add_link_info(lnk_new, llc); @@ -875,16 +1074,20 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_clear_lnk; rc = smc_llc_send_add_link(link, - lnk_new->smcibdev->mac[ini.ib_port - 1], + lnk_new->smcibdev->mac[lnk_new->ibport - 1], lnk_new->gid, lnk_new, SMC_LLC_RESP); if (rc) goto out_clear_lnk; - rc = smc_llc_cli_rkey_exchange(link, lnk_new); - if (rc) { - rc = 0; - goto out_clear_lnk; + if (lgr->smc_version == SMC_V2) { + smc_llc_save_add_link_rkeys(link, lnk_new); + } else { + rc = 
smc_llc_cli_rkey_exchange(link, lnk_new); + if (rc) { + rc = 0; + goto out_clear_lnk; + } } - rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); + rc = smc_llc_cli_conf_link(link, ini, lnk_new, lgr_new_t); if (!rc) goto out; out_clear_lnk: @@ -893,29 +1096,78 @@ out_clear_lnk: out_reject: smc_llc_cli_add_link_reject(qentry); out: + kfree(ini); kfree(qentry); return rc; } +static void smc_llc_send_request_add_link(struct smc_link *link) +{ + struct smc_llc_msg_req_add_link_v2 *llc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_v2_buf *wr_buf; + struct smc_gidlist gidlist; + int rc, len, i; + + if (!smc_wr_tx_link_hold(link)) + return; + if (link->lgr->type == SMC_LGR_SYMMETRIC || + link->lgr->type == SMC_LGR_ASYMMETRIC_PEER) + goto put_out; + + smc_fill_gid_list(link->lgr, &gidlist, link->smcibdev, link->gid); + if (gidlist.len <= 1) + goto put_out; + + rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend); + if (rc) + goto put_out; + llc = (struct smc_llc_msg_req_add_link_v2 *)wr_buf; + memset(llc, 0, SMC_WR_TX_SIZE); + + llc->hd.common.llc_type = SMC_LLC_REQ_ADD_LINK; + for (i = 0; i < gidlist.len; i++) + memcpy(llc->gid[i], gidlist.list[i], sizeof(gidlist.list[0])); + llc->gid_cnt = gidlist.len; + len = sizeof(*llc) + (gidlist.len * sizeof(gidlist.list[0])); + smc_llc_init_msg_hdr(&llc->hd, link->lgr, len); + rc = smc_wr_tx_v2_send(link, pend, len); + if (!rc) + /* set REQ_ADD_LINK flow and wait for response from peer */ + link->lgr->llc_flow_lcl.type = SMC_LLC_FLOW_REQ_ADD_LINK; +put_out: + smc_wr_tx_link_put(link); +} + /* as an SMC client, invite server to start the add_link processing */ static void smc_llc_cli_add_link_invite(struct smc_link *link, struct smc_llc_qentry *qentry) { struct smc_link_group *lgr = smc_get_lgr(link); - struct smc_init_info ini; + struct smc_init_info *ini = NULL; + + if (lgr->smc_version == SMC_V2) { + smc_llc_send_request_add_link(link); + goto out; + } if (lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; - ini.vlan_id = lgr->vlan_id; - smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); - if (!ini.ib_dev) + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) goto out; - smc_llc_send_add_link(link, ini.ib_dev->mac[ini.ib_port - 1], - ini.ib_gid, NULL, SMC_LLC_REQ); + ini->vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (!ini->ib_dev) + goto out; + + smc_llc_send_add_link(link, ini->ib_dev->mac[ini->ib_port - 1], + ini->ib_gid, NULL, SMC_LLC_REQ); out: + kfree(ini); kfree(qentry); } @@ -931,7 +1183,7 @@ static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc) static bool smc_llc_is_local_add_link(union smc_llc_msg *llc) { - if (llc->raw.hdr.common.type == SMC_LLC_ADD_LINK && + if (llc->raw.hdr.common.llc_type == SMC_LLC_ADD_LINK && smc_llc_is_empty_llc_message(llc)) return true; return false; @@ -1098,7 +1350,7 @@ static int smc_llc_srv_conf_link(struct smc_link *link, /* receive CONFIRM LINK response over the RoCE fabric */ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0); if (!qentry || - qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { + qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) { /* send DELETE LINK */ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, false, SMC_LLC_DEL_LOST_PATH); @@ -1117,37 +1369,80 @@ static int smc_llc_srv_conf_link(struct smc_link *link, return 0; } -int smc_llc_srv_add_link(struct smc_link *link) +static void smc_llc_send_req_add_link_response(struct smc_llc_qentry *qentry) +{ + 
qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr, + sizeof(qentry->msg)); + memset(&qentry->msg.raw.data, 0, sizeof(qentry->msg.raw.data)); + smc_llc_send_message(qentry->link, &qentry->msg); +} + +int smc_llc_srv_add_link(struct smc_link *link, + struct smc_llc_qentry *req_qentry) { enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; struct smc_link_group *lgr = link->lgr; struct smc_llc_msg_add_link *add_llc; struct smc_llc_qentry *qentry = NULL; - struct smc_link *link_new; - struct smc_init_info ini; + bool send_req_add_link_resp = false; + struct smc_link *link_new = NULL; + struct smc_init_info *ini = NULL; int lnk_idx, rc = 0; + if (req_qentry && + req_qentry->msg.raw.hdr.common.llc_type == SMC_LLC_REQ_ADD_LINK) + send_req_add_link_resp = true; + + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = -ENOMEM; + goto out; + } + /* ignore client add link recommendation, start new flow */ - ini.vlan_id = lgr->vlan_id; - smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); - if (!ini.ib_dev) { + ini->vlan_id = lgr->vlan_id; + if (lgr->smc_version == SMC_V2) { + ini->check_smcrv2 = true; + ini->smcrv2.saddr = lgr->saddr; + if (send_req_add_link_resp) { + struct smc_llc_msg_req_add_link_v2 *req_add = + &req_qentry->msg.req_add_link; + + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(req_add->gid[0]); + } + } + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->smcrv2.ib_dev_v2 = link->smcibdev; + ini->smcrv2.ib_port_v2 = link->ibport; + } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) { lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; - ini.ib_dev = link->smcibdev; - ini.ib_port = link->ibport; + ini->ib_dev = link->smcibdev; + ini->ib_port = link->ibport; } lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); - if (lnk_idx < 0) - return 0; + if (lnk_idx < 0) { + rc = 0; + goto out; + } - rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, &ini); + rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini); if (rc) - return rc; + goto out; link_new = &lgr->lnk[lnk_idx]; + + rc = smcr_buf_map_lgr(link_new); + if (rc) + goto out_err; + rc = smc_llc_send_add_link(link, - link_new->smcibdev->mac[ini.ib_port - 1], + link_new->smcibdev->mac[link_new->ibport-1], link_new->gid, link_new, SMC_LLC_REQ); if (rc) goto out_err; + send_req_add_link_resp = false; /* receive ADD LINK response over the RoCE fabric */ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); if (!qentry) { @@ -1162,7 +1457,8 @@ int smc_llc_srv_add_link(struct smc_link *link) } if (lgr->type == SMC_LGR_SINGLE && (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && - !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN))) { + (lgr->smc_version == SMC_V2 || + !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN)))) { lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; } smc_llc_save_add_link_info(link_new, add_llc); @@ -1171,39 +1467,49 @@ int smc_llc_srv_add_link(struct smc_link *link) rc = smc_ib_ready_link(link_new); if (rc) goto out_err; - rc = smcr_buf_map_lgr(link_new); - if (rc) - goto out_err; rc = smcr_buf_reg_lgr(link_new); if (rc) goto out_err; - rc = smc_llc_srv_rkey_exchange(link, link_new); - if (rc) - goto out_err; + if (lgr->smc_version == SMC_V2) { + smc_llc_save_add_link_rkeys(link, link_new); + } else { + rc = smc_llc_srv_rkey_exchange(link, link_new); + if (rc) + goto out_err; + } rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); if 
(rc) goto out_err; + kfree(ini); return 0; out_err: - link_new->state = SMC_LNK_INACTIVE; - smcr_link_clear(link_new, false); + if (link_new) { + link_new->state = SMC_LNK_INACTIVE; + smcr_link_clear(link_new, false); + } +out: + kfree(ini); + if (send_req_add_link_resp) + smc_llc_send_req_add_link_response(req_qentry); return rc; } static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) { struct smc_link *link = lgr->llc_flow_lcl.qentry->link; + struct smc_llc_qentry *qentry; int rc; - smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); mutex_lock(&lgr->llc_conf_mutex); - rc = smc_llc_srv_add_link(link); + rc = smc_llc_srv_add_link(link, qentry); if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { /* delete any asymmetric link */ smc_llc_delete_asym_link(lgr); } mutex_unlock(&lgr->llc_conf_mutex); + kfree(qentry); } /* enqueue a local add_link req to trigger a new add_link flow */ @@ -1211,8 +1517,8 @@ void smc_llc_add_link_local(struct smc_link *link) { struct smc_llc_msg_add_link add_llc = {}; - add_llc.hd.length = sizeof(add_llc); - add_llc.hd.common.type = SMC_LLC_ADD_LINK; + add_llc.hd.common.llc_type = SMC_LLC_ADD_LINK; + smc_llc_init_msg_hdr(&add_llc.hd, link->lgr, sizeof(add_llc)); /* no dev and port needed */ smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc); } @@ -1234,7 +1540,8 @@ static void smc_llc_add_link_work(struct work_struct *work) else smc_llc_process_srv_add_link(lgr); out: - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_REQ_ADD_LINK) + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } /* enqueue a local del_link msg to trigger a new del_link flow, @@ -1244,8 +1551,8 @@ void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) { struct smc_llc_msg_del_link del_llc = {}; - del_llc.hd.length = sizeof(del_llc); - del_llc.hd.common.type = SMC_LLC_DELETE_LINK; + del_llc.hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&del_llc.hd, link->lgr, sizeof(del_llc)); del_llc.link_num = del_link_id; del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH); del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; @@ -1315,8 +1622,8 @@ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) struct smc_llc_msg_del_link delllc = {}; int i; - delllc.hd.common.type = SMC_LLC_DELETE_LINK; - delllc.hd.length = sizeof(delllc); + delllc.hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&delllc.hd, lgr, sizeof(delllc)); if (ord) delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; @@ -1432,6 +1739,8 @@ static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr) link = qentry->link; num_entries = llc->rtoken[0].num_rkeys; + if (num_entries > SMC_LLC_RKEYS_PER_MSG) + goto out_err; /* first rkey entry is for receiving link */ rk_idx = smc_rtoken_add(link, llc->rtoken[0].rmb_vaddr, @@ -1450,6 +1759,7 @@ out_err: llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY; out: llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc)); smc_llc_send_message(link, &qentry->msg); smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } @@ -1467,6 +1777,28 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) llc = &qentry->msg.delete_rkey; link = qentry->link; + if (lgr->smc_version == SMC_V2) { + struct smc_llc_msg_delete_rkey_v2 *llcv2; + + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + llcv2->num_inval_rkeys = 0; + + 
max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llcv2->rkey[i])) + llcv2->num_inval_rkeys++; + } + memset(&llc->rkey[0], 0, sizeof(llc->rkey)); + memset(&llc->reserved2, 0, sizeof(llc->reserved2)); + smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc)); + if (llcv2->num_inval_rkeys) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = llcv2->num_inval_rkeys; + } + goto finish; + } + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { if (smc_rtoken_delete(link, llc->rkey[i])) @@ -1476,6 +1808,7 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; llc->err_mask = err_mask; } +finish: llc->hd.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, &qentry->msg); smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); @@ -1511,7 +1844,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) if (!smc_link_usable(link)) goto out; - switch (llc->raw.hdr.common.type) { + switch (llc->raw.hdr.common.llc_type) { case SMC_LLC_TEST_LINK: llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, llc); @@ -1536,8 +1869,18 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); wake_up(&lgr->llc_msg_waiter); - } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, - qentry)) { + return; + } + if (lgr->llc_flow_lcl.type == + SMC_LLC_FLOW_REQ_ADD_LINK) { + /* server started add_link processing */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK; + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + schedule_work(&lgr->llc_add_link_work); + return; + } + if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { schedule_work(&lgr->llc_add_link_work); } } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { @@ -1585,6 +1928,23 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + case SMC_LLC_REQ_ADD_LINK: + /* handle response here, smc_llc_flow_stop() cannot be called + * in tasklet context + */ + if (lgr->role == SMC_CLNT && + lgr->llc_flow_lcl.type == SMC_LLC_FLOW_REQ_ADD_LINK && + (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP)) { + smc_llc_flow_stop(link->lgr, &lgr->llc_flow_lcl); + } else if (lgr->role == SMC_SERV) { + if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK; + schedule_work(&lgr->llc_add_link_work); + } + return; + } + break; default: smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); break; @@ -1628,7 +1988,7 @@ static void smc_llc_rx_response(struct smc_link *link, { enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type; struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl; - u8 llc_type = qentry->msg.raw.hdr.common.type; + u8 llc_type = qentry->msg.raw.hdr.common.llc_type; switch (llc_type) { case SMC_LLC_TEST_LINK: @@ -1654,7 +2014,8 @@ static void smc_llc_rx_response(struct smc_link *link, /* not used because max links is 3 */ break; default: - smc_llc_protocol_violation(link->lgr, llc_type); + smc_llc_protocol_violation(link->lgr, + qentry->msg.raw.hdr.common.type); break; } kfree(qentry); @@ -1679,7 +2040,8 @@ static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); /* process responses immediately */ - if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + if ((llc->raw.hdr.flags & 
SMC_LLC_FLAG_RESP) && + llc->raw.hdr.common.llc_type != SMC_LLC_REQ_ADD_LINK) { smc_llc_rx_response(link, qentry); return; } @@ -1699,8 +2061,13 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) if (wc->byte_len < sizeof(*llc)) return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ + if (!llc->raw.hdr.common.llc_version) { + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + } else { + if (llc->raw.hdr.length_v2 < sizeof(*llc)) + return; /* invalid message */ + } smc_llc_enqueue(link, llc); } @@ -1919,6 +2286,35 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, + /* V2 types */ + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_TEST_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_REQ_ADD_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_RKEY_V2 + }, { .handler = NULL, } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index cc00a2ec4e92..4404e52b3346 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -30,10 +30,19 @@ enum smc_llc_msg_type { SMC_LLC_ADD_LINK = 0x02, SMC_LLC_ADD_LINK_CONT = 0x03, SMC_LLC_DELETE_LINK = 0x04, + SMC_LLC_REQ_ADD_LINK = 0x05, SMC_LLC_CONFIRM_RKEY = 0x06, SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, + /* V2 types */ + SMC_LLC_CONFIRM_LINK_V2 = 0x21, + SMC_LLC_ADD_LINK_V2 = 0x22, + SMC_LLC_DELETE_LINK_V2 = 0x24, + SMC_LLC_REQ_ADD_LINK_V2 = 0x25, + SMC_LLC_CONFIRM_RKEY_V2 = 0x26, + SMC_LLC_TEST_LINK_V2 = 0x27, + SMC_LLC_DELETE_RKEY_V2 = 0x29, }; #define smc_link_downing(state) \ @@ -102,7 +111,8 @@ void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn); int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); -int smc_llc_srv_add_link(struct smc_link *link); +int smc_llc_srv_add_link(struct smc_link *link, + struct smc_llc_qentry *req_qentry); void smc_llc_add_link_local(struct smc_link *link); int smc_llc_init(void) __init; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 4a964e9190b0..67e9d9fde085 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -953,6 +953,26 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, return rc; } +static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, + struct smc_init_info *ini) +{ + if (!ini->check_smcrv2 && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL, + NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + return 0; + } + if (ini->check_smcrv2 && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2, + NULL, &ini->smcrv2)) { + ini->smcrv2.ib_dev_v2 = ibdev; + ini->smcrv2.ib_port_v2 = i; + return 0; + } + return -ENODEV; +} + /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, @@ -961,7 +981,6 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_ib_device *ibdev; int i; - ini->ib_dev = NULL; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (ibdev == 
known_dev) @@ -971,12 +990,9 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - goto out; + !test_bit(i - 1, ibdev->ports_going_away)) { + if (!smc_pnet_determine_gid(ibdev, i, ini)) + goto out; } } } @@ -1016,12 +1032,9 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - break; + !test_bit(i - 1, ibdev->ports_going_away)) { + if (!smc_pnet_determine_gid(ibdev, i, ini)) + break; } } } @@ -1083,8 +1096,6 @@ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - ini->ib_dev = NULL; - ini->ib_port = 0; if (!dst) goto out; if (!dst->dev) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index c79361dfcdfb..738a4a99c827 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -496,7 +496,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ -static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) +static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; struct smc_link *link = conn->lnk; @@ -505,8 +505,11 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) struct smc_wr_buf *wr_buf; int rc; + if (!link || !smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { + smc_wr_tx_link_put(link); if (rc == -EBUSY) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -547,22 +550,7 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) out_unlock: spin_unlock_bh(&conn->send_lock); - return rc; -} - -static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) -{ - struct smc_link *link = conn->lnk; - int rc = -ENOLINK; - - if (!link) - return rc; - - atomic_inc(&link->wr_tx_refcnt); - if (smc_link_usable(link)) - rc = _smcr_tx_sndbuf_nonempty(conn); - if (atomic_dec_and_test(&link->wr_tx_refcnt)) - wake_up_all(&link->wr_tx_wait); + smc_wr_tx_link_put(link); return rc; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index a419e9af36b9..600ab5889227 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -101,19 +101,32 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); - if (pnd_snd_idx == link->wr_tx_cnt) - return; - link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; - if (link->wr_tx_pends[pnd_snd_idx].compl_requested) - complete(&link->wr_tx_compl[pnd_snd_idx]); - memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd)); - /* clear the full struct smc_wr_tx_pend including .priv */ - memset(&link->wr_tx_pends[pnd_snd_idx], 0, - sizeof(link->wr_tx_pends[pnd_snd_idx])); - memset(&link->wr_tx_bufs[pnd_snd_idx], 0, - sizeof(link->wr_tx_bufs[pnd_snd_idx])); - if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) - return; + if (pnd_snd_idx == link->wr_tx_cnt) { + if (link->lgr->smc_version != 
SMC_V2 || + link->wr_tx_v2_pend->wr_id != wc->wr_id) + return; + link->wr_tx_v2_pend->wc_status = wc->status; + memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(link->wr_tx_v2_pend, 0, + sizeof(*link->wr_tx_v2_pend)); + memset(link->lgr->wr_tx_buf_v2, 0, + sizeof(*link->lgr->wr_tx_buf_v2)); + } else { + link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + if (link->wr_tx_pends[pnd_snd_idx].compl_requested) + complete(&link->wr_tx_compl[pnd_snd_idx]); + memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], + sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[pnd_snd_idx], 0, + sizeof(link->wr_tx_pends[pnd_snd_idx])); + memset(&link->wr_tx_bufs[pnd_snd_idx], 0, + sizeof(link->wr_tx_bufs[pnd_snd_idx])); + if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) + return; + } + if (wc->status) { for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { /* clear full struct smc_wr_tx_pend including .priv */ @@ -123,6 +136,12 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) sizeof(link->wr_tx_bufs[i])); clear_bit(i, link->wr_tx_mask); } + if (link->lgr->smc_version == SMC_V2) { + memset(link->wr_tx_v2_pend, 0, + sizeof(*link->wr_tx_v2_pend)); + memset(link->lgr->wr_tx_buf_v2, 0, + sizeof(*link->lgr->wr_tx_buf_v2)); + } /* terminate link */ smcr_link_down_cond_sched(link); } @@ -239,6 +258,33 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, return 0; } +int smc_wr_tx_get_v2_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv) +{ + struct smc_wr_tx_pend *wr_pend; + struct ib_send_wr *wr_ib; + u64 wr_id; + + if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt) + return -EBUSY; + + *wr_buf = NULL; + *wr_pend_priv = NULL; + wr_id = smc_wr_tx_get_next_wr_id(link); + wr_pend = link->wr_tx_v2_pend; + wr_pend->wr_id = wr_id; + wr_pend->handler = handler; + wr_pend->link = link; + wr_pend->idx = link->wr_tx_cnt; + wr_ib = link->wr_tx_v2_ib; + wr_ib->wr_id = wr_id; + *wr_buf = link->lgr->wr_tx_buf_v2; + *wr_pend_priv = &wr_pend->priv; + return 0; +} + int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv) { @@ -256,6 +302,14 @@ int smc_wr_tx_put_slot(struct smc_link *link, test_and_clear_bit(idx, link->wr_tx_mask); wake_up(&link->wr_tx_wait); return 1; + } else if (link->lgr->smc_version == SMC_V2 && + pend->idx == link->wr_tx_cnt) { + /* Large v2 buffer */ + memset(&link->wr_tx_v2_pend, 0, + sizeof(link->wr_tx_v2_pend)); + memset(&link->lgr->wr_tx_buf_v2, 0, + sizeof(link->lgr->wr_tx_buf_v2)); + return 1; } return 0; @@ -280,6 +334,22 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) return rc; } +int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + int len) +{ + int rc; + + link->wr_tx_v2_ib->sg_list[0].length = len; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); + if (rc) { + smc_wr_tx_put_slot(link, priv); + smcr_link_down_cond_sched(link); + } + return rc; +} + /* Send prepared WR slot via ib_post_send and wait for send completion * notification. 
* @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer @@ -517,6 +587,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; u32 i; for (i = 0; i < lnk->wr_tx_cnt; i++) { @@ -545,14 +616,44 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list = lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge; } + + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr; + lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE; + lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey; + + lnk->wr_tx_v2_ib->next = NULL; + lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; + lnk->wr_tx_v2_ib->num_sge = 1; + lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; + lnk->wr_tx_v2_ib->send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. + * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer + * and the same buffer for all sges. When a larger message arrived then + * the content of the first small sge is copied to the beginning of + * the larger spillover buffer, allowing easy data mapping. + */ for (i = 0; i < lnk->wr_rx_cnt; i++) { - lnk->wr_rx_sges[i].addr = + int x = i * sges_per_buf; + + lnk->wr_rx_sges[x].addr = lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_sges[x + 1].addr = + lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].length = + SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].lkey = + lnk->roce_pd->local_dma_lkey; + } lnk->wr_rx_ibs[i].next = NULL; - lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i]; - lnk->wr_rx_ibs[i].num_sge = 1; + lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; + lnk->wr_rx_ibs[i].num_sge = sges_per_buf; } lnk->wr_reg.wr.next = NULL; lnk->wr_reg.wr.num_sge = 0; @@ -585,16 +686,45 @@ void smc_wr_free_link(struct smc_link *lnk) DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; } + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } if (lnk->wr_tx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, DMA_TO_DEVICE); lnk->wr_tx_dma_addr = 0; } + if (lnk->wr_tx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + lnk->wr_tx_v2_dma_addr = 0; + } +} + +void smc_wr_free_lgr_mem(struct smc_link_group *lgr) +{ + if (lgr->smc_version < SMC_V2) + return; + + kfree(lgr->wr_rx_buf_v2); + lgr->wr_rx_buf_v2 = NULL; + kfree(lgr->wr_tx_buf_v2); + lgr->wr_tx_buf_v2 = NULL; } void smc_wr_free_link_mem(struct smc_link *lnk) { + kfree(lnk->wr_tx_v2_ib); + lnk->wr_tx_v2_ib = NULL; + kfree(lnk->wr_tx_v2_sge); + lnk->wr_tx_v2_sge = NULL; + kfree(lnk->wr_tx_v2_pend); + lnk->wr_tx_v2_pend = NULL; kfree(lnk->wr_tx_compl); lnk->wr_tx_compl = NULL; kfree(lnk->wr_tx_pends); @@ -619,8 +749,26 @@ void smc_wr_free_link_mem(struct smc_link *lnk) lnk->wr_rx_bufs = NULL; } +int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) +{ + if (lgr->smc_version < SMC_V2) + return 0; + + lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); + if 
(!lgr->wr_rx_buf_v2) + return -ENOMEM; + lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); + if (!lgr->wr_tx_buf_v2) { + kfree(lgr->wr_rx_buf_v2); + return -ENOMEM; + } + return 0; +} + int smc_wr_alloc_link_mem(struct smc_link *link) { + int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; + /* allocate link related memory */ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) @@ -653,7 +801,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, - sizeof(link->wr_rx_sges[0]), + sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; @@ -672,8 +820,29 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_compl) goto no_mem_wr_tx_pends; + + if (link->lgr->smc_version == SMC_V2) { + link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib), + GFP_KERNEL); + if (!link->wr_tx_v2_ib) + goto no_mem_tx_compl; + link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge), + GFP_KERNEL); + if (!link->wr_tx_v2_sge) + goto no_mem_v2_ib; + link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend), + GFP_KERNEL); + if (!link->wr_tx_v2_pend) + goto no_mem_v2_sge; + } return 0; +no_mem_v2_sge: + kfree(link->wr_tx_v2_sge); +no_mem_v2_ib: + kfree(link->wr_tx_v2_ib); +no_mem_tx_compl: + kfree(link->wr_tx_compl); no_mem_wr_tx_pends: kfree(link->wr_tx_pends); no_mem_wr_tx_mask: @@ -725,6 +894,24 @@ int smc_wr_create_link(struct smc_link *lnk) rc = -EIO; goto out; } + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, + lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } + lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, + lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) { + lnk->wr_tx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } + } lnk->wr_tx_dma_addr = ib_dma_map_single( ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, DMA_TO_DEVICE); @@ -742,6 +929,18 @@ int smc_wr_create_link(struct smc_link *lnk) return rc; dma_unmap: + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } + if (lnk->wr_tx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + lnk->wr_tx_v2_dma_addr = 0; + } ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, DMA_FROM_DEVICE); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 423b8709f1c9..f353311e6f84 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -60,6 +60,20 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) atomic_long_set(wr_tx_id, val); } +static inline bool smc_wr_tx_link_hold(struct smc_link *link) +{ + if (!smc_link_usable(link)) + return false; + atomic_inc(&link->wr_tx_refcnt); + return true; +} + +static inline void smc_wr_tx_link_put(struct smc_link *link) +{ + if (atomic_dec_and_test(&link->wr_tx_refcnt)) + wake_up_all(&link->wr_tx_wait); +} + static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) { wake_up_all(&lnk->wr_tx_wait); @@ -87,8 +101,10 @@ static inline int smc_wr_rx_post(struct smc_link *link) int smc_wr_create_link(struct smc_link *lnk); 
int smc_wr_alloc_link_mem(struct smc_link *lnk); +int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr); void smc_wr_free_link(struct smc_link *lnk); void smc_wr_free_link_mem(struct smc_link *lnk); +void smc_wr_free_lgr_mem(struct smc_link_group *lgr); void smc_wr_remember_qp_attr(struct smc_link *lnk); void smc_wr_remove_dev(struct smc_ib_device *smcibdev); void smc_wr_add_dev(struct smc_ib_device *smcibdev); @@ -97,10 +113,16 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wrs, struct smc_wr_tx_pend_priv **wr_pend_priv); +int smc_wr_tx_get_v2_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv); int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_v2_send(struct smc_link *link, + struct smc_wr_tx_pend_priv *priv, int len); int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 3e776e3dff91..1f2817195549 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -645,7 +645,7 @@ static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, } __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; - } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) { + } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) { goto toolow; } if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 443f8e5b9477..60bc74b76adc 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -462,7 +462,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, b->bcast_addr.media_id = b->media->type_id; b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; b->mtu = dev->mtu; - b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr); + b->media->raw2addr(b, &b->addr, (const char *)dev->dev_addr); rcu_assign_pointer(dev->tipc_ptr, b); return 0; } @@ -703,7 +703,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_CHANGEADDR: b->media->raw2addr(b, &b->addr, - (char *)dev->dev_addr); + (const char *)dev->dev_addr); tipc_reset_bearer(net, b); break; case NETDEV_UNREGISTER: diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 57c6a1a719e2..490ad6e5f7a3 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -117,7 +117,7 @@ struct tipc_media { char *msg); int (*raw2addr)(struct tipc_bearer *b, struct tipc_media_addr *addr, - char *raw); + const char *raw); u32 priority; u32 tolerance; u32 min_win; diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index c68019697cfe..cb0d185e06af 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -60,7 +60,7 @@ static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr) /* Convert raw mac address format to media addr format */ static int tipc_eth_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, - char *msg) + const char *msg) { memset(addr, 0, sizeof(*addr)); ether_addr_copy(addr->value, msg); diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index 7aa9ff88458d..b9ad0434c3cd 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -67,7 +67,7 @@ static int tipc_ib_addr2msg(char *msg, struct 
tipc_media_addr *addr) /* Convert raw InfiniBand address format to media addr format */ static int tipc_ib_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, - char *msg) + const char *msg) { memset(addr, 0, sizeof(*addr)); memcpy(addr->value, msg, INFINIBAND_ALEN); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 989d1423a245..4147bb2e7057 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -498,9 +498,15 @@ static int tls_do_encryption(struct sock *sk, int rc, iv_offset = 0; /* For CCM based ciphers, first byte of IV is a constant */ - if (prot->cipher_type == TLS_CIPHER_AES_CCM_128) { + switch (prot->cipher_type) { + case TLS_CIPHER_AES_CCM_128: rec->iv_data[0] = TLS_AES_CCM_IV_B0_BYTE; iv_offset = 1; + break; + case TLS_CIPHER_SM4_CCM: + rec->iv_data[0] = TLS_SM4_CCM_IV_B0_BYTE; + iv_offset = 1; + break; } memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv, @@ -1457,10 +1463,16 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, aad = (u8 *)(sgout + n_sgout); iv = aad + prot->aad_size; - /* For CCM based ciphers, first byte of nonce+iv is always '2' */ - if (prot->cipher_type == TLS_CIPHER_AES_CCM_128) { - iv[0] = 2; + /* For CCM based ciphers, first byte of nonce+iv is a constant */ + switch (prot->cipher_type) { + case TLS_CIPHER_AES_CCM_128: + iv[0] = TLS_AES_CCM_IV_B0_BYTE; iv_offset = 1; + break; + case TLS_CIPHER_SM4_CCM: + iv[0] = TLS_SM4_CCM_IV_B0_BYTE; + iv_offset = 1; + break; } /* Prepare IV */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 92345c9bb60c..89f9e85ae970 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -608,20 +608,42 @@ static void unix_release_sock(struct sock *sk, int embrion) static void init_peercred(struct sock *sk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + if (sk < peersk) { + spin_lock(&sk->sk_peer_lock); + spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&peersk->sk_peer_lock); + spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); + + spin_unlock(&sk->sk_peer_lock); + spin_unlock(&peersk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog) @@ -806,7 +828,7 @@ static void unix_unhash(struct sock *sk) } struct proto unix_dgram_proto = { - .name = "UNIX-DGRAM", + .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, @@ -828,20 +850,25 @@ struct proto unix_stream_proto = { static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { - struct sock *sk = NULL; struct unix_sock *u; + struct sock *sk; + int err; atomic_long_inc(&unix_nr_socks); - if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) - goto out; + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { + err = -ENFILE; + goto err; + } 
if (type == SOCK_STREAM) sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); else /*dgram and seqpacket */ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); - if (!sk) - goto out; + if (!sk) { + err = -ENOMEM; + goto err; + } sock_init_data(sock, sk); @@ -861,20 +888,23 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); -out: - if (sk == NULL) - atomic_long_dec(&unix_nr_socks); - else { - local_bh_disable(); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - local_bh_enable(); - } + + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + local_bh_enable(); + return sk; + +err: + atomic_long_dec(&unix_nr_socks); + return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct sock *sk; + if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -901,7 +931,11 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern, sock->type) ? 0 : -ENOMEM; + sk = unix_create1(net, sock, kern, sock->type); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + return 0; } static int unix_release(struct socket *sock) @@ -1314,12 +1348,15 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, we will have to recheck all again in any case. */ - err = -ENOMEM; - /* create new sock for complete connection */ newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); - if (newsk == NULL) + if (IS_ERR(newsk)) { + err = PTR_ERR(newsk); + newsk = NULL; goto out; + } + + err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); @@ -2845,6 +2882,9 @@ static int unix_shutdown(struct socket *sock, int mode) unix_state_lock(sk); sk->sk_shutdown |= mode; + if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && + mode == SHUTDOWN_MASK) + sk->sk_state = TCP_CLOSE; other = unix_peer(sk); if (other) sock_hold(other); @@ -2867,12 +2907,10 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_shutdown |= peer_mode; unix_state_unlock(other); other->sk_state_change(other); - if (peer_mode == SHUTDOWN_MASK) { + if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); - other->sk_state = TCP_CLOSE; - } else if (peer_mode & RCV_SHUTDOWN) { + else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); - } } if (other) sock_put(other); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index e2c0cfb334d2..7d851eb3a683 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1614,13 +1614,18 @@ static int vsock_connectible_setsockopt(struct socket *sock, vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; - case SO_VM_SOCKETS_CONNECT_TIMEOUT: { - struct __kernel_old_timeval tv; - COPY_IN(tv); + case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: + case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: { + struct __kernel_sock_timeval tv; + + err = sock_copy_user_timeval(&tv, optval, optlen, + optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); + if (err) + break; if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { vsk->connect_timeout = tv.tv_sec * HZ + - DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ)); + DIV_ROUND_UP((unsigned long)tv.tv_usec, 
(USEC_PER_SEC / HZ)); if (vsk->connect_timeout == 0) vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; @@ -1648,68 +1653,59 @@ static int vsock_connectible_getsockopt(struct socket *sock, char __user *optval, int __user *optlen) { - int err; + struct sock *sk = sock->sk; + struct vsock_sock *vsk = vsock_sk(sk); + + union { + u64 val64; + struct old_timeval32 tm32; + struct __kernel_old_timeval tm; + struct __kernel_sock_timeval stm; + } v; + + int lv = sizeof(v.val64); int len; - struct sock *sk; - struct vsock_sock *vsk; - u64 val; if (level != AF_VSOCK) return -ENOPROTOOPT; - err = get_user(len, optlen); - if (err != 0) - return err; - -#define COPY_OUT(_v) \ - do { \ - if (len < sizeof(_v)) \ - return -EINVAL; \ - \ - len = sizeof(_v); \ - if (copy_to_user(optval, &_v, len) != 0) \ - return -EFAULT; \ - \ - } while (0) + if (get_user(len, optlen)) + return -EFAULT; - err = 0; - sk = sock->sk; - vsk = vsock_sk(sk); + memset(&v, 0, sizeof(v)); switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: - val = vsk->buffer_size; - COPY_OUT(val); + v.val64 = vsk->buffer_size; break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: - val = vsk->buffer_max_size; - COPY_OUT(val); + v.val64 = vsk->buffer_max_size; break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: - val = vsk->buffer_min_size; - COPY_OUT(val); + v.val64 = vsk->buffer_min_size; break; - case SO_VM_SOCKETS_CONNECT_TIMEOUT: { - struct __kernel_old_timeval tv; - tv.tv_sec = vsk->connect_timeout / HZ; - tv.tv_usec = - (vsk->connect_timeout - - tv.tv_sec * HZ) * (1000000 / HZ); - COPY_OUT(tv); + case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: + case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: + lv = sock_get_timeout(vsk->connect_timeout, &v, + optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); break; - } + default: return -ENOPROTOOPT; } - err = put_user(len, optlen); - if (err != 0) + if (len < lv) + return -EINVAL; + if (len > lv) + len = lv; + if (copy_to_user(optval, &v, len)) return -EFAULT; -#undef COPY_OUT + if (put_user(len, optlen)) + return -EFAULT; return 0; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d6b500dc4208..f16074eb53c7 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -134,21 +134,6 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, return 0; } -void xp_release(struct xdp_buff_xsk *xskb) -{ - xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; -} - -static u64 xp_get_handle(struct xdp_buff_xsk *xskb) -{ - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; - - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); -} - static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 8de01aaac4a0..90c4e1e819d3 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -44,12 +44,13 @@ void xp_destroy(struct xsk_buff_pool *pool) struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem) { + bool unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; struct xsk_buff_pool *pool; struct xdp_buff_xsk *xskb; - u32 i; + u32 i, entries; - pool = kvzalloc(struct_size(pool, free_heads, umem->chunks), - GFP_KERNEL); + entries = unaligned ? 
umem->chunks : 0; + pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL); if (!pool) goto out; @@ -63,7 +64,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->free_heads_cnt = umem->chunks; pool->headroom = umem->headroom; pool->chunk_size = umem->chunk_size; - pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; + pool->chunk_shift = ffs(umem->chunk_size) - 1; + pool->unaligned = unaligned; pool->frame_len = umem->chunk_size - umem->headroom - XDP_PACKET_HEADROOM; pool->umem = umem; @@ -81,7 +83,10 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb = &pool->heads[i]; xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; - pool->free_heads[i] = xskb; + if (pool->unaligned) + pool->free_heads[i] = xskb; + else + xp_init_xskb_addr(xskb, pool, i * pool->chunk_size); } return pool; @@ -406,6 +411,12 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, if (pool->unaligned) xp_check_dma_contiguity(dma_map); + else + for (i = 0; i < pool->heads_cnt; i++) { + struct xdp_buff_xsk *xskb = &pool->heads[i]; + + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + } err = xp_init_dma_info(pool, dma_map); if (err) { @@ -448,12 +459,9 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) if (pool->free_heads_cnt == 0) return NULL; - xskb = pool->free_heads[--pool->free_heads_cnt]; - for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { pool->fq->queue_empty_descs++; - xp_release(xskb); return NULL; } @@ -466,17 +474,17 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) } break; } - xskq_cons_release(pool->fq); - xskb->orig_addr = addr; - xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; - if (pool->dma_pages_cnt) { - xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & - ~XSK_NEXT_PG_CONTIG_MASK) + - (addr & ~PAGE_MASK); - xskb->dma = xskb->frame_dma + pool->headroom + - XDP_PACKET_HEADROOM; + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages_cnt) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; } + + xskq_cons_release(pool->fq); return xskb; } @@ -507,6 +515,96 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) } EXPORT_SYMBOL(xp_alloc); +static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + u32 i, cached_cons, nb_entries; + + if (max > pool->free_heads_cnt) + max = pool->free_heads_cnt; + max = xskq_cons_nb_entries(pool->fq, max); + + cached_cons = pool->fq->cached_cons; + nb_entries = max; + i = max; + while (i--) { + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + __xskq_cons_read_addr_unchecked(pool->fq, cached_cons++, &addr); + + ok = pool->unaligned ? 
xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (unlikely(!ok)) { + pool->fq->invalid_descs++; + nb_entries--; + continue; + } + + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages_cnt) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; + } + + *xdp = &xskb->xdp; + xdp++; + } + + xskq_cons_release_n(pool->fq, max); + return nb_entries; +} + +static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 nb_entries) +{ + struct xdp_buff_xsk *xskb; + u32 i; + + nb_entries = min_t(u32, nb_entries, pool->free_list_cnt); + + i = nb_entries; + while (i--) { + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); + list_del(&xskb->free_list_node); + + *xdp = &xskb->xdp; + xdp++; + } + pool->free_list_cnt -= nb_entries; + + return nb_entries; +} + +u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + u32 nb_entries1 = 0, nb_entries2; + + if (unlikely(pool->dma_need_sync)) { + /* Slow path */ + *xdp = xp_alloc(pool); + return !!*xdp; + } + + if (unlikely(pool->free_list_cnt)) { + nb_entries1 = xp_alloc_reused(pool, xdp, max); + if (nb_entries1 == max) + return nb_entries1; + + max -= nb_entries1; + xdp += nb_entries1; + } + + nb_entries2 = xp_alloc_new_from_fq(pool, xdp, max); + if (!nb_entries2) + pool->fq->queue_empty_descs++; + + return nb_entries1 + nb_entries2; +} +EXPORT_SYMBOL(xp_alloc_batch); + bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) { if (pool->free_list_cnt >= count) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9ae13cccfb28..e9aa2c236356 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -111,14 +111,18 @@ struct xsk_queue { /* Functions that read and validate content from consumer rings. */ -static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = cached_cons & q->ring_mask; - if (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; + *addr = ring->desc[idx]; +} - *addr = ring->desc[idx]; +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_cons != q->cached_prod) { + __xskq_cons_read_addr_unchecked(q, q->cached_cons, addr); return true; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 03b66d154b2b..3a3cb09eec12 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1961,24 +1961,65 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, return skb; } +static int xfrm_notify_userpolicy(struct net *net) +{ + struct xfrm_userpolicy_default *up; + int len = NLMSG_ALIGN(sizeof(*up)); + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err; + + skb = nlmsg_new(len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_GETDEFAULT, sizeof(*up), 0); + if (nlh == NULL) { + kfree_skb(skb); + return -EMSGSIZE; + } + + up = nlmsg_data(nlh); + up->in = net->xfrm.policy_default & XFRM_POL_DEFAULT_IN ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + up->fwd = net->xfrm.policy_default & XFRM_POL_DEFAULT_FWD ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + up->out = net->xfrm.policy_default & XFRM_POL_DEFAULT_OUT ? 
+ XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + + nlmsg_end(skb, nlh); + + rcu_read_lock(); + err = xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY); + rcu_read_unlock(); + + return err; +} + static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { struct net *net = sock_net(skb->sk); struct xfrm_userpolicy_default *up = nlmsg_data(nlh); - u8 dirmask; - u8 old_default = net->xfrm.policy_default; - if (up->dirmask >= XFRM_USERPOLICY_DIRMASK_MAX) - return -EINVAL; + if (up->in == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_IN; + else if (up->in == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_IN; - dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK; + if (up->fwd == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_FWD; + else if (up->fwd == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_FWD; - net->xfrm.policy_default = (old_default & (0xff ^ dirmask)) - | (up->action << up->dirmask); + if (up->out == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_OUT; + else if (up->out == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_OUT; rt_genid_bump_all(net); + xfrm_notify_userpolicy(net); return 0; } @@ -1988,13 +2029,11 @@ static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *r_skb; struct nlmsghdr *r_nlh; struct net *net = sock_net(skb->sk); - struct xfrm_userpolicy_default *r_up, *up; + struct xfrm_userpolicy_default *r_up; int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default)); u32 portid = NETLINK_CB(skb).portid; u32 seq = nlh->nlmsg_seq; - up = nlmsg_data(nlh); - r_skb = nlmsg_new(len, GFP_ATOMIC); if (!r_skb) return -ENOMEM; @@ -2007,8 +2046,12 @@ static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, r_up = nlmsg_data(r_nlh); - r_up->action = ((net->xfrm.policy_default & (1 << up->dirmask)) >> up->dirmask); - r_up->dirmask = up->dirmask; + r_up->in = net->xfrm.policy_default & XFRM_POL_DEFAULT_IN ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + r_up->fwd = net->xfrm.policy_default & XFRM_POL_DEFAULT_FWD ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + r_up->out = net->xfrm.policy_default & XFRM_POL_DEFAULT_OUT ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; nlmsg_end(r_skb, r_nlh); return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid); |