summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorPablo Neira Ayuso <pablo@netfilter.org>2016-09-25 23:23:57 +0200
committerPablo Neira Ayuso <pablo@netfilter.org>2016-09-25 23:34:19 +0200
commitf20fbc0717f9f007c94b2641134b19228d0ce9ed (patch)
tree1404248ebbec552a3fb7928b75322b65d74de1bd /net
parent8cb2a7d5667ab9a9c2fdd356357b85b63b320901 (diff)
parentfe0acb5fcb7fe8cb3d68bbdb8459865c972d8f83 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Conflicts: net/netfilter/core.c net/netfilter/nf_tables_netdev.c Resolve two conflicts before pull request for David's net-next tree: 1) Between c73c24849011 ("netfilter: nf_tables_netdev: remove redundant ip_hdr assignment") from the net tree and commit ddc8b6027ad0 ("netfilter: introduce nft_set_pktinfo_{ipv4, ipv6}_validate()"). 2) Between e8bffe0cf964 ("net: Add _nf_(un)register_hooks symbols") and Aaron Conole's patches to replace list_head with single linked list. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/ndisc.c2
-rw-r--r--net/appletalk/ddp.c2
-rw-r--r--net/atm/lec.c12
-rw-r--r--net/batman-adv/bat_v_elp.c2
-rw-r--r--net/batman-adv/routing.c28
-rw-r--r--net/bluetooth/af_bluetooth.c15
-rw-r--r--net/bluetooth/hci_core.c1
-rw-r--r--net/bluetooth/hci_request.c49
-rw-r--r--net/bluetooth/hci_request.h5
-rw-r--r--net/bluetooth/hci_sock.c396
-rw-r--r--net/bluetooth/leds.c27
-rw-r--r--net/bluetooth/leds.h10
-rw-r--r--net/bluetooth/mgmt.c349
-rw-r--r--net/bluetooth/mgmt_util.c66
-rw-r--r--net/bluetooth/smp.c5
-rw-r--r--net/bridge/br_input.c7
-rw-r--r--net/bridge/br_multicast.c2
-rw-r--r--net/bridge/br_stp_if.c43
-rw-r--r--net/bridge/netfilter/ebtables.c2
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c1
-rw-r--r--net/core/dev.c19
-rw-r--r--net/core/filter.c344
-rw-r--r--net/core/flow_dissector.c6
-rw-r--r--net/core/rtnetlink.c195
-rw-r--r--net/core/skbuff.c85
-rw-r--r--net/core/sock.c5
-rw-r--r--net/dsa/Kconfig3
-rw-r--r--net/dsa/Makefile1
-rw-r--r--net/dsa/dsa.c11
-rw-r--r--net/dsa/dsa2.c12
-rw-r--r--net/dsa/dsa_priv.h2
-rw-r--r--net/dsa/slave.c35
-rw-r--r--net/dsa/tag_qca.c138
-rw-r--r--net/ipv4/Kconfig18
-rw-r--r--net/ipv4/Makefile3
-rw-r--r--net/ipv4/af_inet.c14
-rw-r--r--net/ipv4/devinet.c11
-rw-r--r--net/ipv4/fib_frontend.c3
-rw-r--r--net/ipv4/fib_rules.c3
-rw-r--r--net/ipv4/fib_semantics.c8
-rw-r--r--net/ipv4/fib_trie.c10
-rw-r--r--net/ipv4/gre_offload.c6
-rw-r--r--net/ipv4/inet_diag.c49
-rw-r--r--net/ipv4/ip_gre.c23
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_output.c13
-rw-r--r--net/ipv4/ip_sockglue.c7
-rw-r--r--net/ipv4/ip_tunnel.c76
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ip_vti.c15
-rw-r--r--net/ipv4/ipip.c35
-rw-r--r--net/ipv4/ipmr.c7
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c11
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c1
-rw-r--r--net/ipv4/raw.c6
-rw-r--r--net/ipv4/route.c34
-rw-r--r--net/ipv4/tcp.c61
-rw-r--r--net/ipv4/tcp_bbr.c896
-rw-r--r--net/ipv4/tcp_cdg.c12
-rw-r--r--net/ipv4/tcp_cong.c2
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_input.c495
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_minisocks.c6
-rw-r--r--net/ipv4/tcp_offload.c13
-rw-r--r--net/ipv4/tcp_output.c106
-rw-r--r--net/ipv4/tcp_rate.c186
-rw-r--r--net/ipv4/tcp_timer.c1
-rw-r--r--net/ipv4/tcp_yeah.c2
-rw-r--r--net/ipv4/udp.c6
-rw-r--r--net/ipv4/udp_diag.c14
-rw-r--r--net/ipv4/udp_offload.c6
-rw-r--r--net/ipv4/xfrm4_policy.c4
-rw-r--r--net/ipv6/addrconf.c18
-rw-r--r--net/ipv6/fib6_rules.c3
-rw-r--r--net/ipv6/ip6_fib.c6
-rw-r--r--net/ipv6/ip6_gre.c2
-rw-r--r--net/ipv6/ip6_offload.c5
-rw-r--r--net/ipv6/ip6_output.c19
-rw-r--r--net/ipv6/ip6_tunnel.c176
-rw-r--r--net/ipv6/ip6_vti.c19
-rw-r--r--net/ipv6/ip6mr.c7
-rw-r--r--net/ipv6/ndisc.c11
-rw-r--r--net/ipv6/netfilter/nft_chain_route_ipv6.c10
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c1
-rw-r--r--net/ipv6/output_core.c7
-rw-r--r--net/ipv6/ping.c9
-rw-r--r--net/ipv6/raw.c7
-rw-r--r--net/ipv6/route.c48
-rw-r--r--net/ipv6/tcp_ipv6.c8
-rw-r--r--net/ipv6/xfrm6_input.c15
-rw-r--r--net/ipv6/xfrm6_policy.c4
-rw-r--r--net/ipv6/xfrm6_tunnel.c2
-rw-r--r--net/irda/af_irda.c5
-rw-r--r--net/kcm/kcmsock.c3
-rw-r--r--net/l2tp/l2tp_core.c3
-rw-r--r--net/l2tp/l2tp_core.h2
-rw-r--r--net/l2tp/l2tp_eth.c6
-rw-r--r--net/l2tp/l2tp_ppp.c4
-rw-r--r--net/l3mdev/l3mdev.c105
-rw-r--r--net/llc/af_llc.c4
-rw-r--r--net/mac80211/agg-rx.c19
-rw-r--r--net/mac80211/agg-tx.c3
-rw-r--r--net/mac80211/cfg.c35
-rw-r--r--net/mac80211/debugfs.c152
-rw-r--r--net/mac80211/debugfs_netdev.c37
-rw-r--r--net/mac80211/debugfs_sta.c56
-rw-r--r--net/mac80211/driver-ops.c2
-rw-r--r--net/mac80211/driver-ops.h18
-rw-r--r--net/mac80211/ieee80211_i.h13
-rw-r--r--net/mac80211/iface.c21
-rw-r--r--net/mac80211/main.c3
-rw-r--r--net/mac80211/mesh_hwmp.c3
-rw-r--r--net/mac80211/mesh_pathtbl.c2
-rw-r--r--net/mac80211/mlme.c12
-rw-r--r--net/mac80211/pm.c3
-rw-r--r--net/mac80211/rx.c76
-rw-r--r--net/mac80211/scan.c2
-rw-r--r--net/mac80211/sta_info.c61
-rw-r--r--net/mac80211/sta_info.h22
-rw-r--r--net/mac80211/status.c15
-rw-r--r--net/mac80211/tdls.c7
-rw-r--r--net/mac80211/tx.c94
-rw-r--r--net/mac80211/util.c3
-rw-r--r--net/mac802154/iface.c1
-rw-r--r--net/mac802154/rx.c9
-rw-r--r--net/netfilter/core.c51
-rw-r--r--net/netfilter/nf_conntrack_core.c6
-rw-r--r--net/netfilter/nf_nat_core.c5
-rw-r--r--net/netfilter/nf_tables_trace.c2
-rw-r--r--net/netfilter/nfnetlink_acct.c6
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c49
-rw-r--r--net/netfilter/nft_meta.c17
-rw-r--r--net/netfilter/nft_reject.c16
-rw-r--r--net/netfilter/nft_reject_inet.c7
-rw-r--r--net/netfilter/xt_sctp.c2
-rw-r--r--net/netlink/diag.c1
-rw-r--r--net/openvswitch/actions.c46
-rw-r--r--net/openvswitch/datapath.c25
-rw-r--r--net/openvswitch/flow.c107
-rw-r--r--net/openvswitch/flow.h12
-rw-r--r--net/openvswitch/flow_netlink.c316
-rw-r--r--net/openvswitch/flow_netlink.h3
-rw-r--r--net/openvswitch/flow_table.c25
-rw-r--r--net/openvswitch/vport.c7
-rw-r--r--net/rxrpc/Kconfig14
-rw-r--r--net/rxrpc/af_rxrpc.c126
-rw-r--r--net/rxrpc/ar-internal.h607
-rw-r--r--net/rxrpc/call_accept.c699
-rw-r--r--net/rxrpc/call_event.c1404
-rw-r--r--net/rxrpc/call_object.c781
-rw-r--r--net/rxrpc/conn_client.c114
-rw-r--r--net/rxrpc/conn_event.c150
-rw-r--r--net/rxrpc/conn_object.c91
-rw-r--r--net/rxrpc/conn_service.c123
-rw-r--r--net/rxrpc/input.c1391
-rw-r--r--net/rxrpc/insecure.c26
-rw-r--r--net/rxrpc/local_event.c19
-rw-r--r--net/rxrpc/local_object.c52
-rw-r--r--net/rxrpc/misc.c177
-rw-r--r--net/rxrpc/output.c396
-rw-r--r--net/rxrpc/peer_event.c90
-rw-r--r--net/rxrpc/peer_object.c184
-rw-r--r--net/rxrpc/proc.c42
-rw-r--r--net/rxrpc/recvmsg.c823
-rw-r--r--net/rxrpc/rxkad.c202
-rw-r--r--net/rxrpc/security.c12
-rw-r--r--net/rxrpc/sendmsg.c222
-rw-r--r--net/rxrpc/skbuff.c194
-rw-r--r--net/rxrpc/sysctl.c12
-rw-r--r--net/rxrpc/utils.c2
-rw-r--r--net/sched/Kconfig27
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/act_api.c36
-rw-r--r--net/sched/act_bpf.c5
-rw-r--r--net/sched/act_csum.c36
-rw-r--r--net/sched/act_gact.c3
-rw-r--r--net/sched/act_ife.c26
-rw-r--r--net/sched/act_meta_skbtcindex.c79
-rw-r--r--net/sched/act_mirred.c11
-rw-r--r--net/sched/act_police.c12
-rw-r--r--net/sched/act_skbmod.c301
-rw-r--r--net/sched/act_tunnel_key.c342
-rw-r--r--net/sched/act_vlan.c29
-rw-r--r--net/sched/cls_api.c18
-rw-r--r--net/sched/cls_bpf.c126
-rw-r--r--net/sched/cls_flow.c27
-rw-r--r--net/sched/cls_flower.c124
-rw-r--r--net/sched/cls_fw.c10
-rw-r--r--net/sched/cls_route.c12
-rw-r--r--net/sched/cls_tcindex.c12
-rw-r--r--net/sched/cls_u32.c30
-rw-r--r--net/sched/sch_api.c41
-rw-r--r--net/sched/sch_codel.c4
-rw-r--r--net/sched/sch_fifo.c4
-rw-r--r--net/sched/sch_fq.c71
-rw-r--r--net/sched/sch_generic.c28
-rw-r--r--net/sched/sch_htb.c24
-rw-r--r--net/sched/sch_netem.c20
-rw-r--r--net/sched/sch_pie.c4
-rw-r--r--net/sctp/associola.c2
-rw-r--r--net/sctp/auth.c2
-rw-r--r--net/sctp/chunk.c26
-rw-r--r--net/sctp/input.c35
-rw-r--r--net/sctp/inqueue.c2
-rw-r--r--net/sctp/output.c73
-rw-r--r--net/sctp/outqueue.c99
-rw-r--r--net/sctp/sctp_diag.c20
-rw-r--r--net/sctp/sm_make_chunk.c28
-rw-r--r--net/sctp/sm_sideeffect.c25
-rw-r--r--net/sctp/sm_statefuns.c6
-rw-r--r--net/sctp/socket.c8
-rw-r--r--net/sctp/transport.c4
-rw-r--r--net/sctp/ulpevent.c4
-rw-r--r--net/sctp/ulpqueue.c3
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c5
-rw-r--r--net/sunrpc/clnt.c4
-rw-r--r--net/sunrpc/xprtrdma/verbs.c45
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h1
-rw-r--r--net/sunrpc/xprtsock.c2
-rw-r--r--net/tipc/name_distr.c8
-rw-r--r--net/tipc/udp_media.c3
-rw-r--r--net/unix/af_unix.c111
-rw-r--r--net/wireless/core.c2
-rw-r--r--net/wireless/core.h6
-rw-r--r--net/wireless/ibss.c11
-rw-r--r--net/wireless/mlme.c2
-rw-r--r--net/wireless/nl80211.c87
-rw-r--r--net/wireless/scan.c58
-rw-r--r--net/wireless/sme.c3
-rw-r--r--net/wireless/sysfs.c2
-rw-r--r--net/wireless/util.c13
-rw-r--r--net/wireless/wext-compat.c9
-rw-r--r--net/wireless/wext-core.c25
-rw-r--r--net/wireless/wext-sme.c5
-rw-r--r--net/x25/af_x25.c4
-rw-r--r--net/xfrm/xfrm_algo.c2
-rw-r--r--net/xfrm/xfrm_input.c14
-rw-r--r--net/xfrm/xfrm_policy.c149
-rw-r--r--net/xfrm/xfrm_replay.c6
-rw-r--r--net/xfrm/xfrm_state.c126
-rw-r--r--net/xfrm/xfrm_sysctl.c4
-rw-r--r--net/xfrm/xfrm_user.c22
243 files changed, 10530 insertions, 5889 deletions
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
index 86450b7e2899..941df2fa4448 100644
--- a/net/6lowpan/ndisc.c
+++ b/net/6lowpan/ndisc.c
@@ -101,8 +101,6 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short);
if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr))
neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
- } else {
- neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC);
}
write_unlock_bh(&n->lock);
}
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index f066781be3c8..10d2bdce686e 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1278,7 +1278,7 @@ out:
return err;
}
-#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE)
+#if IS_ENABLED(CONFIG_IPDDP)
static __inline__ int is_ip_over_ddp(struct sk_buff *skb)
{
return skb->data[12] == 22;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index e574a7e9db6f..5d2693826afb 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -31,7 +31,7 @@
#include <linux/atmlec.h>
/* Proxy LEC knows about bridging */
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
#include "../bridge/br_private.h"
static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
@@ -121,7 +121,7 @@ static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
/* Device structures */
static struct net_device *dev_lec[MAX_LEC_ITF];
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
{
char *buff;
@@ -155,7 +155,7 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
sk->sk_data_ready(sk);
}
}
-#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+#endif /* IS_ENABLED(CONFIG_BRIDGE) */
/*
* Open/initialize the netdevice. This is called (in the current kernel)
@@ -222,7 +222,7 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n",
(long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb),
(long)skb_end_pointer(skb));
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0)
lec_handle_bridge(skb, dev);
#endif
@@ -426,7 +426,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
(unsigned short)(0xffff & mesg->content.normal.flag);
break;
case l_should_bridge:
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#if IS_ENABLED(CONFIG_BRIDGE)
{
pr_debug("%s: bridge zeppelin asks about %pM\n",
dev->name, mesg->content.proxy.mac_addr);
@@ -452,7 +452,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
sk->sk_data_ready(sk);
}
}
-#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+#endif /* IS_ENABLED(CONFIG_BRIDGE) */
break;
default:
pr_info("%s: Unknown message type %d\n", dev->name, mesg->type);
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 7d170010beb9..ee08540ce503 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -335,7 +335,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
goto out;
skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN);
- elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN);
+ elp_buff = skb_put(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN);
elp_packet = (struct batadv_elp_packet *)elp_buff;
memset(elp_packet, 0, BATADV_ELP_HLEN);
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 610f2c45edcd..7e8dc648b95a 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -461,6 +461,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
}
/**
+ * batadv_last_bonding_get - Get last_bonding_candidate of orig_node
+ * @orig_node: originator node whose last bonding candidate should be retrieved
+ *
+ * Return: last bonding candidate of router or NULL if not found
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+static struct batadv_orig_ifinfo *
+batadv_last_bonding_get(struct batadv_orig_node *orig_node)
+{
+ struct batadv_orig_ifinfo *last_bonding_candidate;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ last_bonding_candidate = orig_node->last_bonding_candidate;
+
+ if (last_bonding_candidate)
+ kref_get(&last_bonding_candidate->refcount);
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ return last_bonding_candidate;
+}
+
+/**
* batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node
* @orig_node: originator node whose bonding candidates should be replaced
* @new_candidate: new bonding candidate or NULL
@@ -530,7 +553,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
* router - obviously there are no other candidates.
*/
rcu_read_lock();
- last_candidate = orig_node->last_bonding_candidate;
+ last_candidate = batadv_last_bonding_get(orig_node);
if (last_candidate)
last_cand_router = rcu_dereference(last_candidate->router);
@@ -622,6 +645,9 @@ next:
batadv_orig_ifinfo_put(next_candidate);
}
+ if (last_candidate)
+ batadv_orig_ifinfo_put(last_candidate);
+
return router;
}
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 0b5f729d08d2..1aff2da9bc74 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -26,11 +26,13 @@
#include <linux/module.h>
#include <linux/debugfs.h>
+#include <linux/stringify.h>
#include <asm/ioctls.h>
#include <net/bluetooth/bluetooth.h>
#include <linux/proc_fs.h>
+#include "leds.h"
#include "selftest.h"
/* Bluetooth sockets */
@@ -712,13 +714,16 @@ static struct net_proto_family bt_sock_family_ops = {
struct dentry *bt_debugfs;
EXPORT_SYMBOL_GPL(bt_debugfs);
+#define VERSION __stringify(BT_SUBSYS_VERSION) "." \
+ __stringify(BT_SUBSYS_REVISION)
+
static int __init bt_init(void)
{
int err;
sock_skb_cb_check_size(sizeof(struct bt_skb_cb));
- BT_INFO("Core ver %s", BT_SUBSYS_VERSION);
+ BT_INFO("Core ver %s", VERSION);
err = bt_selftest();
if (err < 0)
@@ -726,6 +731,8 @@ static int __init bt_init(void)
bt_debugfs = debugfs_create_dir("bluetooth", NULL);
+ bt_leds_init();
+
err = bt_sysfs_init();
if (err < 0)
return err;
@@ -785,6 +792,8 @@ static void __exit bt_exit(void)
bt_sysfs_cleanup();
+ bt_leds_cleanup();
+
debugfs_remove_recursive(bt_debugfs);
}
@@ -792,7 +801,7 @@ subsys_initcall(bt_init);
module_exit(bt_exit);
MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
-MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION);
-MODULE_VERSION(BT_SUBSYS_VERSION);
+MODULE_DESCRIPTION("Bluetooth Core ver " VERSION);
+MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_BLUETOOTH);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ddf8432fe8fb..3ac89e9ace71 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1562,6 +1562,7 @@ int hci_dev_do_close(struct hci_dev *hdev)
auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
if (!auto_off && hdev->dev_type == HCI_PRIMARY &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
hci_dev_test_flag(hdev, HCI_MGMT))
__mgmt_power_off(hdev);
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index b0e23dfc5c34..c8135680c43e 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -971,14 +971,14 @@ void __hci_req_enable_advertising(struct hci_request *req)
hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
}
-static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+static u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
{
- u8 ad_len = 0;
size_t name_len;
+ int max_len;
+ max_len = HCI_MAX_AD_LENGTH - ad_len - 2;
name_len = strlen(hdev->dev_name);
- if (name_len > 0) {
- size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2;
+ if (name_len > 0 && max_len > 0) {
if (name_len > max_len) {
name_len = max_len;
@@ -997,22 +997,42 @@ static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
return ad_len;
}
+static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+{
+ return append_local_name(hdev, ptr, 0);
+}
+
static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
u8 *ptr)
{
struct adv_info *adv_instance;
+ u32 instance_flags;
+ u8 scan_rsp_len = 0;
adv_instance = hci_find_adv_instance(hdev, instance);
if (!adv_instance)
return 0;
- /* TODO: Set the appropriate entries based on advertising instance flags
- * here once flags other than 0 are supported.
- */
+ instance_flags = adv_instance->flags;
+
+ if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) {
+ ptr[0] = 3;
+ ptr[1] = EIR_APPEARANCE;
+ put_unaligned_le16(hdev->appearance, ptr + 2);
+ scan_rsp_len += 4;
+ ptr += 4;
+ }
+
memcpy(ptr, adv_instance->scan_rsp_data,
adv_instance->scan_rsp_len);
- return adv_instance->scan_rsp_len;
+ scan_rsp_len += adv_instance->scan_rsp_len;
+ ptr += adv_instance->scan_rsp_len;
+
+ if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len);
+
+ return scan_rsp_len;
}
void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
@@ -1194,7 +1214,7 @@ static void adv_timeout_expire(struct work_struct *work)
hci_req_init(&req, hdev);
- hci_req_clear_adv_instance(hdev, &req, instance, false);
+ hci_req_clear_adv_instance(hdev, NULL, &req, instance, false);
if (list_empty(&hdev->adv_instances))
__hci_req_disable_advertising(&req);
@@ -1284,8 +1304,9 @@ static void cancel_adv_timeout(struct hci_dev *hdev)
* setting.
* - force == false: Only instances that have a timeout will be removed.
*/
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
- u8 instance, bool force)
+void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
+ struct hci_request *req, u8 instance,
+ bool force)
{
struct adv_info *adv_instance, *n, *next_instance = NULL;
int err;
@@ -1311,7 +1332,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
rem_inst = adv_instance->instance;
err = hci_remove_adv_instance(hdev, rem_inst);
if (!err)
- mgmt_advertising_removed(NULL, hdev, rem_inst);
+ mgmt_advertising_removed(sk, hdev, rem_inst);
}
} else {
adv_instance = hci_find_adv_instance(hdev, instance);
@@ -1325,7 +1346,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
err = hci_remove_adv_instance(hdev, instance);
if (!err)
- mgmt_advertising_removed(NULL, hdev, instance);
+ mgmt_advertising_removed(sk, hdev, instance);
}
}
@@ -1716,7 +1737,7 @@ void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
* function. To be safe hard-code one of the
* values that's suitable for SCO.
*/
- rej.reason = HCI_ERROR_REMOTE_LOW_RESOURCES;
+ rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ,
sizeof(rej), &rej);
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index b2d044bdc732..ac1e11006f38 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -73,8 +73,9 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);
int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
bool force);
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
- u8 instance, bool force);
+void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
+ struct hci_request *req, u8 instance,
+ bool force);
void __hci_req_update_class(struct hci_request *req);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 96f04b7b9556..48f9471e7c85 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/utsname.h>
+#include <linux/sched.h>
#include <asm/unaligned.h>
#include <net/bluetooth/bluetooth.h>
@@ -38,6 +39,8 @@
static LIST_HEAD(mgmt_chan_list);
static DEFINE_MUTEX(mgmt_chan_list_lock);
+static DEFINE_IDA(sock_cookie_ida);
+
static atomic_t monitor_promisc = ATOMIC_INIT(0);
/* ----- HCI socket interface ----- */
@@ -52,6 +55,8 @@ struct hci_pinfo {
__u32 cmsg_mask;
unsigned short channel;
unsigned long flags;
+ __u32 cookie;
+ char comm[TASK_COMM_LEN];
};
void hci_sock_set_flag(struct sock *sk, int nr)
@@ -74,6 +79,38 @@ unsigned short hci_sock_get_channel(struct sock *sk)
return hci_pi(sk)->channel;
}
+u32 hci_sock_get_cookie(struct sock *sk)
+{
+ return hci_pi(sk)->cookie;
+}
+
+static bool hci_sock_gen_cookie(struct sock *sk)
+{
+ int id = hci_pi(sk)->cookie;
+
+ if (!id) {
+ id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL);
+ if (id < 0)
+ id = 0xffffffff;
+
+ hci_pi(sk)->cookie = id;
+ get_task_comm(hci_pi(sk)->comm, current);
+ return true;
+ }
+
+ return false;
+}
+
+static void hci_sock_free_cookie(struct sock *sk)
+{
+ int id = hci_pi(sk)->cookie;
+
+ if (id) {
+ hci_pi(sk)->cookie = 0xffffffff;
+ ida_simple_remove(&sock_cookie_ida, id);
+ }
+}
+
static inline int hci_test_bit(int nr, const void *addr)
{
return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31));
@@ -305,6 +342,60 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
kfree_skb(skb_copy);
}
+void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
+ void *data, u16 data_len, ktime_t tstamp,
+ int flag, struct sock *skip_sk)
+{
+ struct sock *sk;
+ __le16 index;
+
+ if (hdev)
+ index = cpu_to_le16(hdev->id);
+ else
+ index = cpu_to_le16(MGMT_INDEX_NONE);
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL)
+ continue;
+
+ /* Ignore socket without the flag set */
+ if (!hci_sock_test_flag(sk, flag))
+ continue;
+
+ /* Skip the original socket */
+ if (sk == skip_sk)
+ continue;
+
+ skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC);
+ if (!skb)
+ continue;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(event, skb_put(skb, 2));
+
+ if (data)
+ memcpy(skb_put(skb, data_len), data, data_len);
+
+ skb->tstamp = tstamp;
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
+ hdr->index = index;
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+}
+
static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
{
struct hci_mon_hdr *hdr;
@@ -384,6 +475,129 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
return skb;
}
+static struct sk_buff *create_monitor_ctrl_open(struct sock *sk)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+ u16 format;
+ u8 ver[3];
+ u32 flags;
+
+ /* No message needed when cookie is not present */
+ if (!hci_pi(sk)->cookie)
+ return NULL;
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ format = 0x0000;
+ ver[0] = BT_SUBSYS_VERSION;
+ put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
+ break;
+ case HCI_CHANNEL_USER:
+ format = 0x0001;
+ ver[0] = BT_SUBSYS_VERSION;
+ put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1);
+ break;
+ case HCI_CHANNEL_CONTROL:
+ format = 0x0002;
+ mgmt_fill_version_info(ver);
+ break;
+ default:
+ /* No message for unsupported format */
+ return NULL;
+ }
+
+ skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(format, skb_put(skb, 2));
+ memcpy(skb_put(skb, sizeof(ver)), ver, sizeof(ver));
+ put_unaligned_le32(flags, skb_put(skb, 4));
+ *skb_put(skb, 1) = TASK_COMM_LEN;
+ memcpy(skb_put(skb, TASK_COMM_LEN), hci_pi(sk)->comm, TASK_COMM_LEN);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN);
+ if (hci_pi(sk)->hdev)
+ hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
+ else
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static struct sk_buff *create_monitor_ctrl_close(struct sock *sk)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ /* No message needed when cookie is not present */
+ if (!hci_pi(sk)->cookie)
+ return NULL;
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_CONTROL:
+ break;
+ default:
+ /* No message for unsupported format */
+ return NULL;
+ }
+
+ skb = bt_skb_alloc(4, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE);
+ if (hci_pi(sk)->hdev)
+ hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id);
+ else
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index,
+ u16 opcode, u16 len,
+ const void *buf)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
+ put_unaligned_le16(opcode, skb_put(skb, 2));
+
+ if (buf)
+ memcpy(skb_put(skb, len), buf, len);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND);
+ hdr->index = cpu_to_le16(index);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
static void __printf(2, 3)
send_monitor_note(struct sock *sk, const char *fmt, ...)
{
@@ -458,6 +672,26 @@ static void send_monitor_replay(struct sock *sk)
read_unlock(&hci_dev_list_lock);
}
+static void send_monitor_control_replay(struct sock *mon_sk)
+{
+ struct sock *sk;
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct sk_buff *skb;
+
+ skb = create_monitor_ctrl_open(sk);
+ if (!skb)
+ continue;
+
+ if (sock_queue_rcv_skb(mon_sk, skb))
+ kfree_skb(skb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+}
+
/* Generate internal stack event */
static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
{
@@ -585,6 +819,7 @@ static int hci_sock_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct hci_dev *hdev;
+ struct sk_buff *skb;
BT_DBG("sock %p sk %p", sock, sk);
@@ -593,8 +828,24 @@ static int hci_sock_release(struct socket *sock)
hdev = hci_pi(sk)->hdev;
- if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR)
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_MONITOR:
atomic_dec(&monitor_promisc);
+ break;
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_CONTROL:
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ hci_sock_free_cookie(sk);
+ break;
+ }
bt_sock_unlink(&hci_sk_list, sk);
@@ -721,6 +972,27 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd,
goto done;
}
+ /* When calling an ioctl on an unbound raw socket, then ensure
+ * that the monitor gets informed. Ensure that the resulting event
+ * is only send once by checking if the cookie exists or not. The
+ * socket cookie will be only ever generated once for the lifetime
+ * of a given socket.
+ */
+ if (hci_sock_gen_cookie(sk)) {
+ struct sk_buff *skb;
+
+ if (capable(CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
release_sock(sk);
switch (cmd) {
@@ -784,6 +1056,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
struct sockaddr_hci haddr;
struct sock *sk = sock->sk;
struct hci_dev *hdev = NULL;
+ struct sk_buff *skb;
int len, err = 0;
BT_DBG("sock %p sk %p", sock, sk);
@@ -822,7 +1095,35 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
atomic_inc(&hdev->promisc);
}
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been assigned,
+ * then there has been already an ioctl issued against
+ * an unbound socket and with that triggerd an open
+ * notification. Send a close notification first to
+ * allow the state transition to bounded.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ if (capable(CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
hci_pi(sk)->hdev = hdev;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
break;
case HCI_CHANNEL_USER:
@@ -884,9 +1185,38 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
}
}
- atomic_inc(&hdev->promisc);
+ hci_pi(sk)->channel = haddr.hci_channel;
+
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been assigned,
+ * this socket will transition from a raw socket into
+ * an user channel socket. For a clean transition, send
+ * the close notification first.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ /* The user channel is restricted to CAP_NET_ADMIN
+ * capabilities and with that implicitly trusted.
+ */
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
hci_pi(sk)->hdev = hdev;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
+ atomic_inc(&hdev->promisc);
break;
case HCI_CHANNEL_MONITOR:
@@ -900,6 +1230,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
goto done;
}
+ hci_pi(sk)->channel = haddr.hci_channel;
+
/* The monitor interface is restricted to CAP_NET_RAW
* capabilities and with that implicitly trusted.
*/
@@ -908,9 +1240,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
send_monitor_note(sk, "Linux version %s (%s)",
init_utsname()->release,
init_utsname()->machine);
- send_monitor_note(sk, "Bluetooth subsystem version %s",
- BT_SUBSYS_VERSION);
+ send_monitor_note(sk, "Bluetooth subsystem version %u.%u",
+ BT_SUBSYS_VERSION, BT_SUBSYS_REVISION);
send_monitor_replay(sk);
+ send_monitor_control_replay(sk);
atomic_inc(&monitor_promisc);
break;
@@ -925,6 +1258,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
err = -EPERM;
goto done;
}
+
+ hci_pi(sk)->channel = haddr.hci_channel;
break;
default:
@@ -946,6 +1281,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
if (capable(CAP_NET_ADMIN))
hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+ hci_pi(sk)->channel = haddr.hci_channel;
+
/* At the moment the index and unconfigured index events
* are enabled unconditionally. Setting them on each
* socket when binding keeps this functionality. They
@@ -956,16 +1293,40 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
* received by untrusted users. Example for such events
* are changes to settings, class of device, name etc.
*/
- if (haddr.hci_channel == HCI_CHANNEL_CONTROL) {
+ if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) {
+ if (!hci_sock_gen_cookie(sk)) {
+ /* In the case when a cookie has already been
+ * assigned, this socket will transtion from
+ * a raw socket into a control socket. To
+ * allow for a clean transtion, send the
+ * close notification first.
+ */
+ skb = create_monitor_ctrl_close(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_open(sk);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+
hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS);
hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS);
- hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
}
break;
}
-
- hci_pi(sk)->channel = haddr.hci_channel;
sk->sk_state = BT_BOUND;
done:
@@ -1133,6 +1494,19 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
goto done;
}
+ if (chan->channel == HCI_CHANNEL_CONTROL) {
+ struct sk_buff *skb;
+
+ /* Send event to monitor */
+ skb = create_monitor_ctrl_command(sk, index, opcode, len,
+ buf + sizeof(*hdr));
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
if (opcode >= chan->handler_count ||
chan->handlers[opcode].func == NULL) {
BT_DBG("Unknown op %u", opcode);
@@ -1440,6 +1814,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
BT_DBG("sk %p, opt %d", sk, optname);
+ if (level != SOL_HCI)
+ return -ENOPROTOOPT;
+
lock_sock(sk);
if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
@@ -1523,6 +1900,9 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
BT_DBG("sk %p, opt %d", sk, optname);
+ if (level != SOL_HCI)
+ return -ENOPROTOOPT;
+
if (get_user(len, optlen))
return -EFAULT;
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
index 8319c8440c89..cb670b5594eb 100644
--- a/net/bluetooth/leds.c
+++ b/net/bluetooth/leds.c
@@ -11,6 +11,8 @@
#include "leds.h"
+DEFINE_LED_TRIGGER(bt_power_led_trigger);
+
struct hci_basic_led_trigger {
struct led_trigger led_trigger;
struct hci_dev *hdev;
@@ -24,6 +26,21 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
if (hdev->power_led)
led_trigger_event(hdev->power_led,
enabled ? LED_FULL : LED_OFF);
+
+ if (!enabled) {
+ struct hci_dev *d;
+
+ read_lock(&hci_dev_list_lock);
+
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (test_bit(HCI_UP, &d->flags))
+ enabled = true;
+ }
+
+ read_unlock(&hci_dev_list_lock);
+ }
+
+ led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF);
}
static void power_activate(struct led_classdev *led_cdev)
@@ -72,3 +89,13 @@ void hci_leds_init(struct hci_dev *hdev)
/* initialize power_led */
hdev->power_led = led_allocate_basic(hdev, power_activate, "power");
}
+
+void bt_leds_init(void)
+{
+ led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger);
+}
+
+void bt_leds_cleanup(void)
+{
+ led_trigger_unregister_simple(bt_power_led_trigger);
+}
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
index a9c4d6ea01cf..08725a2fbd9b 100644
--- a/net/bluetooth/leds.h
+++ b/net/bluetooth/leds.h
@@ -7,10 +7,20 @@
*/
#if IS_ENABLED(CONFIG_BT_LEDS)
+
void hci_leds_update_powered(struct hci_dev *hdev, bool enabled);
void hci_leds_init(struct hci_dev *hdev);
+
+void bt_leds_init(void);
+void bt_leds_cleanup(void);
+
#else
+
static inline void hci_leds_update_powered(struct hci_dev *hdev,
bool enabled) {}
static inline void hci_leds_init(struct hci_dev *hdev) {}
+
+static inline void bt_leds_init(void) {}
+static inline void bt_leds_cleanup(void) {}
+
#endif
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 7639290b6de3..7b2bac492fb1 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
#include "mgmt_util.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 13
+#define MGMT_REVISION 14
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -104,6 +104,8 @@ static const u16 mgmt_commands[] = {
MGMT_OP_REMOVE_ADVERTISING,
MGMT_OP_GET_ADV_SIZE_INFO,
MGMT_OP_START_LIMITED_DISCOVERY,
+ MGMT_OP_READ_EXT_INFO,
+ MGMT_OP_SET_APPEARANCE,
};
static const u16 mgmt_events[] = {
@@ -141,6 +143,7 @@ static const u16 mgmt_events[] = {
MGMT_EV_LOCAL_OOB_DATA_UPDATED,
MGMT_EV_ADVERTISING_ADDED,
MGMT_EV_ADVERTISING_REMOVED,
+ MGMT_EV_EXT_INFO_CHANGED,
};
static const u16 mgmt_untrusted_commands[] = {
@@ -149,6 +152,7 @@ static const u16 mgmt_untrusted_commands[] = {
MGMT_OP_READ_UNCONF_INDEX_LIST,
MGMT_OP_READ_CONFIG_INFO,
MGMT_OP_READ_EXT_INDEX_LIST,
+ MGMT_OP_READ_EXT_INFO,
};
static const u16 mgmt_untrusted_events[] = {
@@ -162,6 +166,7 @@ static const u16 mgmt_untrusted_events[] = {
MGMT_EV_NEW_CONFIG_OPTIONS,
MGMT_EV_EXT_INDEX_ADDED,
MGMT_EV_EXT_INDEX_REMOVED,
+ MGMT_EV_EXT_INFO_CHANGED,
};
#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000)
@@ -256,13 +261,6 @@ static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data,
flag, skip_sk);
}
-static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data,
- u16 len, struct sock *skip_sk)
-{
- return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
- HCI_MGMT_GENERIC_EVENTS, skip_sk);
-}
-
static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len,
struct sock *skip_sk)
{
@@ -278,6 +276,14 @@ static u8 le_addr_type(u8 mgmt_addr_type)
return ADDR_LE_DEV_RANDOM;
}
+void mgmt_fill_version_info(void *ver)
+{
+ struct mgmt_rp_read_version *rp = ver;
+
+ rp->version = MGMT_VERSION;
+ rp->revision = cpu_to_le16(MGMT_REVISION);
+}
+
static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
u16 data_len)
{
@@ -285,8 +291,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
BT_DBG("sock %p", sk);
- rp.version = MGMT_VERSION;
- rp.revision = cpu_to_le16(MGMT_REVISION);
+ mgmt_fill_version_info(&rp);
return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0,
&rp, sizeof(rp));
@@ -572,8 +577,8 @@ static int new_options(struct hci_dev *hdev, struct sock *skip)
{
__le32 options = get_missing_options(hdev);
- return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options,
- sizeof(options), skip);
+ return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options,
+ sizeof(options), HCI_MGMT_OPTION_EVENTS, skip);
}
static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
@@ -862,6 +867,107 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
sizeof(rp));
}
+static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data,
+ u8 data_len)
+{
+ eir[eir_len++] = sizeof(type) + data_len;
+ eir[eir_len++] = type;
+ memcpy(&eir[eir_len], data, data_len);
+ eir_len += data_len;
+
+ return eir_len;
+}
+
+static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data)
+{
+ eir[eir_len++] = sizeof(type) + sizeof(data);
+ eir[eir_len++] = type;
+ put_unaligned_le16(data, &eir[eir_len]);
+ eir_len += sizeof(data);
+
+ return eir_len;
+}
+
+static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir)
+{
+ u16 eir_len = 0;
+ size_t name_len;
+
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV,
+ hdev->dev_class, 3);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE,
+ hdev->appearance);
+
+ name_len = strlen(hdev->dev_name);
+ eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE,
+ hdev->dev_name, name_len);
+
+ name_len = strlen(hdev->short_name);
+ eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT,
+ hdev->short_name, name_len);
+
+ return eir_len;
+}
+
+static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ char buf[512];
+ struct mgmt_rp_read_ext_info *rp = (void *)buf;
+ u16 eir_len;
+
+ BT_DBG("sock %p %s", sk, hdev->name);
+
+ memset(&buf, 0, sizeof(buf));
+
+ hci_dev_lock(hdev);
+
+ bacpy(&rp->bdaddr, &hdev->bdaddr);
+
+ rp->version = hdev->hci_ver;
+ rp->manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ rp->supported_settings = cpu_to_le32(get_supported_settings(hdev));
+ rp->current_settings = cpu_to_le32(get_current_settings(hdev));
+
+
+ eir_len = append_eir_data_to_buf(hdev, rp->eir);
+ rp->eir_len = cpu_to_le16(eir_len);
+
+ hci_dev_unlock(hdev);
+
+ /* If this command is called at least once, then the events
+ * for class of device and local name changes are disabled
+ * and only the new extended controller information event
+ * is used.
+ */
+ hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, rp,
+ sizeof(*rp) + eir_len);
+}
+
+static int ext_info_changed(struct hci_dev *hdev, struct sock *skip)
+{
+ char buf[512];
+ struct mgmt_ev_ext_info_changed *ev = (void *)buf;
+ u16 eir_len;
+
+ memset(buf, 0, sizeof(buf));
+
+ eir_len = append_eir_data_to_buf(hdev, ev->eir);
+ ev->eir_len = cpu_to_le16(eir_len);
+
+ return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, ev,
+ sizeof(*ev) + eir_len,
+ HCI_MGMT_EXT_INFO_EVENTS, skip);
+}
+
static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
{
__le32 settings = cpu_to_le32(get_current_settings(hdev));
@@ -922,7 +1028,7 @@ static int clean_up_hci_state(struct hci_dev *hdev)
hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
}
- hci_req_clear_adv_instance(hdev, NULL, 0x00, false);
+ hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, false);
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
__hci_req_disable_advertising(&req);
@@ -1000,8 +1106,8 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip)
{
__le32 ev = cpu_to_le32(get_current_settings(hdev));
- return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
- sizeof(ev), skip);
+ return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
+ sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip);
}
int mgmt_new_settings(struct hci_dev *hdev)
@@ -1690,7 +1796,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
enabled = lmp_host_le_capable(hdev);
if (!val)
- hci_req_clear_adv_instance(hdev, NULL, 0x00, true);
+ hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true);
if (!hdev_is_powered(hdev) || val == enabled) {
bool changed = false;
@@ -2513,8 +2619,8 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data,
BT_DBG("");
if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY)
- return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
- MGMT_STATUS_INVALID_PARAMS, NULL, 0);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
+ MGMT_STATUS_INVALID_PARAMS);
hci_dev_lock(hdev);
@@ -2932,6 +3038,35 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev,
HCI_OP_USER_PASSKEY_NEG_REPLY, 0);
}
+static void adv_expire(struct hci_dev *hdev, u32 flags)
+{
+ struct adv_info *adv_instance;
+ struct hci_request req;
+ int err;
+
+ adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
+ if (!adv_instance)
+ return;
+
+ /* stop if current instance doesn't need to be changed */
+ if (!(adv_instance->flags & flags))
+ return;
+
+ cancel_adv_timeout(hdev);
+
+ adv_instance = hci_get_next_instance(hdev, adv_instance->instance);
+ if (!adv_instance)
+ return;
+
+ hci_req_init(&req, hdev);
+ err = __hci_req_schedule_adv_instance(&req, adv_instance->instance,
+ true);
+ if (err)
+ return;
+
+ hci_req_run(&req, NULL);
+}
+
static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
{
struct mgmt_cp_set_local_name *cp;
@@ -2947,13 +3082,17 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
cp = cmd->param;
- if (status)
+ if (status) {
mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
mgmt_status(status));
- else
+ } else {
mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
cp, sizeof(*cp));
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ adv_expire(hdev, MGMT_ADV_FLAG_LOCAL_NAME);
+ }
+
mgmt_pending_remove(cmd);
unlock:
@@ -2993,8 +3132,9 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
if (err < 0)
goto failed;
- err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev,
- data, len, sk);
+ err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data,
+ len, HCI_MGMT_LOCAL_NAME_EVENTS, sk);
+ ext_info_changed(hdev, sk);
goto failed;
}
@@ -3029,6 +3169,40 @@ failed:
return err;
}
+static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_appearance *cp = data;
+ u16 apperance;
+ int err;
+
+ BT_DBG("");
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ apperance = le16_to_cpu(cp->appearance);
+
+ hci_dev_lock(hdev);
+
+ if (hdev->appearance != apperance) {
+ hdev->appearance = apperance;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE);
+
+ ext_info_changed(hdev, sk);
+ }
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL,
+ 0);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -4869,7 +5043,7 @@ static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
int err;
memset(&rp, 0, sizeof(rp));
- memcpy(&rp.addr, &cmd->param, sizeof(rp.addr));
+ memcpy(&rp.addr, cmd->param, sizeof(rp.addr));
if (status)
goto complete;
@@ -5501,17 +5675,6 @@ unlock:
return err;
}
-static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data,
- u8 data_len)
-{
- eir[eir_len++] = sizeof(type) + data_len;
- eir[eir_len++] = type;
- memcpy(&eir[eir_len], data, data_len);
- eir_len += data_len;
-
- return eir_len;
-}
-
static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -5815,6 +5978,8 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
flags |= MGMT_ADV_FLAG_DISCOV;
flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
flags |= MGMT_ADV_FLAG_MANAGED_FLAGS;
+ flags |= MGMT_ADV_FLAG_APPEARANCE;
+ flags |= MGMT_ADV_FLAG_LOCAL_NAME;
if (hdev->adv_tx_power != HCI_TX_POWER_INVALID)
flags |= MGMT_ADV_FLAG_TX_POWER;
@@ -5871,28 +6036,59 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
return err;
}
-static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
- u8 len, bool is_adv_data)
+static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data)
{
u8 max_len = HCI_MAX_AD_LENGTH;
- int i, cur_len;
- bool flags_managed = false;
- bool tx_power_managed = false;
if (is_adv_data) {
if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
MGMT_ADV_FLAG_LIMITED_DISCOV |
- MGMT_ADV_FLAG_MANAGED_FLAGS)) {
- flags_managed = true;
+ MGMT_ADV_FLAG_MANAGED_FLAGS))
max_len -= 3;
- }
- if (adv_flags & MGMT_ADV_FLAG_TX_POWER) {
- tx_power_managed = true;
+ if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
max_len -= 3;
- }
+ } else {
+ /* at least 1 byte of name should fit in */
+ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ max_len -= 3;
+
+ if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE))
+ max_len -= 4;
}
+ return max_len;
+}
+
+static bool flags_managed(u32 adv_flags)
+{
+ return adv_flags & (MGMT_ADV_FLAG_DISCOV |
+ MGMT_ADV_FLAG_LIMITED_DISCOV |
+ MGMT_ADV_FLAG_MANAGED_FLAGS);
+}
+
+static bool tx_power_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_TX_POWER;
+}
+
+static bool name_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_LOCAL_NAME;
+}
+
+static bool appearance_managed(u32 adv_flags)
+{
+ return adv_flags & MGMT_ADV_FLAG_APPEARANCE;
+}
+
+static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data)
+{
+ int i, cur_len;
+ u8 max_len;
+
+ max_len = tlv_data_max_len(adv_flags, is_adv_data);
+
if (len > max_len)
return false;
@@ -5900,10 +6096,21 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) {
cur_len = data[i];
- if (flags_managed && data[i + 1] == EIR_FLAGS)
+ if (data[i + 1] == EIR_FLAGS &&
+ (!is_adv_data || flags_managed(adv_flags)))
+ return false;
+
+ if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags))
+ return false;
+
+ if (data[i + 1] == EIR_NAME_COMPLETE && name_managed(adv_flags))
return false;
- if (tx_power_managed && data[i + 1] == EIR_TX_POWER)
+ if (data[i + 1] == EIR_NAME_SHORT && name_managed(adv_flags))
+ return false;
+
+ if (data[i + 1] == EIR_APPEARANCE &&
+ appearance_managed(adv_flags))
return false;
/* If the current field length would exceed the total data
@@ -6027,8 +6234,8 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) ||
- !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len,
+ if (!tlv_data_is_valid(flags, cp->data, cp->adv_data_len, true) ||
+ !tlv_data_is_valid(flags, cp->data + cp->adv_data_len,
cp->scan_rsp_len, false)) {
err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
@@ -6175,7 +6382,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
hci_req_init(&req, hdev);
- hci_req_clear_adv_instance(hdev, &req, cp->instance, true);
+ hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true);
if (list_empty(&hdev->adv_instances))
__hci_req_disable_advertising(&req);
@@ -6211,23 +6418,6 @@ unlock:
return err;
}
-static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data)
-{
- u8 max_len = HCI_MAX_AD_LENGTH;
-
- if (is_adv_data) {
- if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
- MGMT_ADV_FLAG_LIMITED_DISCOV |
- MGMT_ADV_FLAG_MANAGED_FLAGS))
- max_len -= 3;
-
- if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
- max_len -= 3;
- }
-
- return max_len;
-}
-
static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
void *data, u16 data_len)
{
@@ -6356,6 +6546,9 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE },
{ get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE },
{ start_limited_discovery, MGMT_START_DISCOVERY_SIZE },
+ { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_appearance, MGMT_SET_APPEARANCE_SIZE },
};
void mgmt_index_added(struct hci_dev *hdev)
@@ -6494,9 +6687,12 @@ void __mgmt_power_off(struct hci_dev *hdev)
mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
- if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0)
- mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
- zero_cod, sizeof(zero_cod), NULL);
+ if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) {
+ mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
+ zero_cod, sizeof(zero_cod),
+ HCI_MGMT_DEV_CLASS_EVENTS, NULL);
+ ext_info_changed(hdev, NULL);
+ }
new_settings(hdev, match.sk);
@@ -7092,9 +7288,11 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match);
mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match);
- if (!status)
- mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
- dev_class, 3, NULL);
+ if (!status) {
+ mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class,
+ 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL);
+ ext_info_changed(hdev, NULL);
+ }
if (match.sk)
sock_put(match.sk);
@@ -7123,8 +7321,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status)
return;
}
- mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev),
- cmd ? cmd->sk : NULL);
+ mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev),
+ HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL);
+ ext_info_changed(hdev, cmd ? cmd->sk : NULL);
}
static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16])
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
index 8c30c7eb8bef..c933bd08c1fe 100644
--- a/net/bluetooth/mgmt_util.c
+++ b/net/bluetooth/mgmt_util.c
@@ -21,12 +21,41 @@
SOFTWARE IS DISCLAIMED.
*/
+#include <asm/unaligned.h>
+
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_mon.h>
#include <net/bluetooth/mgmt.h>
#include "mgmt_util.h"
+static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie,
+ u16 opcode, u16 len, void *buf)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(6 + len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ put_unaligned_le32(cookie, skb_put(skb, 4));
+ put_unaligned_le16(opcode, skb_put(skb, 2));
+
+ if (buf)
+ memcpy(skb_put(skb, len), buf, len);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT);
+ hdr->index = index;
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
void *data, u16 data_len, int flag, struct sock *skip_sk)
{
@@ -52,14 +81,18 @@ int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
__net_timestamp(skb);
hci_send_to_channel(channel, skb, flag, skip_sk);
- kfree_skb(skb);
+ if (channel == HCI_CHANNEL_CONTROL)
+ hci_send_monitor_ctrl_event(hdev, event, data, data_len,
+ skb_get_ktime(skb), flag, skip_sk);
+
+ kfree_skb(skb);
return 0;
}
int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *mskb;
struct mgmt_hdr *hdr;
struct mgmt_ev_cmd_status *ev;
int err;
@@ -80,17 +113,30 @@ int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
ev->status = status;
ev->opcode = cpu_to_le16(cmd);
+ mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
+ MGMT_EV_CMD_STATUS, sizeof(*ev), ev);
+ if (mskb)
+ skb->tstamp = mskb->tstamp;
+ else
+ __net_timestamp(skb);
+
err = sock_queue_rcv_skb(sk, skb);
if (err < 0)
kfree_skb(skb);
+ if (mskb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(mskb);
+ }
+
return err;
}
int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
void *rp, size_t rp_len)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *mskb;
struct mgmt_hdr *hdr;
struct mgmt_ev_cmd_complete *ev;
int err;
@@ -114,10 +160,24 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
if (rp)
memcpy(ev->data, rp, rp_len);
+ mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk),
+ MGMT_EV_CMD_COMPLETE,
+ sizeof(*ev) + rp_len, ev);
+ if (mskb)
+ skb->tstamp = mskb->tstamp;
+ else
+ __net_timestamp(skb);
+
err = sock_queue_rcv_skb(sk, skb);
if (err < 0)
kfree_skb(skb);
+ if (mskb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(mskb);
+ }
+
return err;
}
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 4c1a16a96ae5..43faf2aea2ab 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -3387,7 +3387,10 @@ int smp_register(struct hci_dev *hdev)
if (!lmp_sc_capable(hdev)) {
debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
hdev, &force_bredr_smp_fops);
- return 0;
+
+ /* Flag can be already set here (due to power toggle) */
+ if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return 0;
}
if (WARN_ON(hdev->smp_bredr_data)) {
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 8a4368461fb0..855b72fbe1da 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -80,13 +80,10 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
- if (dev->flags & IFF_NOARP)
+ if ((dev->flags & IFF_NOARP) ||
+ !pskb_may_pull(skb, arp_hdr_len(dev)))
return;
- if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
- dev->stats.tx_dropped++;
- return;
- }
parp = arp_hdr(skb);
if (parp->ar_pro != htons(ETH_P_IP) ||
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index a5423a1eec05..c5fea9393946 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1138,7 +1138,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
} else {
err = br_ip6_multicast_add_group(br, port,
&grec->grec_mca, vid);
- if (!err)
+ if (err)
break;
}
}
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 341caa0ca63a..d8ad73b38de2 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -134,17 +134,36 @@ void br_stp_disable_port(struct net_bridge_port *p)
br_become_root_bridge(br);
}
-static void br_stp_start(struct net_bridge *br)
+static int br_stp_call_user(struct net_bridge *br, char *arg)
{
- int r;
- char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
+ char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL };
char *envp[] = { NULL };
+ int rc;
+
+ /* call userspace STP and report program errors */
+ rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ if (rc > 0) {
+ if (rc & 0xff)
+ br_debug(br, BR_STP_PROG " received signal %d\n",
+ rc & 0x7f);
+ else
+ br_debug(br, BR_STP_PROG " exited with code %d\n",
+ (rc >> 8) & 0xff);
+ }
+
+ return rc;
+}
+
+static void br_stp_start(struct net_bridge *br)
+{
struct net_bridge_port *p;
+ int err = -ENOENT;
if (net_eq(dev_net(br->dev), &init_net))
- r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
- else
- r = -ENOENT;
+ err = br_stp_call_user(br, "start");
+
+ if (err && err != -ENOENT)
+ br_err(br, "failed to start userspace STP (%d)\n", err);
spin_lock_bh(&br->lock);
@@ -153,9 +172,10 @@ static void br_stp_start(struct net_bridge *br)
else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY)
__br_set_forward_delay(br, BR_MAX_FORWARD_DELAY);
- if (r == 0) {
+ if (!err) {
br->stp_enabled = BR_USER_STP;
br_debug(br, "userspace STP started\n");
+
/* Stop hello and hold timers */
del_timer(&br->hello_timer);
list_for_each_entry(p, &br->port_list, list)
@@ -173,14 +193,13 @@ static void br_stp_start(struct net_bridge *br)
static void br_stp_stop(struct net_bridge *br)
{
- int r;
- char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL };
- char *envp[] = { NULL };
struct net_bridge_port *p;
+ int err;
if (br->stp_enabled == BR_USER_STP) {
- r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
- br_info(br, "userspace STP stopped, return code %d\n", r);
+ err = br_stp_call_user(br, "stop");
+ if (err)
+ br_err(br, "failed to stop userspace STP (%d)\n", err);
/* To start timers on any ports left in blocking */
mod_timer(&br->hello_timer, jiffies + br->hello_time);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index dd7133216c9c..f5c11bbe27db 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -368,6 +368,8 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) {
+ if (!IS_ERR(match))
+ module_put(match->me);
request_module("ebt_%s", m->u.name);
match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
}
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index 4b901d9f2e7c..ad47a921b701 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -86,6 +86,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = {
.init = nft_meta_set_init,
.destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
+ .validate = nft_meta_set_validate,
};
static const struct nft_expr_ops *
diff --git a/net/core/dev.c b/net/core/dev.c
index 064919425b7d..c0c291f721d6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3904,8 +3904,7 @@ static void net_tx_action(struct softirq_action *h)
}
}
-#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
- (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
+#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
unsigned char *addr) __read_mostly;
@@ -3965,6 +3964,22 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
}
/**
+ * netdev_is_rx_handler_busy - check if receive handler is registered
+ * @dev: device to check
+ *
+ * Check if a receive handler is already registered for a given device.
+ * Return true if there one.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+bool netdev_is_rx_handler_busy(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ return dev && rtnl_dereference(dev->rx_handler);
+}
+EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
+
+/**
* netdev_rx_handler_register - register receive handler
* @dev: device to register a handler for
* @rx_handler: receive handler to register
diff --git a/net/core/filter.c b/net/core/filter.c
index a83766be1ad2..00351cdf7d0c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -94,14 +94,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
}
EXPORT_SYMBOL(sk_filter_trim_cap);
-static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
{
- return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+ return skb_get_poff(skb);
}
-static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
- struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
struct nlattr *nla;
if (skb_is_nonlinear(skb))
@@ -120,9 +119,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return 0;
}
-static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
- struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
struct nlattr *nla;
if (skb_is_nonlinear(skb))
@@ -145,7 +143,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return 0;
}
-static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_0(__get_raw_cpu_id)
{
return raw_smp_processor_id();
}
@@ -233,9 +231,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
case SKF_AD_OFF + SKF_AD_HATYPE:
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
- BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
BPF_REG_TMP, BPF_REG_CTX,
offsetof(struct sk_buff, dev));
/* if (tmp != 0) goto pc + 1 */
@@ -1365,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
return err;
}
+static int bpf_try_make_head_writable(struct sk_buff *skb)
+{
+ return bpf_try_make_writable(skb, skb_headlen(skb));
+}
+
static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
if (skb_at_tc_ingress(skb))
@@ -1377,12 +1379,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}
-static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
+BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
+ const void *, from, u32, len, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- unsigned int offset = (unsigned int) r2;
- void *from = (void *) (long) r3;
- unsigned int len = (unsigned int) r4;
void *ptr;
if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
@@ -1417,12 +1416,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.arg5_type = ARG_ANYTHING,
};
-static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
+ void *, to, u32, len)
{
- const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
- unsigned int offset = (unsigned int) r2;
- void *to = (void *)(unsigned long) r3;
- unsigned int len = (unsigned int) r4;
void *ptr;
if (unlikely(offset > 0xffff))
@@ -1450,10 +1446,31 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_STACK_SIZE,
};
-static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+ /* Idea is the following: should the needed direct read/write
+ * test fail during runtime, we can pull in more data and redo
+ * again, since implicitly, we invalidate previous checks here.
+ *
+ * Or, since we know how much we need to make read/writeable,
+ * this can be done once at the program beginning for direct
+ * access case. By this we overcome limitations of only current
+ * headroom being accessible.
+ */
+ return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto bpf_skb_pull_data_proto = {
+ .func = bpf_skb_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- unsigned int offset = (unsigned int) r2;
__sum16 *ptr;
if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
@@ -1495,12 +1512,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
-static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
- unsigned int offset = (unsigned int) r2;
__sum16 *ptr;
if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
@@ -1548,12 +1564,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
-static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
+ __be32 *, to, u32, to_size, __wsum, seed)
{
struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
- u64 diff_size = from_size + to_size;
- __be32 *from = (__be32 *) (long) r1;
- __be32 *to = (__be32 *) (long) r3;
+ u32 diff_size = from_size + to_size;
int i, j = 0;
/* This is quite flexible, some examples:
@@ -1579,6 +1594,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
static const struct bpf_func_proto bpf_csum_diff_proto = {
.func = bpf_csum_diff,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_STACK,
.arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
@@ -1587,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.arg5_type = ARG_ANYTHING,
};
+BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
+{
+ /* The interface is to be used in combination with bpf_csum_diff()
+ * for direct packet writes. csum rotation for alignment as well
+ * as emulating csum_sub() can be done from the eBPF program.
+ */
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ return (skb->csum = csum_add(skb->csum, csum));
+
+ return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_csum_update_proto = {
+ .func = bpf_csum_update,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
return dev_forward_skb(dev, skb);
@@ -1611,10 +1647,11 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
return ret;
}
-static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
+BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
struct net_device *dev;
+ struct sk_buff *clone;
+ int ret;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return -EINVAL;
@@ -1623,14 +1660,25 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
if (unlikely(!dev))
return -EINVAL;
- skb = skb_clone(skb, GFP_ATOMIC);
- if (unlikely(!skb))
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!clone))
return -ENOMEM;
- bpf_push_mac_rcsum(skb);
+ /* For direct write, we need to keep the invariant that the skbs
+ * we're dealing with need to be uncloned. Should uncloning fail
+ * here, we need to free the just generated clone to unclone once
+ * again.
+ */
+ ret = bpf_try_make_head_writable(skb);
+ if (unlikely(ret)) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
+
+ bpf_push_mac_rcsum(clone);
return flags & BPF_F_INGRESS ?
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+ __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone);
}
static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -1649,7 +1697,7 @@ struct redirect_info {
static DEFINE_PER_CPU(struct redirect_info, redirect_info);
-static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
@@ -1688,9 +1736,9 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
-static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
- return task_get_classid((struct sk_buff *) (unsigned long) r1);
+ return task_get_classid(skb);
}
static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
@@ -1700,9 +1748,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
{
- return dst_tclassid((struct sk_buff *) (unsigned long) r1);
+ return dst_tclassid(skb);
}
static const struct bpf_func_proto bpf_get_route_realm_proto = {
@@ -1712,14 +1760,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
/* If skb_clear_hash() was called due to mangling, we can
* trigger SW recalculation here. Later access to hash
* can then use the inline skb->hash via context directly
* instead of calling this helper again.
*/
- return skb_get_hash((struct sk_buff *) (unsigned long) r1);
+ return skb_get_hash(skb);
}
static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
@@ -1729,10 +1777,25 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
+BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
+{
+ /* After all direct packet write, this can be used once for
+ * triggering a lazy recalc on next skb_get_hash() invocation.
+ */
+ skb_clear_hash(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
+ .func = bpf_set_hash_invalid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
+ u16, vlan_tci)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- __be16 vlan_proto = (__force __be16) r2;
int ret;
if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
@@ -1757,9 +1820,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
-static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
int ret;
bpf_push_mac_rcsum(skb);
@@ -1934,10 +1996,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
return -ENOTSUPP;
}
-static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
+ u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- __be16 proto = (__force __be16) r2;
int ret;
if (unlikely(flags))
@@ -1974,11 +2035,8 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = {
.arg3_type = ARG_ANYTHING,
};
-static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u32 pkt_type = r2;
-
/* We only allow a restricted subset to be changed for now. */
if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
!skb_pkt_type_ok(pkt_type)))
@@ -2010,8 +2068,7 @@ static u32 __bpf_skb_min_len(const struct sk_buff *skb)
static u32 __bpf_skb_max_len(const struct sk_buff *skb)
{
- return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
- 65536;
+ return skb->dev->mtu + skb->dev->hard_header_len;
}
static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
@@ -2030,12 +2087,11 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
return __skb_trim_rcsum(skb, new_len);
}
-static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
+ u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *)(long) r1;
u32 max_len = __bpf_skb_max_len(skb);
u32 min_len = __bpf_skb_min_len(skb);
- u32 new_len = (u32) r2;
int ret;
if (unlikely(flags || new_len > max_len || new_len < min_len))
@@ -2084,19 +2140,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
bool bpf_helper_changes_skb_data(void *func)
{
- if (func == bpf_skb_vlan_push)
- return true;
- if (func == bpf_skb_vlan_pop)
- return true;
- if (func == bpf_skb_store_bytes)
- return true;
- if (func == bpf_skb_change_proto)
- return true;
- if (func == bpf_skb_change_tail)
- return true;
- if (func == bpf_l3_csum_replace)
- return true;
- if (func == bpf_l4_csum_replace)
+ if (func == bpf_skb_vlan_push ||
+ func == bpf_skb_vlan_pop ||
+ func == bpf_skb_store_bytes ||
+ func == bpf_skb_change_proto ||
+ func == bpf_skb_change_tail ||
+ func == bpf_skb_pull_data ||
+ func == bpf_l3_csum_replace ||
+ func == bpf_l4_csum_replace)
return true;
return false;
@@ -2115,13 +2166,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
return 0;
}
-static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
- u64 meta_size)
+BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
{
- struct sk_buff *skb = (struct sk_buff *)(long) r1;
- struct bpf_map *map = (struct bpf_map *)(long) r2;
u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
- void *meta = (void *)(long) r4;
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -2148,10 +2196,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags)
return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}
-static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
+ u32, size, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
u8 compat[sizeof(struct bpf_tunnel_key)];
void *to_orig = to;
@@ -2216,10 +2263,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u8 *to = (u8 *) (long) r2;
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
int err;
@@ -2254,10 +2299,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
static struct metadata_dst __percpu *md_dst;
-static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
+ const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
struct metadata_dst *md = this_cpu_ptr(md_dst);
u8 compat[sizeof(struct bpf_tunnel_key)];
struct ip_tunnel_info *info;
@@ -2275,7 +2319,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
*/
memcpy(compat, from, size);
memset(compat + size, 0, sizeof(compat) - size);
- from = (struct bpf_tunnel_key *)compat;
+ from = (const struct bpf_tunnel_key *) compat;
break;
default:
return -EINVAL;
@@ -2325,10 +2369,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
+ const u8 *, from, u32, size)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u8 *from = (u8 *) (long) r2;
struct ip_tunnel_info *info = skb_tunnel_info(skb);
const struct metadata_dst *md = this_cpu_ptr(md_dst);
@@ -2374,23 +2417,20 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
}
}
-static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
+ u32, idx)
{
- struct sk_buff *skb = (struct sk_buff *)(long)r1;
- struct bpf_map *map = (struct bpf_map *)(long)r2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct cgroup *cgrp;
struct sock *sk;
- u32 i = (u32)r3;
- sk = skb->sk;
+ sk = skb_to_full_sk(skb);
if (!sk || !sk_fullsock(sk))
return -ENOENT;
-
- if (unlikely(i >= array->map.max_entries))
+ if (unlikely(idx >= array->map.max_entries))
return -E2BIG;
- cgrp = READ_ONCE(array->ptrs[i]);
+ cgrp = READ_ONCE(array->ptrs[idx]);
if (unlikely(!cgrp))
return -EAGAIN;
@@ -2413,13 +2453,10 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
return 0;
}
-static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
- u64 meta_size)
+BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
{
- struct xdp_buff *xdp = (struct xdp_buff *)(long) r1;
- struct bpf_map *map = (struct bpf_map *)(long) r2;
u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
- void *meta = (void *)(long) r4;
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -2475,8 +2512,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
case BPF_FUNC_csum_diff:
return &bpf_csum_diff_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -2509,6 +2550,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
@@ -2526,6 +2569,8 @@ xdp_func_proto(enum bpf_func_id func_id)
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_xdp_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
default:
return sk_filter_func_proto(func_id);
}
@@ -2568,6 +2613,45 @@ static bool sk_filter_is_valid_access(int off, int size,
return __is_valid_access(off, size, type);
}
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!direct_write)
+ return 0;
+
+ /* if (!skb->cloned)
+ * goto start;
+ *
+ * (Fast-path, otherwise approximation that we might be
+ * a clone, do the rest in helper.)
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
+
+ /* ret = bpf_skb_pull_data(skb, 0); */
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
+ *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_pull_data);
+ /* if (!ret)
+ * goto restore;
+ * return TC_ACT_SHOT;
+ */
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
+ *insn++ = BPF_EXIT_INSN();
+
+ /* restore: */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ /* start: */
+ *insn++ = prog->insnsi[0];
+
+ return insn - insn_buf;
+}
+
static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type,
enum bpf_reg_type *reg_type)
@@ -2605,7 +2689,7 @@ static bool __is_valid_xdp_access(int off, int size,
return false;
if (off % size != 0)
return false;
- if (size != 4)
+ if (size != sizeof(__u32))
return false;
return true;
@@ -2636,10 +2720,10 @@ void bpf_warn_invalid_xdp_action(u32 act)
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
-static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
- struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
@@ -2686,7 +2770,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
case offsetof(struct __sk_buff, ifindex):
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
dst_reg, src_reg,
offsetof(struct sk_buff, dev));
*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
@@ -2727,7 +2811,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
dst_reg, src_reg, insn);
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]):
BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
prog->cb_access = 1;
@@ -2751,7 +2835,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
break;
case offsetof(struct __sk_buff, data):
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
dst_reg, src_reg,
offsetof(struct sk_buff, data));
break;
@@ -2760,8 +2844,8 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
ctx_off -= offsetof(struct __sk_buff, data_end);
ctx_off += offsetof(struct sk_buff, cb);
ctx_off += offsetof(struct bpf_skb_data_end, data_end);
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)),
- dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
+ ctx_off);
break;
case offsetof(struct __sk_buff, tc_index):
@@ -2787,6 +2871,31 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
return insn - insn_buf;
}
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct __sk_buff, ifindex):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+ dst_reg, src_reg,
+ offsetof(struct sk_buff, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
+ default:
+ return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
+ ctx_off, insn_buf, prog);
+ }
+
+ return insn - insn_buf;
+}
+
static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,
@@ -2796,12 +2905,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
switch (ctx_off) {
case offsetof(struct xdp_md, data):
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
dst_reg, src_reg,
offsetof(struct xdp_buff, data));
break;
case offsetof(struct xdp_md, data_end):
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
dst_reg, src_reg,
offsetof(struct xdp_buff, data_end));
break;
@@ -2813,13 +2922,14 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
};
static const struct bpf_verifier_ops tc_cls_act_ops = {
.get_func_proto = tc_cls_act_func_proto,
.is_valid_access = tc_cls_act_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .convert_ctx_access = tc_cls_act_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
};
static const struct bpf_verifier_ops xdp_ops = {
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a2879c0f6c4c..1a7b80f73376 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -750,11 +750,13 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
void __skb_get_hash(struct sk_buff *skb)
{
struct flow_keys keys;
+ u32 hash;
__flow_hash_secret_init();
- __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd),
- flow_keys_have_l4(&keys));
+ hash = ___skb_get_hash(skb, &keys, hashrnd);
+
+ __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
EXPORT_SYMBOL(__skb_get_hash);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1dfca1c3f8f5..3ac8946bf244 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -843,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
size += nla_total_size(num_vfs * sizeof(struct nlattr));
size += num_vfs *
(nla_total_size(sizeof(struct ifla_vf_mac)) +
- nla_total_size(sizeof(struct ifla_vf_vlan)) +
+ nla_total_size(MAX_VLAN_LIST_LEN *
+ sizeof(struct nlattr)) +
+ nla_total_size(MAX_VLAN_LIST_LEN *
+ sizeof(struct ifla_vf_vlan_info)) +
nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
nla_total_size(sizeof(struct ifla_vf_rate)) +
nla_total_size(sizeof(struct ifla_vf_link_state)) +
@@ -1111,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct nlattr *vfinfo)
{
struct ifla_vf_rss_query_en vf_rss_query_en;
+ struct nlattr *vf, *vfstats, *vfvlanlist;
struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_vlan_info vf_vlan_info;
struct ifla_vf_spoofchk vf_spoofchk;
struct ifla_vf_tx_rate vf_tx_rate;
struct ifla_vf_stats vf_stats;
struct ifla_vf_trust vf_trust;
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_rate vf_rate;
- struct nlattr *vf, *vfstats;
struct ifla_vf_mac vf_mac;
struct ifla_vf_info ivi;
@@ -1135,11 +1139,14 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
* IFLA_VF_LINK_STATE_AUTO which equals zero
*/
ivi.linkstate = 0;
+ /* VLAN Protocol by default is 802.1Q */
+ ivi.vlan_proto = htons(ETH_P_8021Q);
if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
return 0;
vf_mac.vf =
vf_vlan.vf =
+ vf_vlan_info.vf =
vf_rate.vf =
vf_tx_rate.vf =
vf_spoofchk.vf =
@@ -1150,6 +1157,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
+ vf_vlan_info.vlan = ivi.vlan;
+ vf_vlan_info.qos = ivi.qos;
+ vf_vlan_info.vlan_proto = ivi.vlan_proto;
vf_tx_rate.rate = ivi.max_tx_rate;
vf_rate.min_tx_rate = ivi.min_tx_rate;
vf_rate.max_tx_rate = ivi.max_tx_rate;
@@ -1158,10 +1168,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_rss_query_en.setting = ivi.rss_query_en;
vf_trust.setting = ivi.trusted;
vf = nla_nest_start(skb, IFLA_VF_INFO);
- if (!vf) {
- nla_nest_cancel(skb, vfinfo);
- return -EMSGSIZE;
- }
+ if (!vf)
+ goto nla_put_vfinfo_failure;
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
@@ -1177,17 +1185,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
&vf_rss_query_en) ||
nla_put(skb, IFLA_VF_TRUST,
sizeof(vf_trust), &vf_trust))
- return -EMSGSIZE;
+ goto nla_put_vf_failure;
+ vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
+ if (!vfvlanlist)
+ goto nla_put_vf_failure;
+ if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
+ &vf_vlan_info)) {
+ nla_nest_cancel(skb, vfvlanlist);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfvlanlist);
memset(&vf_stats, 0, sizeof(vf_stats));
if (dev->netdev_ops->ndo_get_vf_stats)
dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
&vf_stats);
vfstats = nla_nest_start(skb, IFLA_VF_STATS);
- if (!vfstats) {
- nla_nest_cancel(skb, vf);
- nla_nest_cancel(skb, vfinfo);
- return -EMSGSIZE;
- }
+ if (!vfstats)
+ goto nla_put_vf_failure;
if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
@@ -1199,11 +1213,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast, IFLA_VF_STATS_PAD))
- return -EMSGSIZE;
+ vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+ nla_nest_cancel(skb, vfstats);
+ goto nla_put_vf_failure;
+ }
nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
return 0;
+
+nla_put_vf_failure:
+ nla_nest_cancel(skb, vf);
+nla_put_vfinfo_failure:
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
}
static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
@@ -1448,6 +1470,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
+ [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
[IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
[IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
[IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
@@ -1704,7 +1727,34 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_vlan)
err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
- ivv->qos);
+ ivv->qos,
+ htons(ETH_P_8021Q));
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_VLAN_LIST]) {
+ struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
+ struct nlattr *attr;
+ int rem, len = 0;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_vlan)
+ return err;
+
+ nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
+ nla_len(attr) < NLA_HDRLEN) {
+ return -EINVAL;
+ }
+ if (len >= MAX_VLAN_LIST_LEN)
+ return -EOPNOTSUPP;
+ ivvl[len] = nla_data(attr);
+
+ len++;
+ }
+ err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
+ ivvl[0]->qos, ivvl[0]->vlan_proto);
if (err < 0)
return err;
}
@@ -3577,6 +3627,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
(!idxattr || idxattr == attrid);
}
+#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
+static int rtnl_get_offload_stats_attr_size(int attr_id)
+{
+ switch (attr_id) {
+ case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+ return sizeof(struct rtnl_link_stats64);
+ }
+
+ return 0;
+}
+
+static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
+ int *prividx)
+{
+ struct nlattr *attr = NULL;
+ int attr_id, size;
+ void *attr_data;
+ int err;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return -ENODATA;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (attr_id < *prividx)
+ continue;
+
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ if (!size)
+ continue;
+
+ if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ continue;
+
+ attr = nla_reserve_64bit(skb, attr_id, size,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ attr_data = nla_data(attr);
+ memset(attr_data, 0, size);
+ err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
+ attr_data);
+ if (err)
+ goto get_offload_stats_failure;
+ }
+
+ if (!attr)
+ return -ENODATA;
+
+ *prividx = 0;
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+get_offload_stats_failure:
+ *prividx = attr_id;
+ return err;
+}
+
+static int rtnl_get_offload_stats_size(const struct net_device *dev)
+{
+ int nla_size = 0;
+ int attr_id;
+ int size;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return 0;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ continue;
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ nla_size += nla_total_size_64bit(size);
+ }
+
+ if (nla_size != 0)
+ nla_size += nla_total_size(0);
+
+ return nla_size;
+}
+
static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, unsigned int filter_mask,
@@ -3586,6 +3721,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
struct nlmsghdr *nlh;
struct nlattr *attr;
int s_prividx = *prividx;
+ int err;
ASSERT_RTNL();
@@ -3614,8 +3750,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
if (ops && ops->fill_linkxstats) {
- int err;
-
*idxattr = IFLA_STATS_LINK_XSTATS;
attr = nla_nest_start(skb,
IFLA_STATS_LINK_XSTATS);
@@ -3639,8 +3773,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (master)
ops = master->rtnl_link_ops;
if (ops && ops->fill_linkxstats) {
- int err;
-
*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
attr = nla_nest_start(skb,
IFLA_STATS_LINK_XSTATS_SLAVE);
@@ -3655,6 +3787,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
+ *idxattr)) {
+ *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
+ attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = rtnl_get_offload_stats(skb, dev, prividx);
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, attr);
+ else
+ nla_nest_end(skb, attr);
+
+ if (err && err != -ENODATA)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -3669,10 +3819,6 @@ nla_put_failure:
return -EMSGSIZE;
}
-static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = {
- [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) },
-};
-
static size_t if_nlmsg_stats_size(const struct net_device *dev,
u32 filter_mask)
{
@@ -3712,6 +3858,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
+ size += rtnl_get_offload_stats_size(dev);
+
return size;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3864b4b68fa1..d36c7548952f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2445,6 +2445,25 @@ void skb_queue_purge(struct sk_buff_head *list)
EXPORT_SYMBOL(skb_queue_purge);
/**
+ * skb_rbtree_purge - empty a skb rbtree
+ * @root: root of the rbtree to empty
+ *
+ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
+ * the list and one reference dropped. This function does not take
+ * any lock. Synchronization should be handled by the caller (e.g., TCP
+ * out-of-order queue is protected by the socket lock).
+ */
+void skb_rbtree_purge(struct rb_root *root)
+{
+ struct sk_buff *skb, *next;
+
+ rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
+ kfree_skb(skb);
+
+ *root = RB_ROOT;
+}
+
+/**
* skb_queue_head - queue a buffer at the list head
* @list: list to use
* @newsk: buffer to queue
@@ -3078,11 +3097,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
sg = !!(features & NETIF_F_SG);
csum = !!can_checksum_protocol(features, proto);
- /* GSO partial only requires that we trim off any excess that
- * doesn't fit into an MSS sized block, so take care of that
- * now.
- */
- if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) {
+ if (sg && csum && (mss != GSO_BY_FRAGS)) {
+ if (!(features & NETIF_F_GSO_PARTIAL)) {
+ struct sk_buff *iter;
+
+ if (!list_skb ||
+ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
+ goto normal;
+
+ /* Split the buffer at the frag_list pointer.
+ * This is based on the assumption that all
+ * buffers in the chain excluding the last
+ * containing the same amount of data.
+ */
+ skb_walk_frags(head_skb, iter) {
+ if (skb_headlen(iter))
+ goto normal;
+
+ len -= iter->len;
+ }
+ }
+
+ /* GSO partial only requires that we trim off any excess that
+ * doesn't fit into an MSS sized block, so take care of that
+ * now.
+ */
partial_segs = len / mss;
if (partial_segs > 1)
mss *= partial_segs;
@@ -3090,6 +3129,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
partial_segs = 0;
}
+normal:
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -3281,21 +3321,29 @@ perform_csum_check:
*/
segs->prev = tail;
- /* Update GSO info on first skb in partial sequence. */
if (partial_segs) {
+ struct sk_buff *iter;
int type = skb_shinfo(head_skb)->gso_type;
+ unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
/* Update type to add partial and then remove dodgy if set */
- type |= SKB_GSO_PARTIAL;
+ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
type &= ~SKB_GSO_DODGY;
/* Update GSO info and prepare to start updating headers on
* our way back down the stack of protocols.
*/
- skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
- skb_shinfo(segs)->gso_segs = partial_segs;
- skb_shinfo(segs)->gso_type = type;
- SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+ for (iter = segs; iter; iter = iter->next) {
+ skb_shinfo(iter)->gso_size = gso_size;
+ skb_shinfo(iter)->gso_segs = partial_segs;
+ skb_shinfo(iter)->gso_type = type;
+ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
+ }
+
+ if (tail->len - doffset <= gso_size)
+ skb_shinfo(tail)->gso_size = 0;
+ else if (tail != segs)
+ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
}
/* Following permits correct backpressure, for protocols
@@ -4474,8 +4522,10 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len)
}
EXPORT_SYMBOL(skb_ensure_writable);
-/* remove VLAN header from packet and update csum accordingly. */
-static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+/* remove VLAN header from packet and update csum accordingly.
+ * expects a non skb_vlan_tag_present skb with a vlan tag payload
+ */
+int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
struct vlan_hdr *vhdr;
unsigned int offset = skb->data - skb_mac_header(skb);
@@ -4506,6 +4556,7 @@ pull:
return err;
}
+EXPORT_SYMBOL(__skb_vlan_pop);
int skb_vlan_pop(struct sk_buff *skb)
{
@@ -4516,9 +4567,7 @@ int skb_vlan_pop(struct sk_buff *skb)
if (likely(skb_vlan_tag_present(skb))) {
skb->vlan_tci = 0;
} else {
- if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
- skb->protocol != htons(ETH_P_8021AD)) ||
- skb->len < VLAN_ETH_HLEN))
+ if (unlikely(!eth_type_vlan(skb->protocol)))
return 0;
err = __skb_vlan_pop(skb, &vlan_tci);
@@ -4526,9 +4575,7 @@ int skb_vlan_pop(struct sk_buff *skb)
return err;
}
/* move next vlan tag to hw accel tag */
- if (likely((skb->protocol != htons(ETH_P_8021Q) &&
- skb->protocol != htons(ETH_P_8021AD)) ||
- skb->len < VLAN_ETH_HLEN))
+ if (likely(!eth_type_vlan(skb->protocol)))
return 0;
vlan_proto = skb->protocol;
diff --git a/net/core/sock.c b/net/core/sock.c
index 51a730485649..038e660ef844 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1340,7 +1340,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!try_module_get(prot->owner))
goto out_free_sec;
sk_tx_queue_clear(sk);
- cgroup_sk_alloc(&sk->sk_cgrp_data);
}
return sk;
@@ -1400,6 +1399,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
+ cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
}
@@ -1544,6 +1544,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_priority = 0;
newsk->sk_incoming_cpu = raw_smp_processor_id();
atomic64_set(&newsk->sk_cookie, 0);
+
+ cgroup_sk_alloc(&newsk->sk_cgrp_data);
+
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index ff7736f7ff42..96e47c539bee 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -38,4 +38,7 @@ config NET_DSA_TAG_EDSA
config NET_DSA_TAG_TRAILER
bool
+config NET_DSA_TAG_QCA
+ bool
+
endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 8af4ded70f1c..a3380ed0e0be 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -7,3 +7,4 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
+dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index d8d267e9a872..a6902c1e2f28 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -54,6 +54,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
#ifdef CONFIG_NET_DSA_TAG_BRCM
[DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops,
#endif
+#ifdef CONFIG_NET_DSA_TAG_QCA
+ [DSA_TAG_PROTO_QCA] = &qca_netdev_ops,
+#endif
[DSA_TAG_PROTO_NONE] = &none_ops,
};
@@ -375,9 +378,11 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
if (ret < 0)
goto out;
- ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
- if (ret < 0)
- goto out;
+ if (ops->set_addr) {
+ ret = ops->set_addr(ds, dst->master_netdev->dev_addr);
+ if (ret < 0)
+ goto out;
+ }
if (!ds->slave_mii_bus && ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(parent);
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 8278385dcd21..f8a7d9aab437 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -304,13 +304,11 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds)
if (err < 0)
return err;
- err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
- if (err < 0)
- return err;
-
- err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
- if (err < 0)
- return err;
+ if (ds->ops->set_addr) {
+ err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr);
+ if (err < 0)
+ return err;
+ }
if (!ds->slave_mii_bus && ds->ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 00077a9c97f4..6cfd7388834e 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -81,5 +81,7 @@ extern const struct dsa_device_ops trailer_netdev_ops;
/* tag_brcm.c */
extern const struct dsa_device_ops brcm_netdev_ops;
+/* tag_qca.c */
+extern const struct dsa_device_ops qca_netdev_ops;
#endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 9ecbe787f102..6b1282c006b1 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -69,6 +69,30 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
return !!p->bridge_dev;
}
+static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state)
+{
+ struct dsa_port *dp = &ds->ports[port];
+
+ if (ds->ops->port_stp_state_set)
+ ds->ops->port_stp_state_set(ds, port, state);
+
+ if (ds->ops->port_fast_age) {
+ /* Fast age FDB entries or flush appropriate forwarding database
+ * for the given port, if we are moving it from Learning or
+ * Forwarding state, to Disabled or Blocking or Listening state.
+ */
+
+ if ((dp->stp_state == BR_STATE_LEARNING ||
+ dp->stp_state == BR_STATE_FORWARDING) &&
+ (state == BR_STATE_DISABLED ||
+ state == BR_STATE_BLOCKING ||
+ state == BR_STATE_LISTENING))
+ ds->ops->port_fast_age(ds, port);
+ }
+
+ dp->stp_state = state;
+}
+
static int dsa_slave_open(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
@@ -104,8 +128,7 @@ static int dsa_slave_open(struct net_device *dev)
goto clear_promisc;
}
- if (ds->ops->port_stp_state_set)
- ds->ops->port_stp_state_set(ds, p->port, stp_state);
+ dsa_port_set_stp_state(ds, p->port, stp_state);
if (p->phy)
phy_start(p->phy);
@@ -147,8 +170,7 @@ static int dsa_slave_close(struct net_device *dev)
if (ds->ops->port_disable)
ds->ops->port_disable(ds, p->port, p->phy);
- if (ds->ops->port_stp_state_set)
- ds->ops->port_stp_state_set(ds, p->port, BR_STATE_DISABLED);
+ dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED);
return 0;
}
@@ -354,7 +376,7 @@ static int dsa_slave_stp_state_set(struct net_device *dev,
if (switchdev_trans_ph_prepare(trans))
return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
- ds->ops->port_stp_state_set(ds, p->port, attr->u.stp_state);
+ dsa_port_set_stp_state(ds, p->port, attr->u.stp_state);
return 0;
}
@@ -556,8 +578,7 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev)
/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
* so allow it to be in BR_STATE_FORWARDING to be kept functional
*/
- if (ds->ops->port_stp_state_set)
- ds->ops->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING);
+ dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING);
}
static int dsa_slave_port_attr_get(struct net_device *dev,
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
new file mode 100644
index 000000000000..0c90cacee7aa
--- /dev/null
+++ b/net/dsa/tag_qca.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/etherdevice.h>
+#include "dsa_priv.h"
+
+#define QCA_HDR_LEN 2
+#define QCA_HDR_VERSION 0x2
+
+#define QCA_HDR_RECV_VERSION_MASK GENMASK(15, 14)
+#define QCA_HDR_RECV_VERSION_S 14
+#define QCA_HDR_RECV_PRIORITY_MASK GENMASK(13, 11)
+#define QCA_HDR_RECV_PRIORITY_S 11
+#define QCA_HDR_RECV_TYPE_MASK GENMASK(10, 6)
+#define QCA_HDR_RECV_TYPE_S 6
+#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3)
+#define QCA_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0)
+
+#define QCA_HDR_XMIT_VERSION_MASK GENMASK(15, 14)
+#define QCA_HDR_XMIT_VERSION_S 14
+#define QCA_HDR_XMIT_PRIORITY_MASK GENMASK(13, 11)
+#define QCA_HDR_XMIT_PRIORITY_S 11
+#define QCA_HDR_XMIT_CONTROL_MASK GENMASK(10, 8)
+#define QCA_HDR_XMIT_CONTROL_S 8
+#define QCA_HDR_XMIT_FROM_CPU BIT(7)
+#define QCA_HDR_XMIT_DP_BIT_MASK GENMASK(6, 0)
+
+static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ u16 *phdr, hdr;
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ if (skb_cow_head(skb, 0) < 0)
+ goto out_free;
+
+ skb_push(skb, QCA_HDR_LEN);
+
+ memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN);
+ phdr = (u16 *)(skb->data + 2 * ETH_ALEN);
+
+ /* Set the version field, and set destination port information */
+ hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
+ QCA_HDR_XMIT_FROM_CPU |
+ BIT(p->port);
+
+ *phdr = htons(hdr);
+
+ return skb;
+
+out_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
+static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
+ u8 ver;
+ int port;
+ __be16 *phdr, hdr;
+
+ if (unlikely(!dst))
+ goto out_drop;
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
+ goto out_drop;
+
+ /* The QCA header is added by the switch between src addr and Ethertype
+ * At this point, skb->data points to ethertype so header should be
+ * right before
+ */
+ phdr = (__be16 *)(skb->data - 2);
+ hdr = ntohs(*phdr);
+
+ /* Make sure the version is correct */
+ ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S;
+ if (unlikely(ver != QCA_HDR_VERSION))
+ goto out_drop;
+
+ /* Remove QCA tag and recalculate checksum */
+ skb_pull_rcsum(skb, QCA_HDR_LEN);
+ memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN,
+ ETH_HLEN - QCA_HDR_LEN);
+
+ /* This protocol doesn't support cascading multiple switches so it's
+ * safe to assume the switch is first in the tree
+ */
+ ds = dst->ds[0];
+ if (!ds)
+ goto out_drop;
+
+ /* Get source port information */
+ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK);
+ if (!ds->ports[port].netdev)
+ goto out_drop;
+
+ /* Update skb & forward the frame accordingly */
+ skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->dev = ds->ports[port].netdev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
+
+ netif_receive_skb(skb);
+
+ return 0;
+
+out_drop:
+ kfree_skb(skb);
+out:
+ return 0;
+}
+
+const struct dsa_device_ops qca_netdev_ops = {
+ .xmit = qca_tag_xmit,
+ .rcv = qca_tag_rcv,
+};
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b49f6c..300b06888fdf 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -640,6 +640,21 @@ config TCP_CONG_CDG
D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+config TCP_CONG_BBR
+ tristate "BBR TCP"
+ default n
+ ---help---
+
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the the bottleneck delivery rate and path round-trip
+ propagation delay. It tolerates packet loss and delay unrelated to
+ congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+ modem links. It can coexist with flows that use loss-based congestion
+ control, and can operate with shallow buffers, deep buffers,
+ bufferbloat, policers, or AQM schemes that do not provide a delay
+ signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -674,6 +689,9 @@ choice
config DEFAULT_CDG
bool "CDG" if TCP_CONG_CDG=y
+ config DEFAULT_BBR
+ bool "BBR" if TCP_CONG_BBR=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 24629b6f57cc..bc6a6c8b9bcd 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- tcp_recovery.o \
+ tcp_rate.o tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e94b47be0019..1effc986739e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
- bool udpfrag = false, fixedid = false, encap;
+ bool udpfrag = false, fixedid = false, gso_partial, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
@@ -1245,6 +1245,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (IS_ERR_OR_NULL(segs))
goto out;
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
skb = segs;
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
@@ -1259,9 +1261,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
iph->id = htons(id);
id += skb_shinfo(skb)->gso_segs;
}
- tot_len = skb_shinfo(skb)->gso_size +
- SKB_GSO_CB(skb)->data_offset +
- skb->head - (unsigned char *)iph;
+
+ if (gso_partial)
+ tot_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)iph;
+ else
+ tot_len = skb->len - nhoff;
} else {
if (!fixedid)
iph->id = htons(id++);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 415e117967c7..062a67ca9a21 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2232,7 +2232,7 @@ static struct devinet_sysctl_table {
};
static int __devinet_sysctl_register(struct net *net, char *dev_name,
- struct ipv4_devconf *p)
+ int ifindex, struct ipv4_devconf *p)
{
int i;
struct devinet_sysctl_table *t;
@@ -2255,6 +2255,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
goto free;
p->sysctl = t;
+
+ inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p);
return 0;
free:
@@ -2286,7 +2288,7 @@ static int devinet_sysctl_register(struct in_device *idev)
if (err)
return err;
err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
- &idev->cnf);
+ idev->dev->ifindex, &idev->cnf);
if (err)
neigh_sysctl_unregister(idev->arp_parms);
return err;
@@ -2347,11 +2349,12 @@ static __net_init int devinet_init_net(struct net *net)
}
#ifdef CONFIG_SYSCTL
- err = __devinet_sysctl_register(net, "all", all);
+ err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all);
if (err < 0)
goto err_reg_all;
- err = __devinet_sysctl_register(net, "default", dflt);
+ err = __devinet_sysctl_register(net, "default",
+ NETCONFA_IFINDEX_DEFAULT, dflt);
if (err < 0)
goto err_reg_dflt;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 317c31939732..4e56a4c20a3c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -503,6 +503,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
if (!dev)
return -ENODEV;
cfg->fc_oif = dev->ifindex;
+ cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
struct in_ifaddr *ifa;
struct in_device *in_dev = __in_dev_get_rtnl(dev);
@@ -1021,7 +1022,7 @@ no_promotions:
* First of all, we scan fib_info list searching
* for stray nexthop entries, then ignite fib_flush.
*/
- if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+ if (fib_sync_down_addr(dev, ifa->ifa_local))
fib_flush(dev_net(dev));
}
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 6e9ea69e5f75..770bebed6b28 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp,
};
int err;
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi4_to_flowi(flp));
+
err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
#ifdef CONFIG_IP_ROUTE_CLASSID
if (arg.rule)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8066ccc48a17..388d3e21629b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi->fib_priority = cfg->fc_priority;
fi->fib_prefsrc = cfg->fc_prefsrc;
fi->fib_type = cfg->fc_type;
+ fi->fib_tb_id = cfg->fc_table;
fi->fib_nhs = nhs;
change_nexthops(fi) {
@@ -1337,18 +1338,21 @@ nla_put_failure:
* referring to it.
* - device went down -> we must shutdown all nexthops going via it.
*/
-int fib_sync_down_addr(struct net *net, __be32 local)
+int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
int ret = 0;
unsigned int hash = fib_laddr_hashfn(local);
struct hlist_head *head = &fib_info_laddrhash[hash];
+ struct net *net = dev_net(dev);
+ int tb_id = l3mdev_fib_table(dev);
struct fib_info *fi;
if (!fib_info_laddrhash || local == 0)
return 0;
hlist_for_each_entry(fi, head, fib_lhash) {
- if (!net_eq(fi->fib_net, net))
+ if (!net_eq(fi->fib_net, net) ||
+ fi->fib_tb_id != tb_id)
continue;
if (fi->fib_prefsrc == local) {
fi->fib_flags |= RTNH_F_DEAD;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e2ffc2a5c7db..241f27bbd7ad 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1081,7 +1081,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
- unsigned int nlflags = 0;
+ u16 nlflags = NLM_F_EXCL;
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
@@ -1126,6 +1126,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;
+ nlflags &= ~NLM_F_EXCL;
+
/* We have 2 goals:
* 1. Find exact match for type, scope, fib_info to avoid
* duplicate routes
@@ -1151,6 +1153,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct fib_info *fi_drop;
u8 state;
+ nlflags |= NLM_F_REPLACE;
fa = fa_first;
if (fa_match) {
if (fa == fa_match)
@@ -1191,7 +1194,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
- tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+ tb->tb_id, &cfg->fc_nlinfo, nlflags);
goto succeeded;
}
@@ -1203,7 +1206,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
goto out;
if (cfg->fc_nlflags & NLM_F_APPEND)
- nlflags = NLM_F_APPEND;
+ nlflags |= NLM_F_APPEND;
else
fa = fa_first;
}
@@ -1211,6 +1214,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (!(cfg->fc_nlflags & NLM_F_CREATE))
goto out;
+ nlflags |= NLM_F_CREATE;
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (!new_fa)
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ecd1e09dbbf1..96e0efecefa6 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
- bool need_csum, ufo;
+ bool need_csum, ufo, gso_partial;
if (!skb->encapsulation)
goto out;
@@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
goto out;
}
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
outer_hlen = skb_tnl_header_len(skb);
gre_offset = outer_hlen - tnl_hlen;
skb = segs;
@@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
greh = (struct gre_base_hdr *)skb_transport_header(skb);
pcsum = (__sum16 *)(greh + 1);
- if (skb_is_gso(skb)) {
+ if (gso_partial) {
unsigned int partial_adj;
/* Adjust checksum to account for the fact that
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index abfbe492ebfe..e4d16fc5bbb3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -99,6 +99,7 @@ static size_t inet_sk_attr_size(void)
+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ nla_total_size(1) /* INET_DIAG_TOS */
+ nla_total_size(1) /* INET_DIAG_TCLASS */
+ + nla_total_size(4) /* INET_DIAG_MARK */
+ nla_total_size(sizeof(struct inet_diag_meminfo))
+ nla_total_size(sizeof(struct inet_diag_msg))
+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
@@ -109,7 +110,8 @@ static size_t inet_sk_attr_size(void)
int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
struct inet_diag_msg *r, int ext,
- struct user_namespace *user_ns)
+ struct user_namespace *user_ns,
+ bool net_admin)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -136,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
}
#endif
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
+ goto errout;
+
r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = sock_i_ino(sk);
@@ -149,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
@@ -175,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 0;
r->idiag_retrans = 0;
- if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
goto errout;
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -274,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk,
const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
- user_ns, portid, seq, nlmsg_flags, unlh);
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
+ portid, seq, nlmsg_flags, unlh, net_admin);
}
static int inet_twsk_diag_fill(struct sock *sk,
@@ -319,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk,
static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
+ struct request_sock *reqsk = inet_reqsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
@@ -334,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
inet_diag_msg_common_fill(r, sk);
r->idiag_state = TCP_SYN_RECV;
r->idiag_timer = 1;
- r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+ r->idiag_retrans = reqsk->num_retrans;
BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
offsetof(struct sock, sk_cookie));
@@ -346,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
r->idiag_uid = 0;
r->idiag_inode = 0;
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ inet_rsk(reqsk)->ir_mark))
+ return -EMSGSIZE;
+
nlmsg_end(skb, nlh);
return 0;
}
@@ -354,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
const struct inet_diag_req_v2 *r,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
if (sk->sk_state == TCP_TIME_WAIT)
return inet_twsk_diag_fill(sk, skb, portid, seq,
@@ -362,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
if (sk->sk_state == TCP_NEW_SYN_RECV)
return inet_req_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh);
+ nlmsg_flags, unlh, net_admin);
return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
- nlmsg_flags, unlh);
+ nlmsg_flags, unlh, net_admin);
}
struct sock *inet_diag_find_one_icsk(struct net *net,
@@ -435,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
err = sk_diag_fill(sk, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep);
@@ -796,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
+ const struct nlattr *bc,
+ bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -804,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk,
return inet_csk_diag_fill(sk, skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
+ net_admin);
}
static void twsk_build_assert(void)
@@ -840,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num;
u32 idiag_states = r->idiag_states;
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
@@ -880,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
cb->args[3] > 0)
goto next_listen;
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r,
+ bc, net_admin) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -948,7 +965,7 @@ skip_listen_ht:
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh);
+ cb->nlh, net_admin);
if (res < 0) {
spin_unlock_bh(lock);
goto done;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 113cc43df789..576f705d8180 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info)
ipgre_err(skb, info, &tpi);
}
-static __be64 key_to_tunnel_id(__be32 key)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be64)((__force u32)key);
-#else
- return (__force __be64)((__force u64)key << 32);
-#endif
-}
-
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 tunnel_id_to_key(__be64 x)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be32)x;
-#else
- return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
@@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
__be64 tun_id;
flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
- tun_id = key_to_tunnel_id(tpi->key);
+ tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
return PACKET_REJECT;
@@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
gre_build_header(skb, tunnel_hlen, flags, proto,
- tunnel_id_to_key(tun_info->key.tun_id), 0);
+ tunnel_id_to_key32(tun_info->key.tun_id), 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4b351af3e67b..d6feabb03516 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ struct net_device *dev = skb->dev;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev);
+ iph->tos, dev);
if (unlikely(err)) {
if (err == -EXDEV)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
@@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
/* RFC 1122 3.3.6:
*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 65569274efb8..05d105832bdb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -99,6 +99,14 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
+
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
@@ -490,7 +498,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+#if IS_ENABLED(CONFIG_IP_VS)
to->ipvs_property = from->ipvs_property;
#endif
skb_copy_secmark(to, from);
@@ -1574,8 +1582,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
}
oif = arg->bound_dev_if;
- if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
- oif = skb->skb_iif;
+ oif = oif ? : skb->skb_iif;
flowi4_init_output(&fl4, oif,
IP4_REPLY_MARK(net, skb->mark),
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 71a52f4d4cff..af4919792b6a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -284,9 +284,12 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
ipc->ttl = val;
break;
case IP_TOS:
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
+ val = *(int *)CMSG_DATA(cmsg);
+ else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
+ val = *(u8 *)CMSG_DATA(cmsg);
+ else
return -EINVAL;
- val = *(int *)CMSG_DATA(cmsg);
if (val < 0 || val > 255)
return -EINVAL;
ipc->tos = val;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 95649ebd2874..5719d6ba0824 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
+#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
return 0;
}
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ u32 headroom = sizeof(struct iphdr);
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ const struct iphdr *inner_iph;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ __be16 df = 0;
+ u8 tos, ttl;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto tx_error;
+ key = &tun_info->key;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ tos = key->tos;
+ if (tos == 1) {
+ if (skb->protocol == htons(ETH_P_IP))
+ tos = inner_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ }
+ init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
+ RT_TOS(tos), tunnel->parms.link);
+ if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+ goto tx_error;
+ rt = ip_route_output_key(tunnel->net, &fl4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ if (rt->dst.dev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ ttl = key->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+ if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+ df = htons(IP_DF);
+ else if (skb->protocol == htons(ETH_P_IP))
+ df = inner_iph->frag_off & htons(IP_DF);
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ if (headroom > dev->needed_headroom)
+ dev->needed_headroom = headroom;
+
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ ip_rt_put(rt);
+ goto tx_dropped;
+ }
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
+ key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+ return;
+tx_error:
+ dev->stats.tx_errors++;
+ goto kfree;
+tx_dropped:
+ dev->stats.tx_dropped++;
+kfree:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
+
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0f227db0e9ac..777bc1883870 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -69,7 +69,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
skb_scrub_packet(skb, xnet);
- skb_clear_hash(skb);
+ skb_clear_hash_if_not_l4(skb);
skb_dst_set(skb, &rt->dst);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index cc701fa70b12..5d7944f394d9 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -88,6 +88,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
struct net_device *dev;
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
+ struct xfrm_mode *inner_mode;
struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
u32 orig_mark = skb->mark;
int ret;
@@ -105,7 +106,19 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
}
x = xfrm_input_state(skb);
- family = x->inner_mode->afinfo->family;
+
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ family = inner_mode->afinfo->family;
skb->mark = be32_to_cpu(tunnel->parms.i_key);
ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4ae3f8e6c6cc..c9392589c415 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -115,6 +115,7 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/dst_metadata.h>
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
@@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ struct metadata_dst *tun_dst = NULL;
struct ip_tunnel *tunnel;
const struct iphdr *iph;
@@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
tpi = &ipip_tpi;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
- return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+ if (tunnel->collect_md) {
+ tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+ if (!tun_dst)
+ return 0;
+ }
+ return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
}
return -1;
@@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
skb_set_inner_ipproto(skb, ipproto);
- ip_tunnel_xmit(skb, dev, tiph, ipproto);
+ if (tunnel->collect_md)
+ ip_md_tunnel_xmit(skb, dev, ipproto);
+ else
+ ip_tunnel_xmit(skb, dev, tiph, ipproto);
return NETDEV_TX_OK;
tx_error:
@@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
}
static void ipip_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm *parms, bool *collect_md)
{
memset(parms, 0, sizeof(*parms));
parms->iph.version = 4;
parms->iph.protocol = IPPROTO_IPIP;
parms->iph.ihl = 5;
+ *collect_md = false;
if (!data)
return;
@@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF);
+
+ if (data[IFLA_IPTUN_COLLECT_METADATA])
+ *collect_md = true;
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
+ struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
if (ipip_netlink_encap_parms(data, &ipencap)) {
- struct ip_tunnel *t = netdev_priv(dev);
int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
- ipip_netlink_parms(data, &p);
+ ipip_netlink_parms(data, &p, &t->collect_md);
return ip_tunnel_newlink(dev, tb, &p);
}
@@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ bool collect_md;
if (ipip_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
@@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
- ipip_netlink_parms(data, &p);
+ ipip_netlink_parms(data, &p, &collect_md);
+ if (collect_md)
+ return -EINVAL;
if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
(!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_IPTUN_ENCAP_DPORT */
nla_total_size(2) +
+ /* IFLA_IPTUN_COLLECT_METADATA */
+ nla_total_size(0) +
0;
}
@@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
tunnel->encap.flags))
goto nla_put_failure;
+ if (tunnel->collect_md)
+ if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ipip_link_ops __read_mostly = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 26253328d227..a87bcd2d4a94 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2076,6 +2076,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct rta_mfc_stats mfcs;
struct nlattr *mp_attr;
struct rtnexthop *nhp;
+ unsigned long lastuse;
int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
@@ -2105,12 +2106,14 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
nla_nest_end(skb, mp_attr);
+ lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+ lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
- nla_put_u64_64bit(skb, RTA_EXPIRES,
- jiffies_to_clock_t(c->mfc_un.res.lastuse),
+ nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
RTA_PAD))
return -EMSGSIZE;
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 2375b0a8be46..30493beb611a 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv,
__be32 saddr, daddr;
u_int8_t tos;
const struct iphdr *iph;
+ int err;
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
@@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv,
tos = iph->tos;
ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_QUEUE) {
+ if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
if (iph->saddr != saddr ||
iph->daddr != daddr ||
skb->mark != mark ||
- iph->tos != tos)
- if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
- ret = NF_DROP;
+ iph->tos != tos) {
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
}
return ret;
}
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index c24f41c816b3..2c2553b9026c 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -46,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = {
.eval = nft_reject_ipv4_eval,
.init = nft_reject_init,
.dump = nft_reject_dump,
+ .validate = nft_reject_validate,
};
static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 438f50c1a676..90a85c955872 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -606,12 +606,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
- if (!saddr && ipc.oif) {
- err = l3mdev_get_saddr(net, ipc.oif, &fl4);
- if (err < 0)
- goto done;
- }
-
if (!inet->hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3e992783c1d0..654a9af20136 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs)
atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
u32 old = ACCESS_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
- u32 delta = 0;
+ u32 new, delta = 0;
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
- return atomic_add_return(segs + delta, p_id) - segs;
+ /* Do not use atomic_add_return() as it makes UBSAN unhappy */
+ do {
+ old = (u32)atomic_read(p_id);
+ new = old + delta + segs;
+ } while (atomic_cmpxchg(p_id, old, new) != old);
+
+ return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
@@ -1831,7 +1837,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
* Now we are ready to route packet.
*/
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
+ fl4.flowi4_iif = dev->ifindex;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
@@ -2018,7 +2024,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
return ERR_PTR(-EINVAL);
if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
- if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+ if (ipv4_is_loopback(fl4->saddr) &&
+ !(dev_out->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(dev_out))
return ERR_PTR(-EINVAL);
if (ipv4_is_lbcast(fl4->daddr))
@@ -2148,7 +2156,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
- int master_idx;
int orig_oif;
int err = -ENETUNREACH;
@@ -2158,9 +2165,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
orig_oif = fl4->flowi4_oif;
- master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
- if (master_idx)
- fl4->flowi4_oif = master_idx;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
@@ -2244,10 +2248,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
-
- rth = l3mdev_get_rtable(dev_out, fl4);
- if (rth)
- goto out;
}
if (!fl4->daddr) {
@@ -2265,8 +2265,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
if (err) {
res.fi = NULL;
res.table = NULL;
- if (fl4->flowi4_oif &&
- !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
+ if (fl4->flowi4_oif) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2302,7 +2301,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
else
fl4->saddr = fl4->daddr;
}
- dev_out = net->loopback_dev;
+
+ /* L3 master device is the loopback for that domain */
+ dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
@@ -2577,9 +2578,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
- if (netif_index_is_l3_master(net, fl4.flowi4_oif))
- fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
-
if (iif) {
struct net_device *dev;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 77311a92275c..f253e5019d22 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -380,14 +380,14 @@ void tcp_init_sock(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- __skb_queue_head_init(&tp->out_of_order_queue);
+ tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- tp->rtt_min[0].rtt = ~0U;
+ minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk)
*/
tp->snd_cwnd = TCP_INIT_CWND;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ tp->app_limited = ~0U;
+
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
@@ -1014,23 +1017,40 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
flags);
lock_sock(sk);
+
+ tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+
res = do_tcp_sendpages(sk, page, offset, size, flags);
release_sock(sk);
return res;
}
EXPORT_SYMBOL(tcp_sendpage);
-static inline int select_size(const struct sock *sk, bool sg)
+/* Do not bother using a page frag for very small frames.
+ * But use this heuristic only for the first skb in write queue.
+ *
+ * Having no payload in skb->head allows better SACK shifting
+ * in tcp_shift_skb_data(), reducing sack/rack overhead, because
+ * write queue has less skbs.
+ * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
+ * This also speeds up tso_fragment(), since it wont fallback
+ * to tcp_fragment().
+ */
+static int linear_payload_sz(bool first_skb)
+{
+ if (first_skb)
+ return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
+ return 0;
+}
+
+static int select_size(const struct sock *sk, bool sg, bool first_skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
int tmp = tp->mss_cache;
if (sg) {
if (sk_can_gso(sk)) {
- /* Small frames wont use a full page:
- * Payload will immediately follow tcp header.
- */
- tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
+ tmp = linear_payload_sz(first_skb);
} else {
int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
@@ -1101,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
@@ -1161,6 +1183,8 @@ restart:
}
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
+ bool first_skb;
+
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
@@ -1172,10 +1196,11 @@ new_segment:
process_backlog = false;
goto restart;
}
+ first_skb = skb_queue_empty(&sk->sk_write_queue);
skb = sk_stream_alloc_skb(sk,
- select_size(sk, sg),
+ select_size(sk, sg, first_skb),
sk->sk_allocation,
- skb_queue_empty(&sk->sk_write_queue));
+ first_skb);
if (!skb)
goto wait_for_memory;
@@ -2243,7 +2268,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
tcp_write_queue_purge(sk);
- __skb_queue_purge(&tp->out_of_order_queue);
+ skb_rbtree_purge(&tp->out_of_order_queue);
inet->inet_dport = 0;
@@ -2687,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now = tcp_time_stamp;
+ u32 now = tcp_time_stamp, intv;
unsigned int start;
int notsent_bytes;
u64 rate64;
@@ -2777,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_min_rtt = tcp_min_rtt(tp);
info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
+
+ info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
+ rate = READ_ONCE(tp->rate_delivered);
+ intv = READ_ONCE(tp->rate_interval_us);
+ if (rate && intv) {
+ rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+ do_div(rate64, intv);
+ put_unaligned(rate64, &info->tcpi_delivery_rate);
+ }
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3244,11 +3278,12 @@ static void __init tcp_init_mem(void)
void __init tcp_init(void)
{
- unsigned long limit;
int max_rshare, max_wshare, cnt;
+ unsigned long limit;
unsigned int i;
- sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
+ BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
+ FIELD_SIZEOF(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
new file mode 100644
index 000000000000..0ea66c2c9344
--- /dev/null
+++ b/net/ipv4/tcp_bbr.c
@@ -0,0 +1,896 @@
+/* Bottleneck Bandwidth and RTT (BBR) congestion control
+ *
+ * BBR congestion control computes the sending rate based on the delivery
+ * rate (throughput) estimated from ACKs. In a nutshell:
+ *
+ * On each ACK, update our model of the network path:
+ * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
+ * min_rtt = windowed_min(rtt, 10 seconds)
+ * pacing_rate = pacing_gain * bottleneck_bandwidth
+ * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
+ *
+ * The core algorithm does not react directly to packet losses or delays,
+ * although BBR may adjust the size of next send per ACK when loss is
+ * observed, or adjust the sending rate if it estimates there is a
+ * traffic policer, in order to keep the drop rate reasonable.
+ *
+ * BBR is described in detail in:
+ * "BBR: Congestion-Based Congestion Control",
+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ * https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * since pacing is integral to the BBR design and implementation.
+ * BBR without pacing would not function properly, and may incur unnecessary
+ * high packet loss rates.
+ */
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+#include <linux/win_minmax.h>
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
+ BBR_DRAIN, /* drain any queue created during startup */
+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
+ BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */
+};
+
+/* BBR congestion control block */
+struct bbr {
+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */
+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */
+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
+ struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */
+ u32 rtt_cnt; /* count of packet-timed rounds elapsed */
+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */
+ struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */
+ u32 mode:3, /* current bbr_mode in state machine */
+ prev_ca_state:3, /* CA state on previous ACK */
+ packet_conservation:1, /* use packet conservation? */
+ restore_cwnd:1, /* decided to revert cwnd to old value */
+ round_start:1, /* start of packet-timed tx->ack round? */
+ tso_segs_goal:7, /* segments we want in each skb we send */
+ idle_restart:1, /* restarting after idle? */
+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
+ unused:5,
+ lt_is_sampling:1, /* taking long-term ("LT") samples now? */
+ lt_rtt_cnt:7, /* round trips in long-term interval */
+ lt_use_bw:1; /* use lt_bw as our bw estimate? */
+ u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */
+ u32 lt_last_delivered; /* LT intvl start: tp->delivered */
+ u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */
+ u32 lt_last_lost; /* LT intvl start: tp->lost */
+ u32 pacing_gain:10, /* current gain for setting pacing rate */
+ cwnd_gain:10, /* current gain for setting cwnd */
+ full_bw_cnt:3, /* number of rounds without large bw gains */
+ cycle_idx:3, /* current index in pacing_gain cycle array */
+ unused_b:6;
+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
+ u32 full_bw; /* recent bw, to estimate if pipe is full */
+};
+
+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
+
+/* Window length of bw filter (in rounds): */
+static const int bbr_bw_rtts = CYCLE_LEN + 2;
+/* Window length of min_rtt filter (in sec): */
+static const u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
+static const u32 bbr_probe_rtt_mode_ms = 200;
+/* Skip TSO below the following bandwidth (bits/sec): */
+static const int bbr_min_tso_rate = 1200000;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would:
+ */
+static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round:
+ */
+static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
+static const int bbr_cwnd_gain = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
+static const int bbr_pacing_gain[] = {
+ BBR_UNIT * 5 / 4, /* probe for more available bw */
+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */
+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */
+};
+/* Randomize the starting gain cycling phase over N phases: */
+static const u32 bbr_cycle_rand = 7;
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight:
+ */
+static const u32 bbr_cwnd_min_target = 4;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* If bw has increased significantly (1.25x), there may be more bw available: */
+static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
+static const u32 bbr_full_bw_cnt = 3;
+
+/* "long-term" ("LT") bandwidth estimator parameters... */
+/* The minimum number of rounds in an LT bw sampling interval: */
+static const u32 bbr_lt_intvl_min_rtts = 4;
+/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
+static const u32 bbr_lt_loss_thresh = 50;
+/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
+static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
+/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
+static const u32 bbr_lt_bw_diff = 4000 / 8;
+/* If we estimate we're policed, use lt_bw for this many round trips: */
+static const u32 bbr_lt_bw_max_rtts = 48;
+
+/* Do we estimate that STARTUP filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+ const struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->full_bw_cnt >= bbr_full_bw_cnt;
+}
+
+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return minmax_get(&bbr->bw);
+}
+
+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+{
+ rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
+ rate *= gain;
+ rate >>= BBR_SCALE;
+ rate *= USEC_PER_SEC;
+ return rate >> BW_SCALE;
+}
+
+/* Pace using current bw estimate and a gain factor. In order to help drive the
+ * network toward lower queues while maintaining high utilization and low
+ * latency, the average pacing rate aims to be slightly (~1%) lower than the
+ * estimated bandwidth. This is an important aspect of the design. In this
+ * implementation this slightly lower pacing rate is achieved implicitly by not
+ * including link-layer headers in the packet size used for the pacing rate.
+ */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 rate = bw;
+
+ rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+ rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+ if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
+ sk->sk_pacing_rate = rate;
+}
+
+/* Return count of segments we want in the skbs we send, or 0 for default. */
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return bbr->tso_segs_goal;
+}
+
+static void bbr_set_tso_segs_goal(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 min_segs;
+
+ min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+ bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs),
+ 0x7FU);
+}
+
+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
+static void bbr_save_cwnd(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
+ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */
+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
+ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
+}
+
+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (event == CA_EVENT_TX_START && tp->app_limited) {
+ bbr->idle_restart = 1;
+ /* Avoid pointless buffer overflows: pace at est. bw if we don't
+ * need more speed (we're restarting from idle and app-limited).
+ */
+ if (bbr->mode == BBR_PROBE_BW)
+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+ }
+}
+
+/* Find target cwnd. Right-size the cwnd based on min RTT and the
+ * estimated bottleneck bandwidth:
+ *
+ * cwnd = bw * min_rtt * gain = BDP * gain
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ *
+ * To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ * - one skb in sending host Qdisc,
+ * - one skb in sending host TSO/GSO engine
+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd;
+ u64 w;
+
+ /* If we've never had a valid RTT sample, cap cwnd at the initial
+ * default. This should only happen when the connection is not using TCP
+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+ * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+ */
+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */
+ return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/
+
+ w = (u64)bw * bbr->min_rtt_us;
+
+ /* Apply a gain to the given value, then remove the BW_SCALE shift. */
+ cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+ /* Allow enough full-sized skbs in flight to utilize end systems. */
+ cwnd += 3 * bbr->tso_segs_goal;
+
+ /* Reduce delayed ACKs by rounding up cwnd to the next even number. */
+ cwnd = (cwnd + 1) & ~1U;
+
+ return cwnd;
+}
+
+/* An optimization in BBR to reduce losses: On the first round of recovery, we
+ * follow the packet conservation principle: send P packets per P packets acked.
+ * After that, we slow-start and send at most 2*P packets per P packets acked.
+ * After recovery finishes, or upon undo, we restore the cwnd we had when
+ * recovery started (capped by the target cwnd based on estimated BDP).
+ *
+ * TODO(ycheng/ncardwell): implement a rate-based approach.
+ */
+static bool bbr_set_cwnd_to_recover_or_restore(
+ struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
+ u32 cwnd = tp->snd_cwnd;
+
+ /* An ACK for P pkts should release at most 2*P packets. We do this
+ * in two steps. First, here we deduct the number of lost packets.
+ * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
+ */
+ if (rs->losses > 0)
+ cwnd = max_t(s32, cwnd - rs->losses, 1);
+
+ if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
+ /* Starting 1st round of Recovery, so do packet conservation. */
+ bbr->packet_conservation = 1;
+ bbr->next_rtt_delivered = tp->delivered; /* start round now */
+ /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
+ cwnd = tcp_packets_in_flight(tp) + acked;
+ } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
+ /* Exiting loss recovery; restore cwnd saved before recovery. */
+ bbr->restore_cwnd = 1;
+ bbr->packet_conservation = 0;
+ }
+ bbr->prev_ca_state = state;
+
+ if (bbr->restore_cwnd) {
+ /* Restore cwnd after exiting loss recovery or PROBE_RTT. */
+ cwnd = max(cwnd, bbr->prior_cwnd);
+ bbr->restore_cwnd = 0;
+ }
+
+ if (bbr->packet_conservation) {
+ *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
+ return true; /* yes, using packet conservation */
+ }
+ *new_cwnd = cwnd;
+ return false;
+}
+
+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
+ * has drawn us down below target), or snap down to target if we're above it.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+ u32 acked, u32 bw, int gain)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cwnd = 0, target_cwnd = 0;
+
+ if (!acked)
+ return;
+
+ if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
+ goto done;
+
+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */
+ target_cwnd = bbr_target_cwnd(sk, bw, gain);
+ if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
+ cwnd = min(cwnd + acked, target_cwnd);
+ else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
+ cwnd = cwnd + acked;
+ cwnd = max(cwnd, bbr_cwnd_min_target);
+
+done:
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */
+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
+ tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
+}
+
+/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
+static bool bbr_is_next_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool is_full_length =
+ skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) >
+ bbr->min_rtt_us;
+ u32 inflight, bw;
+
+ /* The pacing_gain of 1.0 paces at the estimated bw to try to fully
+ * use the pipe without increasing the queue.
+ */
+ if (bbr->pacing_gain == BBR_UNIT)
+ return is_full_length; /* just use wall clock time */
+
+ inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
+ bw = bbr_max_bw(sk);
+
+ /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
+ * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
+ * small (e.g. on a LAN). We do not persist if packets are lost, since
+ * a path with small buffers may not hold that much.
+ */
+ if (bbr->pacing_gain > BBR_UNIT)
+ return is_full_length &&
+ (rs->losses || /* perhaps pacing_gain*BDP won't fit */
+ inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
+
+ /* A pacing_gain < 1.0 tries to drain extra queue we added if bw
+ * probing didn't find more bw. If inflight falls to match BDP then we
+ * estimate queue is drained; persisting would underutilize the pipe.
+ */
+ return is_full_length ||
+ inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
+}
+
+static void bbr_advance_cycle_phase(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
+ bbr->cycle_mstamp = tp->delivered_mstamp;
+ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
+}
+
+/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
+static void bbr_update_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
+ bbr_is_next_cycle_phase(sk, rs))
+ bbr_advance_cycle_phase(sk);
+}
+
+static void bbr_reset_startup_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_STARTUP;
+ bbr->pacing_gain = bbr_high_gain;
+ bbr->cwnd_gain = bbr_high_gain;
+}
+
+static void bbr_reset_probe_bw_mode(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->mode = BBR_PROBE_BW;
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = bbr_cwnd_gain;
+ bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
+ bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
+}
+
+static void bbr_reset_mode(struct sock *sk)
+{
+ if (!bbr_full_bw_reached(sk))
+ bbr_reset_startup_mode(sk);
+ else
+ bbr_reset_probe_bw_mode(sk);
+}
+
+/* Start a new long-term sampling interval. */
+static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
+ bbr->lt_last_delivered = tp->delivered;
+ bbr->lt_last_lost = tp->lost;
+ bbr->lt_rtt_cnt = 0;
+}
+
+/* Completely reset long-term bandwidth sampling. */
+static void bbr_reset_lt_bw_sampling(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->lt_bw = 0;
+ bbr->lt_use_bw = 0;
+ bbr->lt_is_sampling = false;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Long-term bw sampling interval is done. Estimate whether we're policed. */
+static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 diff;
+
+ if (bbr->lt_bw) { /* do we have bw from a previous interval? */
+ /* Is new bw close to the lt_bw from the previous interval? */
+ diff = abs(bw - bbr->lt_bw);
+ if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
+ (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
+ bbr_lt_bw_diff)) {
+ /* All criteria are met; estimate we're policed. */
+ bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */
+ bbr->lt_use_bw = 1;
+ bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */
+ bbr->lt_rtt_cnt = 0;
+ return;
+ }
+ }
+ bbr->lt_bw = bw;
+ bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
+ * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
+ * explicitly models their policed rate, to reduce unnecessary losses. We
+ * estimate that we're policed if we see 2 consecutive sampling intervals with
+ * consistent throughput and high packet loss. If we think we're being policed,
+ * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+ */
+static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 lost, delivered;
+ u64 bw;
+ s32 t;
+
+ if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */
+ if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
+ ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */
+ bbr_reset_probe_bw_mode(sk); /* restart gain cycling */
+ }
+ return;
+ }
+
+ /* Wait for the first loss before sampling, to let the policer exhaust
+ * its tokens and estimate the steady-state rate allowed by the policer.
+ * Starting samples earlier includes bursts that over-estimate the bw.
+ */
+ if (!bbr->lt_is_sampling) {
+ if (!rs->losses)
+ return;
+ bbr_reset_lt_bw_sampling_interval(sk);
+ bbr->lt_is_sampling = true;
+ }
+
+ /* To avoid underestimates, reset sampling if we run out of data. */
+ if (rs->is_app_limited) {
+ bbr_reset_lt_bw_sampling(sk);
+ return;
+ }
+
+ if (bbr->round_start)
+ bbr->lt_rtt_cnt++; /* count round trips in this interval */
+ if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
+ return; /* sampling interval needs to be longer */
+ if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
+ bbr_reset_lt_bw_sampling(sk); /* interval is too long */
+ return;
+ }
+
+ /* End sampling interval when a packet is lost, so we estimate the
+ * policer tokens were exhausted. Stopping the sampling before the
+ * tokens are exhausted under-estimates the policed rate.
+ */
+ if (!rs->losses)
+ return;
+
+ /* Calculate packets lost and delivered in sampling interval. */
+ lost = tp->lost - bbr->lt_last_lost;
+ delivered = tp->delivered - bbr->lt_last_delivered;
+ /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
+ if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
+ return;
+
+ /* Find average delivery rate in this sampling interval. */
+ t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
+ if (t < 1)
+ return; /* interval is less than one jiffy, so wait */
+ t = jiffies_to_usecs(t);
+ /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */
+ if (t < 1) {
+ bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */
+ return;
+ }
+ bw = (u64)delivered * BW_UNIT;
+ do_div(bw, t);
+ bbr_lt_bw_interval_done(sk, bw);
+}
+
+/* Estimate the bandwidth based on how fast packets are delivered */
+static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->round_start = 0;
+ if (rs->delivered < 0 || rs->interval_us <= 0)
+ return; /* Not a valid observation */
+
+ /* See if we've reached the next RTT */
+ if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+ bbr->next_rtt_delivered = tp->delivered;
+ bbr->rtt_cnt++;
+ bbr->round_start = 1;
+ bbr->packet_conservation = 0;
+ }
+
+ bbr_lt_bw_sampling(sk, rs);
+
+ /* Divide delivered by the interval to find a (lower bound) bottleneck
+ * bandwidth sample. Delivered is in packets and interval_us in uS and
+ * ratio will be <<1 for most connections. So delivered is first scaled.
+ */
+ bw = (u64)rs->delivered * BW_UNIT;
+ do_div(bw, rs->interval_us);
+
+ /* If this sample is application-limited, it is likely to have a very
+ * low delivered count that represents application behavior rather than
+ * the available network rate. Such a sample could drag down estimated
+ * bw, causing needless slow-down. Thus, to continue to send at the
+ * last measured network rate, we filter out app-limited samples unless
+ * they describe the path bw at least as well as our bw model.
+ *
+ * So the goal during app-limited phase is to proceed with the best
+ * network rate no matter how long. We automatically leave this
+ * phase when app writes faster than the network can deliver :)
+ */
+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
+ /* Incorporate new sample into our max bw filter. */
+ minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
+ }
+}
+
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw_thresh;
+
+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
+ return;
+
+ bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
+ if (bbr_max_bw(sk) >= bw_thresh) {
+ bbr->full_bw = bbr_max_bw(sk);
+ bbr->full_bw_cnt = 0;
+ return;
+ }
+ ++bbr->full_bw_cnt;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+ bbr->mode = BBR_DRAIN; /* drain queue we created */
+ bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
+ bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
+ } /* fall through to check if in-flight is already small: */
+ if (bbr->mode == BBR_DRAIN &&
+ tcp_packets_in_flight(tcp_sk(sk)) <=
+ bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+ bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool filter_expired;
+
+ /* Track min RTT seen in the min_rtt_win_sec filter window: */
+ filter_expired = after(tcp_time_stamp,
+ bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+ if (rs->rtt_us >= 0 &&
+ (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
+ bbr->min_rtt_us = rs->rtt_us;
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ }
+
+ if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */
+ bbr->probe_rtt_done_stamp = 0;
+ }
+
+ if (bbr->mode == BBR_PROBE_RTT) {
+ /* Ignore low rate samples during this mode. */
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+ /* Maintain min packets in flight for max(200 ms, 1 round). */
+ if (!bbr->probe_rtt_done_stamp &&
+ tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+ bbr->probe_rtt_done_stamp = tcp_time_stamp +
+ msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+ bbr->probe_rtt_round_done = 0;
+ bbr->next_rtt_delivered = tp->delivered;
+ } else if (bbr->probe_rtt_done_stamp) {
+ if (bbr->round_start)
+ bbr->probe_rtt_round_done = 1;
+ if (bbr->probe_rtt_round_done &&
+ after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
+ bbr->min_rtt_stamp = tcp_time_stamp;
+ bbr->restore_cwnd = 1; /* snap to prior_cwnd */
+ bbr_reset_mode(sk);
+ }
+ }
+ }
+ bbr->idle_restart = 0;
+}
+
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+{
+ bbr_update_bw(sk, rs);
+ bbr_update_cycle_phase(sk, rs);
+ bbr_check_full_bw_reached(sk, rs);
+ bbr_check_drain(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+}
+
+static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw;
+
+ bbr_update_model(sk, rs);
+
+ bw = bbr_bw(sk);
+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+ bbr_set_tso_segs_goal(sk);
+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
+}
+
+static void bbr_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->prior_cwnd = 0;
+ bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */
+ bbr->rtt_cnt = 0;
+ bbr->next_rtt_delivered = 0;
+ bbr->prev_ca_state = TCP_CA_Open;
+ bbr->packet_conservation = 0;
+
+ bbr->probe_rtt_done_stamp = 0;
+ bbr->probe_rtt_round_done = 0;
+ bbr->min_rtt_us = tcp_min_rtt(tp);
+ bbr->min_rtt_stamp = tcp_time_stamp;
+
+ minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */
+
+ /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+ bw = (u64)tp->snd_cwnd * BW_UNIT;
+ do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
+ sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */
+ bbr_set_pacing_rate(sk, bw, bbr_high_gain);
+
+ bbr->restore_cwnd = 0;
+ bbr->round_start = 0;
+ bbr->idle_restart = 0;
+ bbr->full_bw = 0;
+ bbr->full_bw_cnt = 0;
+ bbr->cycle_mstamp.v64 = 0;
+ bbr->cycle_idx = 0;
+ bbr_reset_lt_bw_sampling(sk);
+ bbr_reset_startup_mode(sk);
+}
+
+static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+ return 3;
+}
+
+/* In theory BBR does not need to undo the cwnd since it does not
+ * always reduce cwnd on losses (see bbr_main()). Keep it for now.
+ */
+static u32 bbr_undo_cwnd(struct sock *sk)
+{
+ return tcp_sk(sk)->snd_cwnd;
+}
+
+/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
+static u32 bbr_ssthresh(struct sock *sk)
+{
+ bbr_save_cwnd(sk);
+ return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */
+}
+
+static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw = bbr_bw(sk);
+
+ bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
+ memset(&info->bbr, 0, sizeof(info->bbr));
+ info->bbr.bbr_bw_lo = (u32)bw;
+ info->bbr.bbr_bw_hi = (u32)(bw >> 32);
+ info->bbr.bbr_min_rtt = bbr->min_rtt_us;
+ info->bbr.bbr_pacing_gain = bbr->pacing_gain;
+ info->bbr.bbr_cwnd_gain = bbr->cwnd_gain;
+ *attr = INET_DIAG_BBRINFO;
+ return sizeof(info->bbr);
+ }
+ return 0;
+}
+
+static void bbr_set_state(struct sock *sk, u8 new_state)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Loss) {
+ struct rate_sample rs = { .losses = 1 };
+
+ bbr->prev_ca_state = TCP_CA_Loss;
+ bbr->full_bw = 0;
+ bbr->round_start = 1; /* treat RTO like end of a round */
+ bbr_lt_bw_sampling(sk, &rs);
+ }
+}
+
+static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
+ .flags = TCP_CONG_NON_RESTRICTED,
+ .name = "bbr",
+ .owner = THIS_MODULE,
+ .init = bbr_init,
+ .cong_control = bbr_main,
+ .sndbuf_expand = bbr_sndbuf_expand,
+ .undo_cwnd = bbr_undo_cwnd,
+ .cwnd_event = bbr_cwnd_event,
+ .ssthresh = bbr_ssthresh,
+ .tso_segs_goal = bbr_tso_segs_goal,
+ .get_info = bbr_get_info,
+ .set_state = bbr_set_state,
+};
+
+static int __init bbr_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_bbr_cong_ops);
+}
+
+static void __exit bbr_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
+}
+
+module_init(bbr_register);
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 03725b294286..35b280361cb2 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
module_param(use_tolerance, bool, 0644);
MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
-struct minmax {
+struct cdg_minmax {
union {
struct {
s32 min;
@@ -74,10 +74,10 @@ enum cdg_state {
};
struct cdg {
- struct minmax rtt;
- struct minmax rtt_prev;
- struct minmax *gradients;
- struct minmax gsum;
+ struct cdg_minmax rtt;
+ struct cdg_minmax rtt_prev;
+ struct cdg_minmax *gradients;
+ struct cdg_minmax gsum;
bool gfilled;
u8 tail;
u8 state;
@@ -353,7 +353,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct minmax *gradients;
+ struct cdg_minmax *gradients;
switch (ev) {
case CA_EVENT_CWND_RESTART:
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 882caa4e72bc..1294af4e0127 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
int ret = 0;
/* all algorithms must implement ssthresh and cong_avoid ops */
- if (!ca->ssthresh || !ca->cong_avoid) {
+ if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 54d9f9b0120f..4e777a3243f9 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -150,6 +150,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
tp->segs_in = 0;
tcp_segs_in(tp, skb);
__skb_pull(skb, tcp_hdrlen(skb));
+ sk_forced_mem_schedule(sk, skb->truesize);
skb_set_owner_r(skb, sk);
TCP_SKB_CB(skb)->seq++;
@@ -226,6 +227,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tcp_fastopen_add_skb(child, skb);
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
+ tp->rcv_wup = tp->rcv_nxt;
/* tcp_conn_request() is sending the SYNACK,
* and queues the child into listener accept queue.
*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f3a9f3c2c8d8..8c6ad2d319d6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
static void tcp_sndbuf_expand(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
int sndmem, per_mss;
u32 nr_segs;
@@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
* Cubic needs 1.7 factor, rounded to 2 to include
* extra cushion (application might react slowly to POLLOUT)
*/
- sndmem = 2 * nr_segs * per_mss;
+ sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+ sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
+/* Sum the number of packets on the wire we have marked as lost.
+ * There are two cases we care about here:
+ * a) Packet hasn't been marked lost (nor retransmitted),
+ * and this is the first loss.
+ * b) Packet has been marked both lost and retransmitted,
+ * and this means we think it was lost again.
+ */
+static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ if (!(sacked & TCPCB_LOST) ||
+ ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
+ tp->lost += tcp_skb_pcount(skb);
+}
+
static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tcp_verify_retransmit_hint(tp, skb);
tp->lost_out += tcp_skb_pcount(skb);
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}
}
@@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
+ tcp_sum_lost(tp, skb);
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1094,6 +1114,7 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+ struct rate_sample *rate;
int flag;
};
@@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
@@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_advance_highest_sack(sk, skb);
tcp_skb_collapse_tstamp(prev, skb);
+ if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+ TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
@@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
dup_sack,
tcp_skb_pcount(skb),
&skb->skb_mstamp);
+ tcp_rate_skb_delivered(sk, skb, state->rate);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una);
- if (found_dup_sack)
+ if (found_dup_sack) {
state->flag |= FLAG_DSACKING_ACK;
+ tp->delivered++; /* A spurious retransmission is delivered */
+ }
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk)
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
+ bool mark_lost;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
+ mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+ is_reneg);
+ if (mark_lost)
+ tcp_sum_lost(tp, skb);
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+ if (mark_lost) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -2503,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ if (inet_csk(sk)->icsk_ca_ops->cong_control)
+ return;
+
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
(tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@ -2879,67 +2915,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
*rexmit = REXMIT_LOST;
}
-/* Kathleen Nichols' algorithm for tracking the minimum value of
- * a data stream over some fixed time interval. (E.g., the minimum
- * RTT over the past five minutes.) It uses constant space and constant
- * time per update yet almost always delivers the same minimum as an
- * implementation that has to keep all the data in the window.
- *
- * The algorithm keeps track of the best, 2nd best & 3rd best min
- * values, maintaining an invariant that the measurement time of the
- * n'th best >= n-1'th best. It also makes sure that the three values
- * are widely separated in the time window since that bounds the worse
- * case error when that data is monotonically increasing over the window.
- *
- * Upon getting a new min, we can forget everything earlier because it
- * has no value - the new min is <= everything else in the window by
- * definition and it's the most recent. So we restart fresh on every new min
- * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
- * best.
- */
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
- const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
- struct rtt_meas *m = tcp_sk(sk)->rtt_min;
- struct rtt_meas rttm = {
- .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
- .ts = now,
- };
- u32 elapsed;
-
- /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
- if (unlikely(rttm.rtt <= m[0].rtt))
- m[0] = m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[1].rtt)
- m[1] = m[2] = rttm;
- else if (rttm.rtt <= m[2].rtt)
- m[2] = rttm;
-
- elapsed = now - m[0].ts;
- if (unlikely(elapsed > wlen)) {
- /* Passed entire window without a new min so make 2nd choice
- * the new min & 3rd choice the new 2nd. So forth and so on.
- */
- m[0] = m[1];
- m[1] = m[2];
- m[2] = rttm;
- if (now - m[0].ts > wlen) {
- m[0] = m[1];
- m[1] = rttm;
- if (now - m[0].ts > wlen)
- m[0] = rttm;
- }
- } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
- /* Passed a quarter of the window without a new min so
- * take 2nd choice from the 2nd quarter of the window.
- */
- m[2] = m[1] = rttm;
- } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
- /* Passed half the window without a new min so take the 3rd
- * choice from the last half of the window.
- */
- m[2] = rttm;
- }
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
+
+ minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
+ rtt_us ? : jiffies_to_usecs(1));
}
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@ -3102,10 +3084,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked,
- struct tcp_sacktag_state *sack)
+ struct tcp_sacktag_state *sack,
+ struct skb_mstamp *now)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct skb_mstamp first_ackt, last_ackt, now;
+ struct skb_mstamp first_ackt, last_ackt;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3137,7 +3120,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
-
fully_acked = false;
} else {
/* Speedup tcp_unlink_write_queue() and next loop */
@@ -3173,6 +3155,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
+ tcp_rate_skb_delivered(sk, skb, sack->rate);
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
@@ -3205,16 +3188,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
- skb_mstamp_get(&now);
if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
- seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
}
if (sack->first_sackt.v64) {
- sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
- ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+ sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+ ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
}
-
+ sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
ca_rtt_us);
@@ -3242,7 +3224,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
- sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+ sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
@@ -3333,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
* information. All transmission or retransmission are delayed afterwards.
*/
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
- int flag)
+ int flag, const struct rate_sample *rs)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->cong_control) {
+ icsk->icsk_ca_ops->cong_control(sk, rs);
+ return;
+ }
+
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
tcp_cwnd_reduction(sk, acked_sacked, flag);
@@ -3579,17 +3568,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sacktag_state sack_state;
+ struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
u32 prior_fackets;
int prior_packets = tp->packets_out;
- u32 prior_delivered = tp->delivered;
+ u32 delivered = tp->delivered;
+ u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ struct skb_mstamp now;
sack_state.first_sackt.v64 = 0;
+ sack_state.rate = &rs;
/* We very likely will need to access write queue head. */
prefetchw(sk->sk_write_queue.next);
@@ -3612,6 +3605,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
+ skb_mstamp_get(&now);
+
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -3622,6 +3617,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
}
prior_fackets = tp->fackets_out;
+ rs.prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
@@ -3677,7 +3673,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state);
+ &sack_state, &now);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3694,7 +3690,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
- tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+ delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
+ lost = tp->lost - lost; /* freshly marked lost */
+ tcp_rate_gen(sk, delivered, lost, &now, &rs);
+ tcp_cong_control(sk, ack, delivered, flag, &rs);
tcp_xmit_recovery(sk, rexmit);
return 1;
@@ -4108,7 +4107,7 @@ void tcp_fin(struct sock *sk)
/* It _is_ possible, that we have something out-of-order _after_ FIN.
* Probably, we should reset in this case. For now drop them.
*/
- __skb_queue_purge(&tp->out_of_order_queue);
+ skb_rbtree_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
sk_mem_reclaim(sk);
@@ -4268,7 +4267,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
int this_sack;
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
- if (skb_queue_empty(&tp->out_of_order_queue)) {
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
return;
}
@@ -4344,10 +4343,13 @@ static void tcp_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 dsack_high = tp->rcv_nxt;
+ bool fin, fragstolen, eaten;
struct sk_buff *skb, *tail;
- bool fragstolen, eaten;
+ struct rb_node *p;
- while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+ p = rb_first(&tp->out_of_order_queue);
+ while (p) {
+ skb = rb_entry(p, struct sk_buff, rbnode);
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
@@ -4357,9 +4359,10 @@ static void tcp_ofo_queue(struct sock *sk)
dsack_high = TCP_SKB_CB(skb)->end_seq;
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &tp->out_of_order_queue);
- __skb_unlink(skb, &tp->out_of_order_queue);
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
tcp_drop(sk, skb);
continue;
@@ -4371,12 +4374,19 @@ static void tcp_ofo_queue(struct sock *sk)
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
if (!eaten)
__skb_queue_tail(&sk->sk_receive_queue, skb);
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- tcp_fin(sk);
- if (eaten)
+ else
kfree_skb_partial(skb, fragstolen);
+
+ if (unlikely(fin)) {
+ tcp_fin(sk);
+ /* tcp_fin() purges tp->out_of_order_queue,
+ * so we must end this loop right now.
+ */
+ break;
+ }
}
}
@@ -4403,8 +4413,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct rb_node **p, *q, *parent;
struct sk_buff *skb1;
u32 seq, end_seq;
+ bool fragstolen;
tcp_ecn_check_ce(tp, skb);
@@ -4419,88 +4431,92 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ seq = TCP_SKB_CB(skb)->seq;
+ end_seq = TCP_SKB_CB(skb)->end_seq;
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ tp->rcv_nxt, seq, end_seq);
- skb1 = skb_peek_tail(&tp->out_of_order_queue);
- if (!skb1) {
+ p = &tp->out_of_order_queue.rb_node;
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
- tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
- tp->selective_acks[0].end_seq =
- TCP_SKB_CB(skb)->end_seq;
+ tp->selective_acks[0].start_seq = seq;
+ tp->selective_acks[0].end_seq = end_seq;
}
- __skb_queue_head(&tp->out_of_order_queue, skb);
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+ tp->ooo_last_skb = skb;
goto end;
}
- seq = TCP_SKB_CB(skb)->seq;
- end_seq = TCP_SKB_CB(skb)->end_seq;
-
- if (seq == TCP_SKB_CB(skb1)->end_seq) {
- bool fragstolen;
-
- if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
- } else {
- tcp_grow_window(sk, skb);
- kfree_skb_partial(skb, fragstolen);
- skb = NULL;
- }
-
- if (!tp->rx_opt.num_sacks ||
- tp->selective_acks[0].end_seq != seq)
- goto add_sack;
-
- /* Common case: data arrive in order after hole. */
- tp->selective_acks[0].end_seq = end_seq;
- goto end;
- }
-
- /* Find place to insert this segment. */
- while (1) {
- if (!after(TCP_SKB_CB(skb1)->seq, seq))
- break;
- if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
- skb1 = NULL;
- break;
- }
- skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
- }
-
- /* Do skb overlap to previous one? */
- if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- /* All the bits are present. Drop. */
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb);
- skb = NULL;
- tcp_dsack_set(sk, seq, end_seq);
- goto add_sack;
+ /* In the typical case, we are adding an skb to the end of the list.
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+ if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+coalesce_done:
+ tcp_grow_window(sk, skb);
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL;
+ goto add_sack;
+ }
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
+ parent = &tp->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
+
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+ p = &parent->rb_left;
+ continue;
}
- if (after(seq, TCP_SKB_CB(skb1)->seq)) {
- /* Partial overlap. */
- tcp_dsack_set(sk, seq,
- TCP_SKB_CB(skb1)->end_seq);
- } else {
- if (skb_queue_is_first(&tp->out_of_order_queue,
- skb1))
- skb1 = NULL;
- else
- skb1 = skb_queue_prev(
- &tp->out_of_order_queue,
- skb1);
+ if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+ }
+ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+ /* Partial overlap. */
+ tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &tp->out_of_order_queue);
+ tcp_dsack_extend(sk,
+ TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb1);
+ goto merge_right;
+ }
+ } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+ goto coalesce_done;
}
+ p = &parent->rb_right;
}
- if (!skb1)
- __skb_queue_head(&tp->out_of_order_queue, skb);
- else
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
- /* And clean segments covered by new one as whole. */
- while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
- skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+merge_right:
+ /* Remove other segments covered by skb. */
+ while ((q = rb_next(&skb->rbnode)) != NULL) {
+ skb1 = rb_entry(q, struct sk_buff, rbnode);
if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
break;
@@ -4509,12 +4525,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
end_seq);
break;
}
- __skb_unlink(skb1, &tp->out_of_order_queue);
+ rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
tcp_drop(sk, skb1);
}
+ /* If there is no skb after us, we are the last_skb ! */
+ if (!q)
+ tp->ooo_last_skb = skb;
add_sack:
if (tcp_is_sack(tp))
@@ -4651,13 +4670,13 @@ queue_and_out:
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
- if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC2581. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
- if (skb_queue_empty(&tp->out_of_order_queue))
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
inet_csk(sk)->icsk_ack.pingpong = 0;
}
@@ -4711,48 +4730,76 @@ drop:
tcp_data_queue_ofo(sk, skb);
}
+static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+{
+ if (list)
+ return !skb_queue_is_last(list, skb) ? skb->next : NULL;
+
+ return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+}
+
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
- struct sk_buff_head *list)
+ struct sk_buff_head *list,
+ struct rb_root *root)
{
- struct sk_buff *next = NULL;
+ struct sk_buff *next = tcp_skb_next(skb, list);
- if (!skb_queue_is_last(list, skb))
- next = skb_queue_next(list, skb);
+ if (list)
+ __skb_unlink(skb, list);
+ else
+ rb_erase(&skb->rbnode, root);
- __skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
}
+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sk_buff *skb1;
+
+ while (*p) {
+ parent = *p;
+ skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, root);
+}
+
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
*
- * If tail is NULL, this means until the end of the list.
+ * If tail is NULL, this means until the end of the queue.
*
* Segments with FIN/SYN are not collapsed (only because this
* simplifies code)
*/
static void
-tcp_collapse(struct sock *sk, struct sk_buff_head *list,
- struct sk_buff *head, struct sk_buff *tail,
- u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+ struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
{
- struct sk_buff *skb, *n;
+ struct sk_buff *skb = head, *n;
+ struct sk_buff_head tmp;
bool end_of_skbs;
/* First, check that queue is collapsible and find
- * the point where collapsing can be useful. */
- skb = head;
+ * the point where collapsing can be useful.
+ */
restart:
- end_of_skbs = true;
- skb_queue_walk_from_safe(list, skb, n) {
- if (skb == tail)
- break;
+ for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+ n = tcp_skb_next(skb, list);
+
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
- skb = tcp_collapse_one(sk, skb, list);
+ skb = tcp_collapse_one(sk, skb, list, root);
if (!skb)
break;
goto restart;
@@ -4770,13 +4817,10 @@ restart:
break;
}
- if (!skb_queue_is_last(list, skb)) {
- struct sk_buff *next = skb_queue_next(list, skb);
- if (next != tail &&
- TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
- end_of_skbs = false;
- break;
- }
+ if (n && n != tail &&
+ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+ end_of_skbs = false;
+ break;
}
/* Decided to skip this, advance start seq. */
@@ -4786,17 +4830,22 @@ restart:
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
return;
+ __skb_queue_head_init(&tmp);
+
while (before(start, end)) {
int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
struct sk_buff *nskb;
nskb = alloc_skb(copy, GFP_ATOMIC);
if (!nskb)
- return;
+ break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
- __skb_queue_before(list, skb, nskb);
+ if (list)
+ __skb_queue_before(list, skb, nskb);
+ else
+ __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
skb_set_owner_r(nskb, sk);
/* Copy data, releasing collapsed skbs. */
@@ -4814,14 +4863,17 @@ restart:
start += size;
}
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
- skb = tcp_collapse_one(sk, skb, list);
+ skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
- return;
+ goto end;
}
}
}
+end:
+ skb_queue_walk_safe(&tmp, skb, n)
+ tcp_rbtree_insert(root, skb);
}
/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -4830,43 +4882,43 @@ restart:
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
- struct sk_buff *head;
+ struct sk_buff *skb, *head;
+ struct rb_node *p;
u32 start, end;
- if (!skb)
+ p = rb_first(&tp->out_of_order_queue);
+ skb = rb_entry_safe(p, struct sk_buff, rbnode);
+new_range:
+ if (!skb) {
+ p = rb_last(&tp->out_of_order_queue);
+ /* Note: This is possible p is NULL here. We do not
+ * use rb_entry_safe(), as ooo_last_skb is valid only
+ * if rbtree is not empty.
+ */
+ tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
return;
-
+ }
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
- head = skb;
-
- for (;;) {
- struct sk_buff *next = NULL;
- if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
- next = skb_queue_next(&tp->out_of_order_queue, skb);
- skb = next;
+ for (head = skb;;) {
+ skb = tcp_skb_next(skb, NULL);
- /* Segment is terminated when we see gap or when
- * we are at the end of all the queue. */
+ /* Range is terminated when we see a gap or when
+ * we are at the queue end.
+ */
if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, &tp->out_of_order_queue,
+ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
head, skb, start, end);
- head = skb;
- if (!skb)
- break;
- /* Start new segment */
+ goto new_range;
+ }
+
+ if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
end = TCP_SKB_CB(skb)->end_seq;
- } else {
- if (before(TCP_SKB_CB(skb)->seq, start))
- start = TCP_SKB_CB(skb)->seq;
- if (after(TCP_SKB_CB(skb)->end_seq, end))
- end = TCP_SKB_CB(skb)->end_seq;
- }
}
}
@@ -4883,20 +4935,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
+ struct rb_node *node, *prev;
- if (skb_queue_empty(&tp->out_of_order_queue))
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
return false;
NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-
- while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
- tcp_drop(sk, skb);
+ node = &tp->ooo_last_skb->rbnode;
+ do {
+ prev = rb_prev(node);
+ rb_erase(node, &tp->out_of_order_queue);
+ tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!tcp_under_memory_pressure(sk))
break;
- }
+ node = prev;
+ } while (node);
+ tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
/* Reset SACK state. A conforming SACK implementation will
* do the same at a timeout based retransmit. When a connection
@@ -4930,7 +4986,7 @@ static int tcp_prune_queue(struct sock *sk)
tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue))
- tcp_collapse(sk, &sk->sk_receive_queue,
+ tcp_collapse(sk, &sk->sk_receive_queue, NULL,
skb_peek(&sk->sk_receive_queue),
NULL,
tp->copied_seq, tp->rcv_nxt);
@@ -5035,7 +5091,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
- (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+ (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
/* Then ack it now */
tcp_send_ack(sk);
} else {
@@ -5894,7 +5950,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* so release it.
*/
if (req) {
- tp->total_retrans = req->num_retrans;
+ inet_csk(sk)->icsk_retransmits = 0;
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for correct metrics. */
@@ -5936,7 +5992,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else
tcp_init_metrics(sk);
- tcp_update_pacing_rate(sk);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_time_stamp;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13b05adf9d3e..7ac37c314312 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1844,7 +1844,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_write_queue_purge(sk);
/* Cleans up our, hopefully empty, out_of_order_queue. */
- __skb_queue_purge(&tp->out_of_order_queue);
+ skb_rbtree_purge(&tp->out_of_order_queue);
#ifdef CONFIG_TCP_MD5SIG
/* Clean up the MD5 key list, if any */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4b95ec4ed2c8..6234ebaa7db1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- newtp->rtt_min[0].rtt = ~0U;
+ minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -487,8 +487,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ newtp->app_limited = ~0U;
+
tcp_init_xmit_timers(newsk);
- __skb_queue_head_init(&newtp->out_of_order_queue);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
newtp->rx_opt.saw_tstamp = 0;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5c5964962d0c..bc68da38ea86 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -90,12 +90,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
goto out;
}
- /* GSO partial only requires splitting the frame into an MSS
- * multiple and possibly a remainder. So update the mss now.
- */
- if (features & NETIF_F_GSO_PARTIAL)
- mss = skb->len - (skb->len % mss);
-
copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
@@ -108,6 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
/* Only first segment might have ooo_okay set */
segs->ooo_okay = ooo_okay;
+ /* GSO partial and frag_list segmentation only requires splitting
+ * the frame into an MSS multiple and possibly a remainder, both
+ * cases return a GSO skb. So update the mss now.
+ */
+ if (skb_is_gso(segs))
+ mss *= skb_shinfo(segs)->gso_segs;
+
delta = htonl(oldlen + (thlen + mss));
skb = segs;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8b45794eb6b2..7c777089a4d6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -734,9 +734,16 @@ static void tcp_tsq_handler(struct sock *sk)
{
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
- TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->lost_out > tp->retrans_out &&
+ tp->snd_cwnd > tcp_packets_in_flight(tp))
+ tcp_xmit_retransmit_queue(sk);
+
+ tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
0, GFP_ATOMIC);
+ }
}
/*
* One tasklet per cpu tries to send more skbs.
@@ -918,6 +925,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_mstamp_get(&skb->skb_mstamp);
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
+ tcp_rate_skb_sent(sk, skb);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
@@ -1213,6 +1221,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
+ /* Update delivered info for the new segment */
+ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
+
/* If this packet has been sent out already, we must
* adjust the various packet counters.
*/
@@ -1358,6 +1369,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
}
return mtu;
}
+EXPORT_SYMBOL(tcp_mss_to_mtu);
/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
@@ -1545,7 +1557,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
/* Return how many segs we'd like on a TSO packet,
* to send one TSO packet per ms
*/
-static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+ int min_tso_segs)
{
u32 bytes, segs;
@@ -1557,10 +1570,23 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
* This preserves ACK clocking and is consistent
* with tcp_tso_should_defer() heuristic.
*/
- segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+ segs = max_t(u32, bytes / mss_now, min_tso_segs);
return min_t(u32, segs, sk->sk_gso_max_segs);
}
+EXPORT_SYMBOL(tcp_tso_autosize);
+
+/* Return the number of segments we want in the skb we are transmitting.
+ * See if congestion control module wants to decide; otherwise, autosize.
+ */
+static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
+
+ return tso_segs ? :
+ tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
+}
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
@@ -2020,6 +2046,39 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
+/* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * (These limits are doubled for retransmits)
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ */
+static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
+ unsigned int factor)
+{
+ unsigned int limit;
+
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+ limit <<= factor;
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ return true;
+ }
+ return false;
+}
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2057,7 +2116,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
}
- max_segs = tcp_tso_autosize(sk, mss_now);
+ max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
@@ -2106,29 +2165,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
- /* TCP Small Queues :
- * Control number of packets in qdisc/devices to two packets / or ~1 ms.
- * This allows for :
- * - better RTT estimation and ACK scheduling
- * - faster recovery
- * - high rates
- * Alas, some drivers / subsystems require a fair amount
- * of queued bytes to ensure line rate.
- * One example is wifi aggregation (802.11 AMPDU)
- */
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
- limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
-
- if (atomic_read(&sk->sk_wmem_alloc) > limit) {
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
- /* It is possible TX completion already happened
- * before we set TSQ_THROTTLED, so we must
- * test again the condition.
- */
- smp_mb__after_atomic();
- if (atomic_read(&sk->sk_wmem_alloc) > limit)
- break;
- }
+ if (tcp_small_queue_check(sk, skb, 0))
+ break;
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
@@ -2605,7 +2643,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
* copying overhead: fragmentation, tunneling, mangling etc.
*/
if (atomic_read(&sk->sk_wmem_alloc) >
- min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
+ min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
+ sk->sk_sndbuf))
return -EAGAIN;
if (skb_still_in_host_queue(sk, skb))
@@ -2774,7 +2813,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
- max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
+ max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
__u8 sacked;
int segs;
@@ -2828,10 +2867,13 @@ begin_fwd:
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
+ if (tcp_small_queue_check(sk, skb, 1))
+ return;
+
if (tcp_retransmit_skb(sk, skb, segs))
return;
- NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
@@ -3568,6 +3610,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ if (unlikely(tcp_passive_fastopen(sk)))
+ tcp_sk(sk)->total_retrans++;
}
return res;
}
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
new file mode 100644
index 000000000000..9be1581a5a08
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,186 @@
+#include <net/tcp.h>
+
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
+ * bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* In general we need to start delivery rate samples from the
+ * time we received the most recent ACK, to ensure we include
+ * the full time the network needs to deliver all in-flight
+ * packets. If there are no packets in flight yet, then we
+ * know that any ACKs after now indicate that the network was
+ * able to deliver those packets completely in the sampling
+ * interval between now and the next ACK.
+ *
+ * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+ * because the latter is a guess based on RTO and loss-marking
+ * heuristics. We don't want spurious RTOs or loss markings to cause
+ * a spuriously small time interval, causing a spuriously high
+ * bandwidth estimate.
+ */
+ if (!tp->packets_out) {
+ tp->first_tx_mstamp = skb->skb_mstamp;
+ tp->delivered_mstamp = skb->skb_mstamp;
+ }
+
+ TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the highest prior_delivered count.
+ */
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+ struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+ if (!scb->tx.delivered_mstamp.v64)
+ return;
+
+ if (!rs->prior_delivered ||
+ after(scb->tx.delivered, rs->prior_delivered)) {
+ rs->prior_delivered = scb->tx.delivered;
+ rs->prior_mstamp = scb->tx.delivered_mstamp;
+ rs->is_app_limited = scb->tx.is_app_limited;
+ rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+
+ /* Find the duration of the "send phase" of this window: */
+ rs->interval_us = skb_mstamp_us_delta(
+ &skb->skb_mstamp,
+ &scb->tx.first_tx_mstamp);
+
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = skb->skb_mstamp;
+ }
+ /* Mark off the skb delivered once it's sacked to avoid being
+ * used again when it's cumulatively acked. For acked packets
+ * we don't need to reset since it'll be freed soon.
+ */
+ if (scb->sacked & TCPCB_SACKED_ACKED)
+ scb->tx.delivered_mstamp.v64 = 0;
+}
+
+/* Update the connection delivery information and generate a rate sample. */
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+ struct skb_mstamp *now, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 snd_us, ack_us;
+
+ /* Clear app limited if bubble is acked and gone. */
+ if (tp->app_limited && after(tp->delivered, tp->app_limited))
+ tp->app_limited = 0;
+
+ /* TODO: there are multiple places throughout tcp_ack() to get
+ * current time. Refactor the code using a new "tcp_acktag_state"
+ * to carry current time, flags, stats like "tcp_sacktag_state".
+ */
+ if (delivered)
+ tp->delivered_mstamp = *now;
+
+ rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
+ rs->losses = lost; /* freshly marked lost */
+ /* Return an invalid sample if no timing information is available. */
+ if (!rs->prior_mstamp.v64) {
+ rs->delivered = -1;
+ rs->interval_us = -1;
+ return;
+ }
+ rs->delivered = tp->delivered - rs->prior_delivered;
+
+ /* Model sending data and receiving ACKs as separate pipeline phases
+ * for a window. Usually the ACK phase is longer, but with ACK
+ * compression the send phase can be longer. To be safe we use the
+ * longer phase.
+ */
+ snd_us = rs->interval_us; /* send phase */
+ ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */
+ rs->interval_us = max(snd_us, ack_us);
+
+ /* Normally we expect interval_us >= min-rtt.
+ * Note that rate may still be over-estimated when a spuriously
+ * retransmistted skb was first (s)acked because "interval_us"
+ * is under-estimated (up to an RTT). However continuously
+ * measuring the delivery rate during loss recovery is crucial
+ * for connections suffer heavy or prolonged losses.
+ */
+ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+ if (!rs->is_retrans)
+ pr_debug("tcp rate: %ld %d %u %u %u\n",
+ rs->interval_us, rs->delivered,
+ inet_csk(sk)->icsk_ca_state,
+ tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+ rs->interval_us = -1;
+ return;
+ }
+
+ /* Record the last non-app-limited or the highest app-limited bw */
+ if (!rs->is_app_limited ||
+ ((u64)rs->delivered * tp->rate_interval_us >=
+ (u64)tp->rate_delivered * rs->interval_us)) {
+ tp->rate_delivered = rs->delivered;
+ tp->rate_interval_us = rs->interval_us;
+ tp->rate_app_limited = rs->is_app_limited;
+ }
+}
+
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (/* We have less than one packet to send. */
+ tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+ /* Nothing in sending host's qdisc queues or NIC tx queue. */
+ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+ /* We are not limited by CWND. */
+ tcp_packets_in_flight(tp) < tp->snd_cwnd &&
+ /* All lost packets have been retransmitted. */
+ tp->lost_out <= tp->retrans_out)
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d84930b2dd95..f712b411f6ed 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -384,6 +384,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
*/
inet_rtx_syn_ack(sk, req);
req->num_timeout++;
+ icsk->icsk_retransmits++;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
}
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 028eb046ea40..9c5fc973267f 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -76,7 +76,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh)
+ if (tcp_in_slow_start(tp))
tcp_slow_start(tp, acked);
else if (!yeah->doing_reno_now) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 058c31286ce1..7d96dc2d3d08 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1021,12 +1021,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flow_flags,
faddr, saddr, dport, inet->inet_sport);
- if (!saddr && ipc.oif) {
- err = l3mdev_get_saddr(net, ipc.oif, fl4);
- if (err < 0)
- goto out;
- }
-
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 8a9f6e535caa..9a89c10a55f0 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -20,7 +20,7 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc)
+ struct nlattr *bc, bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
return inet_sk_diag_fill(sk, NULL, skb, req,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
}
static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
@@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
err = inet_sk_diag_fill(sk, NULL, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
kfree_skb(rep);
@@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
int num, s_num, slot, s_slot;
@@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
spin_unlock_bh(&hslot->lock);
goto done;
}
@@ -186,8 +188,8 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
sk = __udp4_lib_lookup(net,
- req->id.idiag_dst[0], req->id.idiag_dport,
- req->id.idiag_src[0], req->id.idiag_sport,
+ req->id.idiag_dst[3], req->id.idiag_dport,
+ req->id.idiag_src[3], req->id.idiag_sport,
req->id.idiag_if, tbl, NULL);
else
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 81f253b6ff36..f9333c963607 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
__be16 new_protocol, bool is_ipv6)
{
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
- bool remcsum, need_csum, offload_csum, ufo;
+ bool remcsum, need_csum, offload_csum, ufo, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct udphdr *uh = udp_hdr(skb);
u16 mac_offset = skb->mac_header;
@@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
goto out;
}
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
outer_hlen = skb_tnl_header_len(skb);
udp_offset = outer_hlen - tnl_hlen;
skb = segs;
@@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* will be using a length value equal to only one MSS sized
* segment instead of the entire frame.
*/
- if (skb_is_gso(skb)) {
+ if (gso_partial) {
uh->len = htons(skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)uh);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b644a23c3db0..6a7ff6957535 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -29,7 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
memset(fl4, 0, sizeof(*fl4));
fl4->daddr = daddr->a4;
fl4->flowi4_tos = tos;
- fl4->flowi4_oif = oif;
+ fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
if (saddr)
fl4->saddr = saddr->a4;
@@ -112,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
int oif = 0;
if (skb_dst(skb))
- oif = l3mdev_fib_oif(skb_dst(skb)->dev);
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f418d2eaeddd..2f1f5d439788 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -778,7 +778,14 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
}
if (p == &net->ipv6.devconf_all->forwarding) {
+ int old_dflt = net->ipv6.devconf_dflt->forwarding;
+
net->ipv6.devconf_dflt->forwarding = newf;
+ if ((!newf) ^ (!old_dflt))
+ inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+
addrconf_forward_change(net, newf);
if ((!newf) ^ (!old))
inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING,
@@ -1941,6 +1948,7 @@ errdad:
spin_unlock_bh(&ifp->lock);
addrconf_mod_dad_work(ifp, 0);
+ in6_ifa_put(ifp);
}
/* Join to solicited addr multicast group.
@@ -3850,6 +3858,7 @@ static void addrconf_dad_work(struct work_struct *w)
addrconf_dad_begin(ifp);
goto out;
} else if (action == DAD_ABORT) {
+ in6_ifa_hold(ifp);
addrconf_dad_stop(ifp, 1);
if (disable_ipv6)
addrconf_ifdown(idev->dev, 0);
@@ -6025,7 +6034,7 @@ static const struct ctl_table addrconf_sysctl[] = {
static int __addrconf_sysctl_register(struct net *net, char *dev_name,
struct inet6_dev *idev, struct ipv6_devconf *p)
{
- int i;
+ int i, ifindex;
struct ctl_table *table;
char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];
@@ -6045,6 +6054,13 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name,
if (!p->sysctl_header)
goto free;
+ if (!strcmp(dev_name, "all"))
+ ifindex = NETCONFA_IFINDEX_ALL;
+ else if (!strcmp(dev_name, "default"))
+ ifindex = NETCONFA_IFINDEX_DEFAULT;
+ else
+ ifindex = idev->dev->ifindex;
+ inet6_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p);
return 0;
free:
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 5857c1fc8b67..eea23b57c6a5 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -38,6 +38,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
.flags = FIB_LOOKUP_NOREF,
};
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
fib_rules_lookup(net->ipv6.fib6_rules_ops,
flowi6_to_flowi(fl6), flags, &arg);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 771be1fa4176..ef5485204522 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -743,6 +743,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
(info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
+ u16 nlflags = NLM_F_EXCL;
int err;
ins = &fn->leaf;
@@ -759,6 +760,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
if (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_EXCL))
return -EEXIST;
+
+ nlflags &= ~NLM_F_EXCL;
if (replace) {
if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
found++;
@@ -856,6 +859,7 @@ next_iter:
pr_warn("NLM_F_CREATE should be set when creating new route\n");
add:
+ nlflags |= NLM_F_CREATE;
err = fib6_commit_metrics(&rt->dst, mxc);
if (err)
return err;
@@ -864,7 +868,7 @@ add:
*ins = rt;
rt->rt6i_node = fn;
atomic_inc(&rt->rt6i_ref);
- inet6_rt_notify(RTM_NEWROUTE, rt, info, 0);
+ inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
if (!(fn->fn_flags & RTN_RTINFO)) {
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 397e1ed3daa3..4ce74f86291b 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1239,7 +1239,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]);
if (data[IFLA_GRE_FLOWINFO])
- parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]);
+ parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]);
if (data[IFLA_GRE_FLAGS])
parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 22e90e56b5a9..e7bfd55899a3 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
int offset = 0;
bool encap, udpfrag;
int nhoff;
+ bool gso_partial;
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
@@ -101,9 +102,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
if (IS_ERR(segs))
goto out;
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
for (skb = segs; skb; skb = skb->next) {
ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
- if (skb_is_gso(skb))
+ if (gso_partial)
payload_len = skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)(ipv6h + 1);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 993fd9666f1b..6001e781164e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -236,6 +236,14 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_OUT, skb->len);
+
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_out((struct sock *)sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
/* hooks should never assume socket lock is held.
* we promote our socket to non const
*/
@@ -918,13 +926,6 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
int err;
int flags = 0;
- if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
- (!*dst || !(*dst)->error)) {
- err = l3mdev_get_saddr6(net, sk, fl6);
- if (err)
- goto out_err;
- }
-
/* The correct way to handle this would be to do
* ip6_route_get_saddr, and then ip6_route_output; however,
* the route-specific preferred source forces the
@@ -1016,7 +1017,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
out_err_release:
dst_release(*dst);
*dst = NULL;
-out_err:
+
if (err == -ENETUNREACH)
IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
return err;
@@ -1062,8 +1063,6 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
return ERR_PTR(err);
if (final_dst)
fl6->daddr = *final_dst;
- if (!fl6->flowi6_oif)
- fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 2050217df565..6a66adba0c22 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -57,6 +57,7 @@
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/dst_metadata.h>
MODULE_AUTHOR("Ville Nuorvala");
MODULE_DESCRIPTION("IPv6 tunneling device");
@@ -90,6 +91,7 @@ struct ip6_tnl_net {
struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
struct ip6_tnl __rcu *tnls_wc[1];
struct ip6_tnl __rcu **tnls[2];
+ struct ip6_tnl __rcu *collect_md_tun;
};
static struct net_device_stats *ip6_get_stats(struct net_device *dev)
@@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
return t;
}
+ t = rcu_dereference(ip6n->collect_md_tun);
+ if (t)
+ return t;
+
t = rcu_dereference(ip6n->tnls_wc[0]);
if (t && (t->dev->flags & IFF_UP))
return t;
@@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
{
struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ip6n->collect_md_tun, t);
rcu_assign_pointer(t->next , rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
}
@@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
struct ip6_tnl __rcu **tp;
struct ip6_tnl *iter;
+ if (t->parms.collect_md)
+ rcu_assign_pointer(ip6n->collect_md_tun, NULL);
+
for (tp = ip6_tnl_bucket(ip6n, &t->parms);
(iter = rtnl_dereference(*tp)) != NULL;
tp = &iter->next) {
@@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+ if (tun_dst)
+ skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
@@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
{
struct ip6_tnl *t;
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct metadata_dst *tun_dst = NULL;
int ret = -1;
rcu_read_lock();
@@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
goto drop;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
- ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate,
+ if (t->parms.collect_md) {
+ tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
+ if (!tun_dst)
+ return 0;
+ }
+ ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
log_ecn_error);
}
@@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
int mtu;
unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
unsigned int max_headroom = psh_hlen;
+ u8 hop_limit;
int err = -1;
+ if (t->parms.collect_md) {
+ hop_limit = skb_tunnel_info(skb)->key.ttl;
+ goto route_lookup;
+ } else {
+ hop_limit = t->parms.hop_limit;
+ }
+
/* NBMA tunnel */
if (ipv6_addr_any(&t->parms.raddr)) {
struct in6_addr *addr6;
@@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
goto tx_err_link_failure;
if (!dst) {
+route_lookup:
dst = ip6_route_output(net, NULL, fl6);
if (dst->error)
@@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
dst = NULL;
goto tx_err_link_failure;
}
+ if (t->parms.collect_md &&
+ ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+ &fl6->daddr, 0, &fl6->saddr))
+ goto tx_err_link_failure;
ndst = dst;
}
@@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
}
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
- if (skb_dst(skb))
+ if (skb_dst(skb) && !t->parms.collect_md)
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
if (skb->len > mtu && !skb_is_gso(skb)) {
*pmtu = mtu;
@@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
skb = new_skb;
}
- if (!fl6->flowi6_mark && ndst)
- dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
+ if (t->parms.collect_md) {
+ if (t->encap.type != TUNNEL_ENCAP_NONE)
+ goto tx_err_dst_release;
+ } else {
+ if (!fl6->flowi6_mark && ndst)
+ dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
+ }
skb_dst_set(skb, dst);
if (encap_limit >= 0) {
@@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
ipv6h = ipv6_hdr(skb);
ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
- ipv6h->hop_limit = t->parms.hop_limit;
+ ipv6h->hop_limit = hop_limit;
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr;
@@ -1170,18 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (tproto != IPPROTO_IPIP && tproto != 0)
return -1;
- if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- encap_limit = t->parms.encap_limit;
+ dsfield = ipv4_get_dsfield(iph);
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ if (t->parms.collect_md) {
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
- dsfield = ipv4_get_dsfield(iph);
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET6))
+ return -1;
+ key = &tun_info->key;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPIP;
+ fl6.daddr = key->u.ipv6.dst;
+ fl6.flowlabel = key->label;
+ } else {
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ encap_limit = t->parms.encap_limit;
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
- & IPV6_TCLASS_MASK;
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
- fl6.flowi6_mark = skb->mark;
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPIP;
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
+ & IPV6_TCLASS_MASK;
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6.flowi6_mark = skb->mark;
+ }
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@@ -1219,28 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
ip6_tnl_addr_conflict(t, ipv6h))
return -1;
- offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
- if (offset > 0) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
- if (tel->encap_limit == 0) {
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_HDR_FIELD, offset + 2);
+ dsfield = ipv6_get_dsfield(ipv6h);
+
+ if (t->parms.collect_md) {
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET6))
return -1;
+ key = &tun_info->key;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPV6;
+ fl6.daddr = key->u.ipv6.dst;
+ fl6.flowlabel = key->label;
+ } else {
+ offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+
+ tel = (void *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ encap_limit = tel->encap_limit - 1;
+ } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+ encap_limit = t->parms.encap_limit;
}
- encap_limit = tel->encap_limit - 1;
- } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- encap_limit = t->parms.encap_limit;
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPV6;
- dsfield = ipv6_get_dsfield(ipv6h);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
- fl6.flowlabel |= ip6_flowlabel(ipv6h);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
- fl6.flowi6_mark = skb->mark;
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6.flowlabel |= ip6_flowlabel(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6.flowi6_mark = skb->mark;
+ }
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@@ -1739,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev)
if (err)
return err;
ip6_tnl_link_config(t);
+ if (t->parms.collect_md) {
+ dev->features |= NETIF_F_NETNS_LOCAL;
+ netif_keep_dst(dev);
+ }
return 0;
}
@@ -1809,6 +1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
if (data[IFLA_IPTUN_PROTO])
parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+
+ if (data[IFLA_IPTUN_COLLECT_METADATA])
+ parms->collect_md = true;
}
static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
@@ -1848,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct net *net = dev_net(dev);
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct ip6_tnl *nt, *t;
struct ip_tunnel_encap ipencap;
@@ -1862,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
ip6_tnl_netlink_parms(data, &nt->parms);
- t = ip6_tnl_locate(net, &nt->parms, 0);
- if (!IS_ERR(t))
- return -EEXIST;
+ if (nt->parms.collect_md) {
+ if (rtnl_dereference(ip6n->collect_md_tun))
+ return -EEXIST;
+ } else {
+ t = ip6_tnl_locate(net, &nt->parms, 0);
+ if (!IS_ERR(t))
+ return -EEXIST;
+ }
return ip6_tnl_create2(dev);
}
@@ -1888,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
return err;
}
ip6_tnl_netlink_parms(data, &p);
+ if (p.collect_md)
+ return -EINVAL;
t = ip6_tnl_locate(net, &p, 0);
if (!IS_ERR(t)) {
@@ -1935,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_IPTUN_ENCAP_DPORT */
nla_total_size(2) +
+ /* IFLA_IPTUN_COLLECT_METADATA */
+ nla_total_size(0) +
0;
}
@@ -1953,16 +2043,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
goto nla_put_failure;
- if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
- tunnel->encap.type) ||
- nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
- tunnel->encap.sport) ||
- nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
- tunnel->encap.dport) ||
- nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
- tunnel->encap.flags))
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags))
goto nla_put_failure;
+ if (parm->collect_md)
+ if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -1990,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
};
static struct rtnl_link_ops ip6_link_ops __read_mostly = {
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index cc7e05898307..8a02ca8a11af 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb)
goto discard;
}
- XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
-
rcu_read_unlock();
- return xfrm6_rcv(skb);
+ return xfrm6_rcv_tnl(skb, t);
}
rcu_read_unlock();
return -EINVAL;
@@ -340,6 +338,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
struct net_device *dev;
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
+ struct xfrm_mode *inner_mode;
struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
u32 orig_mark = skb->mark;
int ret;
@@ -357,7 +356,19 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
}
x = xfrm_input_state(skb);
- family = x->inner_mode->afinfo->family;
+
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ family = inner_mode->afinfo->family;
skb->mark = be32_to_cpu(t->parms.i_key);
ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 6122f9c5cc49..fccb5dd91902 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2239,6 +2239,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
struct rta_mfc_stats mfcs;
struct nlattr *mp_attr;
struct rtnexthop *nhp;
+ unsigned long lastuse;
int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
@@ -2269,12 +2270,14 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
nla_nest_end(skb, mp_attr);
+ lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+ lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
- nla_put_u64_64bit(skb, RTA_EXPIRES,
- jiffies_to_clock_t(c->mfc_un.res.lastuse),
+ nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
RTA_PAD))
return -EMSGSIZE;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index fe65cdc28a45..d8e671457d10 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -67,7 +67,6 @@
#include <net/flow.h>
#include <net/ip6_checksum.h>
#include <net/inet_common.h>
-#include <net/l3mdev.h>
#include <linux/proc_fs.h>
#include <linux/netfilter.h>
@@ -457,11 +456,9 @@ static void ndisc_send_skb(struct sk_buff *skb,
if (!dst) {
struct flowi6 fl6;
- int oif = l3mdev_fib_oif(skb->dev);
+ int oif = skb->dev->ifindex;
icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
- if (oif != skb->dev->ifindex)
- fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
dst = icmp6_dst_alloc(skb->dev, &fl6);
if (IS_ERR(dst)) {
kfree_skb(skb);
@@ -1538,7 +1535,6 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
int rd_len;
u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
- int oif = l3mdev_fib_oif(dev);
bool ret;
if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
@@ -1555,10 +1551,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
}
icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
- &saddr_buf, &ipv6_hdr(skb)->saddr, oif);
-
- if (oif != skb->dev->ifindex)
- fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC;
+ &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 01eb0f658366..f2727475895e 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv,
struct in6_addr saddr, daddr;
u_int8_t hop_limit;
u32 mark, flowlabel;
+ int err;
nft_set_pktinfo_ipv6(&pkt, skb, state);
@@ -44,13 +45,16 @@ static unsigned int nf_route_table_hook(void *priv,
flowlabel = *((u32 *)ipv6_hdr(skb));
ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_QUEUE &&
+ if (ret != NF_DROP && ret != NF_STOLEN &&
(memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
- flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
- return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP;
+ flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
+ err = ip6_route_me_harder(state->net, skb);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
return ret;
}
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
index 533cd5719c59..92bda9908bb9 100644
--- a/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -47,6 +47,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = {
.eval = nft_reject_ipv6_eval,
.init = nft_reject_init,
.dump = nft_reject_dump,
+ .validate = nft_reject_validate,
};
static struct nft_expr_type nft_reject_ipv6_type __read_mostly = {
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 462f2a76b5c2..7cca8ac66fe9 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -148,6 +148,13 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
ipv6_hdr(skb)->payload_len = htons(len);
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 0900352c924c..0e983b694ee8 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -126,8 +126,10 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rt = (struct rt6_info *) dst;
np = inet6_sk(sk);
- if (!np)
- return -EBADF;
+ if (!np) {
+ err = -EBADF;
+ goto dst_err_out;
+ }
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
fl6.flowi6_oif = np->mcast_oif;
@@ -163,6 +165,9 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
release_sock(sk);
+dst_err_out:
+ dst_release(dst);
+
if (err)
return err;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 590dd1f7746f..54404f08efcc 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -653,6 +653,13 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
if (err)
goto error_fault;
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
NULL, rt->dst.dev, dst_output);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 09d43ff11a8d..5a5aeb92b4ec 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *
return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
-static struct dst_entry *ip6_route_input_lookup(struct net *net,
- struct net_device *dev,
- struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+ struct net_device *dev,
+ struct flowi6 *fl6, int flags)
{
if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
flags |= RT6_LOOKUP_F_IFACE;
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
+EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
void ip6_route_input(struct sk_buff *skb)
{
@@ -1164,7 +1165,7 @@ void ip6_route_input(struct sk_buff *skb)
int flags = RT6_LOOKUP_F_HAS_SADDR;
struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
- .flowi6_iif = l3mdev_fib_oif(skb->dev),
+ .flowi6_iif = skb->dev->ifindex,
.daddr = iph->daddr,
.saddr = iph->saddr,
.flowlabel = ip6_flowinfo(iph),
@@ -1188,12 +1189,15 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
struct flowi6 *fl6, int flags)
{
- struct dst_entry *dst;
bool any_src;
- dst = l3mdev_get_rt6_dst(net, fl6);
- if (dst)
- return dst;
+ if (rt6_need_strict(&fl6->daddr)) {
+ struct dst_entry *dst;
+
+ dst = l3mdev_link_scope_lookup(net, fl6);
+ if (dst)
+ return dst;
+ }
fl6->flowi6_iif = LOOPBACK_IFINDEX;
@@ -1988,9 +1992,18 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
if (!(gwa_type & IPV6_ADDR_UNICAST))
goto out;
- if (cfg->fc_table)
+ if (cfg->fc_table) {
grt = ip6_nh_lookup_table(net, cfg, gw_addr);
+ if (grt) {
+ if (grt->rt6i_flags & RTF_GATEWAY ||
+ (dev && dev != grt->dst.dev)) {
+ ip6_rt_put(grt);
+ grt = NULL;
+ }
+ }
+ }
+
if (!grt)
grt = rt6_lookup(net, gw_addr, NULL,
cfg->fc_ifindex, 1);
@@ -2558,8 +2571,16 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
{
u32 tb_id;
struct net *net = dev_net(idev->dev);
- struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
- DST_NOCOUNT);
+ struct net_device *dev = net->loopback_dev;
+ struct rt6_info *rt;
+
+ /* use L3 Master device as loopback for host routes if device
+ * is enslaved and address is not link local or multicast
+ */
+ if (!rt6_need_strict(addr))
+ dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
+
+ rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
if (!rt)
return ERR_PTR(-ENOMEM);
@@ -3338,11 +3359,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
} else {
fl6.flowi6_oif = oif;
- if (netif_index_is_l3_master(net, oif)) {
- fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
- FLOWI_FLAG_SKIP_NH_OIF;
- }
-
rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 04529a3d42cb..54cf7197c7ab 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -818,12 +818,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_proto = IPPROTO_TCP;
if (rt6_need_strict(&fl6.daddr) && !oif)
fl6.flowi6_oif = tcp_v6_iif(skb);
- else {
- if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
- oif = skb->skb_iif;
-
- fl6.flowi6_oif = oif;
- }
+ else
+ fl6.flowi6_oif = oif ? : skb->skb_iif;
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
fl6.fl6_dport = t1->dest;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 0eaab1fa6be5..b5789562aded 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -21,8 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
return xfrm6_extract_header(skb);
}
-int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
+int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
+ struct ip6_tnl *t)
{
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
return xfrm_input(skb, nexthdr, spi, 0);
@@ -48,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
return -1;
}
-int xfrm6_rcv(struct sk_buff *skb)
+int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t)
{
return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff],
- 0);
+ 0, t);
}
-EXPORT_SYMBOL(xfrm6_rcv);
+EXPORT_SYMBOL(xfrm6_rcv_tnl);
+int xfrm6_rcv(struct sk_buff *skb)
+{
+ return xfrm6_rcv_tnl(skb, NULL);
+}
+EXPORT_SYMBOL(xfrm6_rcv);
int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
xfrm_address_t *saddr, u8 proto)
{
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 6cc97003e4a9..e0f71c01d728 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -36,7 +36,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
int err;
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_oif = oif;
+ fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif);
fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;
memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
if (saddr)
@@ -134,7 +134,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
nexthdr = nh[nhoff];
if (skb_dst(skb))
- oif = l3mdev_fib_oif(skb_dst(skb)->dev);
+ oif = skb_dst(skb)->dev->ifindex;
memset(fl6, 0, sizeof(struct flowi6));
fl6->flowi6_mark = skb->mark;
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 5743044cd660..e1c0bbe7996c 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb)
__be32 spi;
spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr);
- return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi);
+ return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL);
}
static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index db639690c205..391c3cbd2eed 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -832,7 +832,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
struct sock *sk = sock->sk;
struct irda_sock *new, *self = irda_sk(sk);
struct sock *newsk;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
int err;
err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
@@ -897,7 +897,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
err = -EPERM; /* value does not seem to make sense. -arnd */
if (!new->tsap) {
pr_debug("%s(), dup failed!\n", __func__);
- kfree_skb(skb);
goto out;
}
@@ -916,7 +915,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
/* Clean up the original one to keep it in listen state */
irttp_listen(self->tsap);
- kfree_skb(skb);
sk->sk_ack_backlog--;
newsock->state = SS_CONNECTED;
@@ -924,6 +922,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
irda_connect_response(new);
err = 0;
out:
+ kfree_skb(skb);
release_sock(sk);
return err;
}
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 2632ac748371..b7f869a85ab7 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -23,6 +23,7 @@
#include <linux/socket.h>
#include <linux/uaccess.h>
#include <linux/workqueue.h>
+#include <linux/syscalls.h>
#include <net/kcm.h>
#include <net/netns/generic.h>
#include <net/sock.h>
@@ -1721,7 +1722,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
if (copy_to_user((void __user *)arg, &info,
sizeof(info))) {
err = -EFAULT;
- sock_release(newsock);
+ sys_close(info.fd);
}
}
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 1e40dacaa137..a2ed3bda4ddc 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1855,6 +1855,9 @@ static __net_exit void l2tp_exit_net(struct net *net)
(void)l2tp_tunnel_delete(tunnel);
}
rcu_read_unlock_bh();
+
+ flush_workqueue(l2tp_wq);
+ rcu_barrier();
}
static struct pernet_operations l2tp_net_ops = {
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 5871537af387..2599af6378e4 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -139,7 +139,7 @@ struct l2tp_session {
void (*session_close)(struct l2tp_session *session);
void (*ref)(struct l2tp_session *session);
void (*deref)(struct l2tp_session *session);
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
void (*show)(struct seq_file *m, void *priv);
#endif
uint8_t priv[0]; /* private data */
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 57fc5a46ce06..965f7e344cef 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -121,7 +121,7 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
}
-static struct net_device_ops l2tp_eth_netdev_ops = {
+static const struct net_device_ops l2tp_eth_netdev_ops = {
.ndo_init = l2tp_eth_dev_init,
.ndo_uninit = l2tp_eth_dev_uninit,
.ndo_start_xmit = l2tp_eth_dev_xmit,
@@ -195,7 +195,7 @@ static void l2tp_eth_delete(struct l2tp_session *session)
}
}
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
static void l2tp_eth_show(struct seq_file *m, void *arg)
{
struct l2tp_session *session = arg;
@@ -268,7 +268,7 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
priv->tunnel_sock = tunnel->sock;
session->recv_skb = l2tp_eth_dev_recv;
session->session_close = l2tp_eth_delete;
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
session->show = l2tp_eth_show;
#endif
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 34eff77982cf..41d47bfda15c 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -552,7 +552,7 @@ out:
return error;
}
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
static void pppol2tp_show(struct seq_file *m, void *arg)
{
struct l2tp_session *session = arg;
@@ -723,7 +723,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
session->recv_skb = pppol2tp_recv;
session->session_close = pppol2tp_session_close;
-#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
session->show = pppol2tp_show;
#endif
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index c4a1c3e84e12..8da86ceca33d 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -100,15 +100,14 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
/**
- * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. Returns
- * cached route for L3 master device if relevant
- * to flow
+ * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link
+ * local and multicast addresses
* @net: network namespace for device index lookup
* @fl6: IPv6 flow struct for lookup
*/
-struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
- struct flowi6 *fl6)
+struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
+ struct flowi6 *fl6)
{
struct dst_entry *dst = NULL;
struct net_device *dev;
@@ -121,70 +120,15 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
dev = netdev_master_upper_dev_get_rcu(dev);
if (dev && netif_is_l3_master(dev) &&
- dev->l3mdev_ops->l3mdev_get_rt6_dst)
- dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6);
+ dev->l3mdev_ops->l3mdev_link_scope_lookup)
+ dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6);
rcu_read_unlock();
}
return dst;
}
-EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst);
-
-/**
- * l3mdev_get_saddr - get source address for a flow based on an interface
- * enslaved to an L3 master device
- * @net: network namespace for device index lookup
- * @ifindex: Interface index
- * @fl4: IPv4 flow struct
- */
-
-int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4)
-{
- struct net_device *dev;
- int rc = 0;
-
- if (ifindex) {
- rcu_read_lock();
-
- dev = dev_get_by_index_rcu(net, ifindex);
- if (dev && netif_is_l3_slave(dev))
- dev = netdev_master_upper_dev_get_rcu(dev);
-
- if (dev && netif_is_l3_master(dev) &&
- dev->l3mdev_ops->l3mdev_get_saddr)
- rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4);
-
- rcu_read_unlock();
- }
-
- return rc;
-}
-EXPORT_SYMBOL_GPL(l3mdev_get_saddr);
-
-int l3mdev_get_saddr6(struct net *net, const struct sock *sk,
- struct flowi6 *fl6)
-{
- struct net_device *dev;
- int rc = 0;
-
- if (fl6->flowi6_oif) {
- rcu_read_lock();
-
- dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
- if (dev && netif_is_l3_slave(dev))
- dev = netdev_master_upper_dev_get_rcu(dev);
-
- if (dev && netif_is_l3_master(dev) &&
- dev->l3mdev_ops->l3mdev_get_saddr6)
- rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6);
-
- rcu_read_unlock();
- }
-
- return rc;
-}
-EXPORT_SYMBOL_GPL(l3mdev_get_saddr6);
+EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup);
/**
* l3mdev_fib_rule_match - Determine if flowi references an
@@ -222,3 +166,38 @@ out:
return rc;
}
+
+void l3mdev_update_flow(struct net *net, struct flowi *fl)
+{
+ struct net_device *dev;
+ int ifindex;
+
+ rcu_read_lock();
+
+ if (fl->flowi_oif) {
+ dev = dev_get_by_index_rcu(net, fl->flowi_oif);
+ if (dev) {
+ ifindex = l3mdev_master_ifindex_rcu(dev);
+ if (ifindex) {
+ fl->flowi_oif = ifindex;
+ fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF;
+ goto out;
+ }
+ }
+ }
+
+ if (fl->flowi_iif) {
+ dev = dev_get_by_index_rcu(net, fl->flowi_iif);
+ if (dev) {
+ ifindex = l3mdev_master_ifindex_rcu(dev);
+ if (ifindex) {
+ fl->flowi_iif = ifindex;
+ fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF;
+ }
+ }
+ }
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(l3mdev_update_flow);
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 8ae3ed97d95c..db916cf51ffe 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256];
static struct sockaddr_llc llc_ui_addrnull;
static const struct proto_ops llc_ui_ops;
-static long llc_ui_wait_for_conn(struct sock *sk, long timeout);
+static bool llc_ui_wait_for_conn(struct sock *sk, long timeout);
static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
@@ -551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
return rc;
}
-static long llc_ui_wait_for_conn(struct sock *sk, long timeout)
+static bool llc_ui_wait_for_conn(struct sock *sk, long timeout)
{
DEFINE_WAIT(wait);
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index a9aff6079c42..f6749dced021 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -261,10 +261,16 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
.timeout = timeout,
.ssn = start_seq_num,
};
-
int i, ret = -EOPNOTSUPP;
u16 status = WLAN_STATUS_REQUEST_DECLINED;
+ if (tid >= IEEE80211_FIRST_TSPEC_TSID) {
+ ht_dbg(sta->sdata,
+ "STA %pM requests BA session on unsupported tid %d\n",
+ sta->sta.addr, tid);
+ goto end_no_lock;
+ }
+
if (!sta->sta.ht_cap.ht_supported) {
ht_dbg(sta->sdata,
"STA %pM erroneously requests BA session on tid %d w/o QoS\n",
@@ -298,10 +304,13 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
buf_size = IEEE80211_MAX_AMPDU_BUF;
/* make sure the size doesn't exceed the maximum supported by the hw */
- if (buf_size > local->hw.max_rx_aggregation_subframes)
- buf_size = local->hw.max_rx_aggregation_subframes;
+ if (buf_size > sta->sta.max_rx_aggregation_subframes)
+ buf_size = sta->sta.max_rx_aggregation_subframes;
params.buf_size = buf_size;
+ ht_dbg(sta->sdata, "AddBA Req buf_size=%d for %pM\n",
+ buf_size, sta->sta.addr);
+
/* examine state machine */
mutex_lock(&sta->ampdu_mlme.mtx);
@@ -406,8 +415,10 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
}
end:
- if (status == WLAN_STATUS_SUCCESS)
+ if (status == WLAN_STATUS_SUCCESS) {
__set_bit(tid, sta->ampdu_mlme.agg_session_valid);
+ __clear_bit(tid, sta->ampdu_mlme.unexpected_agg);
+ }
mutex_unlock(&sta->ampdu_mlme.mtx);
end_no_lock:
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 5650c46bf91a..45319cc01121 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -584,6 +584,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW))
return -EINVAL;
+ if (WARN_ON(tid >= IEEE80211_FIRST_TSPEC_TSID))
+ return -EINVAL;
+
ht_dbg(sdata, "Open BA session requested for %pM tid %u\n",
pubsta->addr, tid);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 543b1d4fc33d..e29ff5749944 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -39,7 +39,7 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy,
if (type == NL80211_IFTYPE_MONITOR && flags) {
sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- sdata->u.mntr_flags = *flags;
+ sdata->u.mntr.flags = *flags;
}
return wdev;
@@ -73,8 +73,29 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
sdata->u.mgd.use_4addr = params->use_4addr;
}
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) {
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *monitor_sdata;
+ u32 mu_mntr_cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER;
+
+ monitor_sdata = rtnl_dereference(local->monitor_sdata);
+ if (monitor_sdata &&
+ wiphy_ext_feature_isset(wiphy, mu_mntr_cap_flag)) {
+ memcpy(monitor_sdata->vif.bss_conf.mu_group.membership,
+ params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN);
+ memcpy(monitor_sdata->vif.bss_conf.mu_group.position,
+ params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN,
+ WLAN_USER_POSITION_LEN);
+ monitor_sdata->vif.mu_mimo_owner = true;
+ ieee80211_bss_info_change_notify(monitor_sdata,
+ BSS_CHANGED_MU_GROUPS);
+
+ ether_addr_copy(monitor_sdata->u.mntr.mu_follow_addr,
+ params->macaddr);
+ }
+
+ if (!flags)
+ return 0;
if (ieee80211_sdata_running(sdata)) {
u32 mask = MONITOR_FLAG_COOK_FRAMES |
@@ -89,11 +110,11 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
* cooked_mntrs, monitor and all fif_* counters
* reconfigure hardware
*/
- if ((*flags & mask) != (sdata->u.mntr_flags & mask))
+ if ((*flags & mask) != (sdata->u.mntr.flags & mask))
return -EBUSY;
ieee80211_adjust_monitor_flags(sdata, -1);
- sdata->u.mntr_flags = *flags;
+ sdata->u.mntr.flags = *flags;
ieee80211_adjust_monitor_flags(sdata, 1);
ieee80211_configure_filter(local);
@@ -103,7 +124,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
* and ieee80211_do_open take care of "everything"
* mentioned in the comment above.
*/
- sdata->u.mntr_flags = *flags;
+ sdata->u.mntr.flags = *flags;
}
}
@@ -2940,10 +2961,6 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
}
chanctx = container_of(conf, struct ieee80211_chanctx, conf);
- if (!chanctx) {
- err = -EBUSY;
- goto out;
- }
ch_switch.timestamp = 0;
ch_switch.device_timestamp = 0;
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 2906c1004e1a..8ca62b6bb02a 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -71,138 +71,39 @@ DEBUGFS_READONLY_FILE(wep_iv, "%#08x",
DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s",
local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver");
-struct aqm_info {
- struct ieee80211_local *local;
- size_t size;
- size_t len;
- unsigned char buf[0];
-};
-
-#define AQM_HDR_LEN 200
-#define AQM_HW_ENTRY_LEN 40
-#define AQM_TXQ_ENTRY_LEN 110
-
-static int aqm_open(struct inode *inode, struct file *file)
+static ssize_t aqm_read(struct file *file,
+ char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
{
- struct ieee80211_local *local = inode->i_private;
- struct ieee80211_sub_if_data *sdata;
- struct sta_info *sta;
- struct txq_info *txqi;
+ struct ieee80211_local *local = file->private_data;
struct fq *fq = &local->fq;
- struct aqm_info *info = NULL;
+ char buf[200];
int len = 0;
- int i;
-
- if (!local->ops->wake_tx_queue)
- return -EOPNOTSUPP;
-
- len += AQM_HDR_LEN;
- len += 6 * AQM_HW_ENTRY_LEN;
-
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list)
- len += AQM_TXQ_ENTRY_LEN;
- list_for_each_entry_rcu(sta, &local->sta_list, list)
- len += AQM_TXQ_ENTRY_LEN * ARRAY_SIZE(sta->sta.txq);
- rcu_read_unlock();
-
- info = vmalloc(len);
- if (!info)
- return -ENOMEM;
spin_lock_bh(&local->fq.lock);
rcu_read_lock();
- file->private_data = info;
- info->local = local;
- info->size = len;
- len = 0;
-
- len += scnprintf(info->buf + len, info->size - len,
- "* hw\n"
- "access name value\n"
- "R fq_flows_cnt %u\n"
- "R fq_backlog %u\n"
- "R fq_overlimit %u\n"
- "R fq_collisions %u\n"
- "RW fq_limit %u\n"
- "RW fq_quantum %u\n",
- fq->flows_cnt,
- fq->backlog,
- fq->overlimit,
- fq->collisions,
- fq->limit,
- fq->quantum);
-
- len += scnprintf(info->buf + len,
- info->size - len,
- "* vif\n"
- "ifname addr ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n");
-
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- txqi = to_txq_info(sdata->vif.txq);
- len += scnprintf(info->buf + len, info->size - len,
- "%s %pM %u %u %u %u %u %u %u %u\n",
- sdata->name,
- sdata->vif.addr,
- txqi->txq.ac,
- txqi->tin.backlog_bytes,
- txqi->tin.backlog_packets,
- txqi->tin.flows,
- txqi->tin.overlimit,
- txqi->tin.collisions,
- txqi->tin.tx_bytes,
- txqi->tin.tx_packets);
- }
-
- len += scnprintf(info->buf + len,
- info->size - len,
- "* sta\n"
- "ifname addr tid ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n");
-
- list_for_each_entry_rcu(sta, &local->sta_list, list) {
- sdata = sta->sdata;
- for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
- txqi = to_txq_info(sta->sta.txq[i]);
- len += scnprintf(info->buf + len, info->size - len,
- "%s %pM %d %d %u %u %u %u %u %u %u\n",
- sdata->name,
- sta->sta.addr,
- txqi->txq.tid,
- txqi->txq.ac,
- txqi->tin.backlog_bytes,
- txqi->tin.backlog_packets,
- txqi->tin.flows,
- txqi->tin.overlimit,
- txqi->tin.collisions,
- txqi->tin.tx_bytes,
- txqi->tin.tx_packets);
- }
- }
-
- info->len = len;
+ len = scnprintf(buf, sizeof(buf),
+ "access name value\n"
+ "R fq_flows_cnt %u\n"
+ "R fq_backlog %u\n"
+ "R fq_overlimit %u\n"
+ "R fq_collisions %u\n"
+ "RW fq_limit %u\n"
+ "RW fq_quantum %u\n",
+ fq->flows_cnt,
+ fq->backlog,
+ fq->overlimit,
+ fq->collisions,
+ fq->limit,
+ fq->quantum);
rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
- return 0;
-}
-
-static int aqm_release(struct inode *inode, struct file *file)
-{
- vfree(file->private_data);
- return 0;
-}
-
-static ssize_t aqm_read(struct file *file,
- char __user *user_buf,
- size_t count,
- loff_t *ppos)
-{
- struct aqm_info *info = file->private_data;
-
return simple_read_from_buffer(user_buf, count, ppos,
- info->buf, info->len);
+ buf, len);
}
static ssize_t aqm_write(struct file *file,
@@ -210,8 +111,7 @@ static ssize_t aqm_write(struct file *file,
size_t count,
loff_t *ppos)
{
- struct aqm_info *info = file->private_data;
- struct ieee80211_local *local = info->local;
+ struct ieee80211_local *local = file->private_data;
char buf[100];
size_t len;
@@ -237,8 +137,7 @@ static ssize_t aqm_write(struct file *file,
static const struct file_operations aqm_ops = {
.write = aqm_write,
.read = aqm_read,
- .open = aqm_open,
- .release = aqm_release,
+ .open = simple_open,
.llseek = default_llseek,
};
@@ -302,6 +201,7 @@ static const char *hw_flag_names[] = {
FLAG(USES_RSS),
FLAG(TX_AMSDU),
FLAG(TX_FRAG_LIST),
+ FLAG(REPORTS_LOW_ACK),
#undef FLAG
};
@@ -428,7 +328,9 @@ void debugfs_hw_add(struct ieee80211_local *local)
DEBUGFS_ADD(hwflags);
DEBUGFS_ADD(user_power);
DEBUGFS_ADD(power);
- DEBUGFS_ADD_MODE(aqm, 0600);
+
+ if (local->ops->wake_tx_queue)
+ DEBUGFS_ADD_MODE(aqm, 0600);
statsd = debugfs_create_dir("statistics", phyd);
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index a5ba739cd2a7..5d35c0f37bb7 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -30,7 +30,7 @@ static ssize_t ieee80211_if_read(
size_t count, loff_t *ppos,
ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int))
{
- char buf[70];
+ char buf[200];
ssize_t ret = -EINVAL;
read_lock(&dev_base_lock);
@@ -486,6 +486,38 @@ static ssize_t ieee80211_if_fmt_num_buffered_multicast(
}
IEEE80211_IF_FILE_R(num_buffered_multicast);
+static ssize_t ieee80211_if_fmt_aqm(
+ const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct txq_info *txqi = to_txq_info(sdata->vif.txq);
+ int len;
+
+ spin_lock_bh(&local->fq.lock);
+ rcu_read_lock();
+
+ len = scnprintf(buf,
+ buflen,
+ "ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n"
+ "%u %u %u %u %u %u %u %u %u %u\n",
+ txqi->txq.ac,
+ txqi->tin.backlog_bytes,
+ txqi->tin.backlog_packets,
+ txqi->tin.flows,
+ txqi->cstats.drop_count,
+ txqi->cstats.ecn_mark,
+ txqi->tin.overlimit,
+ txqi->tin.collisions,
+ txqi->tin.tx_bytes,
+ txqi->tin.tx_packets);
+
+ rcu_read_unlock();
+ spin_unlock_bh(&local->fq.lock);
+
+ return len;
+}
+IEEE80211_IF_FILE_R(aqm);
+
/* IBSS attributes */
static ssize_t ieee80211_if_fmt_tsf(
const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
@@ -618,6 +650,9 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_2ghz);
DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz);
DEBUGFS_ADD(hw_queues);
+
+ if (sdata->local->ops->wake_tx_queue)
+ DEBUGFS_ADD(aqm);
}
static void add_sta_files(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index fd334133ff45..a2fcdb47a0e6 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -133,6 +133,55 @@ static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf,
}
STA_OPS(last_seq_ctrl);
+#define AQM_TXQ_ENTRY_LEN 130
+
+static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ struct ieee80211_local *local = sta->local;
+ size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1);
+ char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
+ struct txq_info *txqi;
+ ssize_t rv;
+ int i;
+
+ if (!buf)
+ return -ENOMEM;
+
+ spin_lock_bh(&local->fq.lock);
+ rcu_read_lock();
+
+ p += scnprintf(p,
+ bufsz+buf-p,
+ "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n");
+
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ txqi = to_txq_info(sta->sta.txq[i]);
+ p += scnprintf(p, bufsz+buf-p,
+ "%d %d %u %u %u %u %u %u %u %u %u\n",
+ txqi->txq.tid,
+ txqi->txq.ac,
+ txqi->tin.backlog_bytes,
+ txqi->tin.backlog_packets,
+ txqi->tin.flows,
+ txqi->cstats.drop_count,
+ txqi->cstats.ecn_mark,
+ txqi->tin.overlimit,
+ txqi->tin.collisions,
+ txqi->tin.tx_bytes,
+ txqi->tin.tx_packets);
+ }
+
+ rcu_read_unlock();
+ spin_unlock_bh(&local->fq.lock);
+
+ rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ kfree(buf);
+ return rv;
+}
+STA_OPS(aqm);
+
static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
@@ -478,6 +527,9 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments);
DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered);
+ if (local->ops->wake_tx_queue)
+ DEBUGFS_ADD(aqm);
+
if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
debugfs_create_x32("driver_buffered_tids", 0400,
sta->debugfs_dir,
@@ -492,10 +544,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
void ieee80211_sta_debugfs_remove(struct sta_info *sta)
{
- struct ieee80211_local *local = sta->local;
- struct ieee80211_sub_if_data *sdata = sta->sdata;
-
- drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs_dir);
debugfs_remove_recursive(sta->debugfs_dir);
sta->debugfs_dir = NULL;
}
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index c258f1041d33..c701b6438bd9 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -62,7 +62,7 @@ int drv_add_interface(struct ieee80211_local *local,
if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
(sdata->vif.type == NL80211_IFTYPE_MONITOR &&
!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) &&
- !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))))
+ !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))))
return -EINVAL;
trace_drv_add_interface(local, sdata);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 42a41ae405ba..fe35a1c0dc86 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -162,7 +162,8 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local,
return;
if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
- sdata->vif.type == NL80211_IFTYPE_MONITOR))
+ (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
+ !sdata->vif.mu_mimo_owner)))
return;
if (!check_sdata_in_driver(sdata))
@@ -498,21 +499,6 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local,
local->ops->sta_add_debugfs(&local->hw, &sdata->vif,
sta, dir);
}
-
-static inline void drv_sta_remove_debugfs(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- struct ieee80211_sta *sta,
- struct dentry *dir)
-{
- might_sleep();
-
- sdata = get_bss_sdata(sdata);
- check_sdata_in_driver(sdata);
-
- if (local->ops->sta_remove_debugfs)
- local->ops->sta_remove_debugfs(&local->hw, &sdata->vif,
- sta, dir);
-}
#endif
static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f56d342c31b8..e496dee5af08 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -3,7 +3,7 @@
* Copyright 2005, Devicescape Software, Inc.
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
- * Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2013-2015 Intel Mobile Communications GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -818,12 +818,18 @@ struct txq_info {
struct fq_tin tin;
struct fq_flow def_flow;
struct codel_vars def_cvars;
+ struct codel_stats cstats;
unsigned long flags;
/* keep last! */
struct ieee80211_txq txq;
};
+struct ieee80211_if_mntr {
+ u32 flags;
+ u8 mu_follow_addr[ETH_ALEN] __aligned(2);
+};
+
struct ieee80211_sub_if_data {
struct list_head list;
@@ -922,7 +928,7 @@ struct ieee80211_sub_if_data {
struct ieee80211_if_ibss ibss;
struct ieee80211_if_mesh mesh;
struct ieee80211_if_ocb ocb;
- u32 mntr_flags;
+ struct ieee80211_if_mntr mntr;
} u;
#ifdef CONFIG_MAC80211_DEBUGFS
@@ -1112,7 +1118,6 @@ struct ieee80211_local {
struct fq fq;
struct codel_vars *cvars;
struct codel_params cparams;
- struct codel_stats cstats;
const struct ieee80211_ops *ops;
@@ -1208,7 +1213,7 @@ struct ieee80211_local {
spinlock_t tim_lock;
unsigned long num_sta;
struct list_head sta_list;
- struct rhashtable sta_hash;
+ struct rhltable sta_hash;
struct timer_list sta_cleanup;
int sta_generation;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b123a9e325b3..b0abddc714ef 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -43,6 +43,8 @@
* by either the RTNL, the iflist_mtx or RCU.
*/
+static void ieee80211_iface_work(struct work_struct *work);
+
bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_chanctx_conf *chanctx_conf;
@@ -188,7 +190,7 @@ static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr,
continue;
if (iter->vif.type == NL80211_IFTYPE_MONITOR &&
- !(iter->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ !(iter->u.mntr.flags & MONITOR_FLAG_ACTIVE))
continue;
m = iter->vif.addr;
@@ -217,7 +219,7 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr)
return -EBUSY;
if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
- !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
check_dup = false;
ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup);
@@ -357,7 +359,7 @@ void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
const int offset)
{
struct ieee80211_local *local = sdata->local;
- u32 flags = sdata->u.mntr_flags;
+ u32 flags = sdata->u.mntr.flags;
#define ADJUST(_f, _s) do { \
if (flags & MONITOR_FLAG_##_f) \
@@ -448,6 +450,9 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
return ret;
}
+ skb_queue_head_init(&sdata->skb_queue);
+ INIT_WORK(&sdata->work, ieee80211_iface_work);
+
return 0;
}
@@ -589,12 +594,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
}
break;
case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
local->cooked_mntrs++;
break;
}
- if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) {
+ if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
res = drv_add_interface(local, sdata);
if (res)
goto err_stop;
@@ -926,7 +931,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
/* no need to tell driver */
break;
case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
local->cooked_mntrs--;
break;
}
@@ -1012,7 +1017,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
ieee80211_recalc_idle(local);
mutex_unlock(&local->mtx);
- if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
break;
/* fall through */
@@ -1444,7 +1449,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_MONITOR:
sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP;
sdata->dev->netdev_ops = &ieee80211_monitorif_ops;
- sdata->u.mntr_flags = MONITOR_FLAG_CONTROL |
+ sdata->u.mntr.flags = MONITOR_FLAG_CONTROL |
MONITOR_FLAG_OTHER_BSS;
break;
case NL80211_IFTYPE_WDS:
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index d00ea9b13f49..ac053a9df36d 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -660,6 +660,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
ieee80211_roc_setup(local);
+ local->hw.radiotap_timestamp.units_pos = -1;
+ local->hw.radiotap_timestamp.accuracy = -1;
+
return &local->hw;
err_free:
wiphy_free(wiphy);
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index fa7d37cf0351..b747c9645e43 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -757,6 +757,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
sta = next_hop_deref_protected(mpath);
if (mpath->flags & MESH_PATH_ACTIVE &&
ether_addr_equal(ta, sta->sta.addr) &&
+ !(mpath->flags & MESH_PATH_FIXED) &&
(!(mpath->flags & MESH_PATH_SN_VALID) ||
SN_GT(target_sn, mpath->sn) || target_sn == 0)) {
mpath->flags &= ~MESH_PATH_ACTIVE;
@@ -1023,7 +1024,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
goto enddiscovery;
spin_lock_bh(&mpath->state_lock);
- if (mpath->flags & MESH_PATH_DELETED) {
+ if (mpath->flags & (MESH_PATH_DELETED | MESH_PATH_FIXED)) {
spin_unlock_bh(&mpath->state_lock);
goto enddiscovery;
}
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 6db2ddfa0695..f0e6175a9821 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -826,7 +826,7 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop)
mpath->metric = 0;
mpath->hop_count = 0;
mpath->exp_time = 0;
- mpath->flags |= MESH_PATH_FIXED;
+ mpath->flags = MESH_PATH_FIXED | MESH_PATH_SN_VALID;
mesh_path_activate(mpath);
spin_unlock_bh(&mpath->state_lock);
mesh_path_tx_pending(mpath);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 8d426f637f58..7486f2dab4ba 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1672,11 +1672,15 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
non_acm_ac++)
if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac)))
break;
- /* The loop will result in using BK even if it requires
- * admission control, such configuration makes no sense
- * and we have to transmit somehow - the AC selection
- * does the same thing.
+ /* Usually the loop will result in using BK even if it
+ * requires admission control, but such a configuration
+ * makes no sense and we have to transmit somehow - the
+ * AC selection does the same thing.
+ * If we started out trying to downgrade from BK, then
+ * the extra condition here might be needed.
*/
+ if (non_acm_ac >= IEEE80211_NUM_ACS)
+ non_acm_ac = IEEE80211_AC_BK;
if (drv_conf_tx(local, sdata, ac,
&sdata->tx_conf[non_acm_ac]))
sdata_err(sdata,
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index 00a43a70e1fc..28a3a0957c9e 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -178,8 +178,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
WARN_ON(!list_empty(&local->chanctx_list));
/* stop hardware - this must stop RX */
- if (local->open_count)
- ieee80211_stop_device(local);
+ ieee80211_stop_device(local);
suspend:
local->suspended = true;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9dce3b157908..f7cf342bab52 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -180,6 +180,11 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
len += 12;
}
+ if (local->hw.radiotap_timestamp.units_pos >= 0) {
+ len = ALIGN(len, 8);
+ len += 12;
+ }
+
if (status->chains) {
/* antenna and antenna signal fields */
len += 2 * hweight8(status->chains);
@@ -447,6 +452,31 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
pos += 2;
}
+ if (local->hw.radiotap_timestamp.units_pos >= 0) {
+ u16 accuracy = 0;
+ u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT;
+
+ rthdr->it_present |=
+ cpu_to_le32(1 << IEEE80211_RADIOTAP_TIMESTAMP);
+
+ /* ensure 8 byte alignment */
+ while ((pos - (u8 *)rthdr) & 7)
+ pos++;
+
+ put_unaligned_le64(status->device_timestamp, pos);
+ pos += sizeof(u64);
+
+ if (local->hw.radiotap_timestamp.accuracy >= 0) {
+ accuracy = local->hw.radiotap_timestamp.accuracy;
+ flags |= IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY;
+ }
+ put_unaligned_le16(accuracy, pos);
+ pos += sizeof(u16);
+
+ *pos++ = local->hw.radiotap_timestamp.units_pos;
+ *pos++ = flags;
+ }
+
for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
*pos++ = status->chain_signal[chain];
*pos++ = chain;
@@ -485,6 +515,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
struct net_device *prev_dev = NULL;
int present_fcs_len = 0;
unsigned int rtap_vendor_space = 0;
+ struct ieee80211_mgmt *mgmt;
+ struct ieee80211_sub_if_data *monitor_sdata =
+ rcu_dereference(local->monitor_sdata);
if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
@@ -567,7 +600,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
continue;
- if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
continue;
if (!ieee80211_sdata_running(sdata))
@@ -585,6 +618,23 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
ieee80211_rx_stats(sdata->dev, skb->len);
}
+ mgmt = (void *)skb->data;
+ if (monitor_sdata &&
+ skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN &&
+ ieee80211_is_action(mgmt->frame_control) &&
+ mgmt->u.action.category == WLAN_CATEGORY_VHT &&
+ mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT &&
+ is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) &&
+ ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) {
+ struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC);
+
+ if (mu_skb) {
+ mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+ skb_queue_tail(&monitor_sdata->skb_queue, mu_skb);
+ ieee80211_queue_work(&local->hw, &monitor_sdata->work);
+ }
+ }
+
if (prev_dev) {
skb->dev = prev_dev;
netif_receive_skb(skb);
@@ -1072,8 +1122,15 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
- if (!tid_agg_rx)
+ if (!tid_agg_rx) {
+ if (ack_policy == IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK &&
+ !test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) &&
+ !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg))
+ ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid,
+ WLAN_BACK_RECIPIENT,
+ WLAN_REASON_QSTA_REQUIRE_SETUP);
goto dont_reorder;
+ }
/* qos null data frames are excluded */
if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC)))
@@ -2535,6 +2592,12 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
tid = le16_to_cpu(bar_data.control) >> 12;
+ if (!test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) &&
+ !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg))
+ ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid,
+ WLAN_BACK_RECIPIENT,
+ WLAN_REASON_QSTA_REQUIRE_SETUP);
+
tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]);
if (!tid_agg_rx)
return RX_DROP_MONITOR;
@@ -3147,7 +3210,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
continue;
if (sdata->vif.type != NL80211_IFTYPE_MONITOR ||
- !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES))
+ !(sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES))
continue;
if (prev_dev) {
@@ -3940,7 +4003,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
__le16 fc;
struct ieee80211_rx_data rx;
struct ieee80211_sub_if_data *prev;
- struct rhash_head *tmp;
+ struct rhlist_head *tmp;
int err = 0;
fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -3983,13 +4046,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
goto out;
} else if (ieee80211_is_data(fc)) {
struct sta_info *sta, *prev_sta;
- const struct bucket_table *tbl;
prev_sta = NULL;
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) {
+ for_each_sta_info(local, hdr->addr2, sta, tmp) {
if (!prev_sta) {
prev_sta = sta;
continue;
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 070b40f15850..23d8ac829279 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -420,7 +420,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw,
{
struct ieee80211_local *local = hw_to_local(hw);
- trace_api_scan_completed(local, info);
+ trace_api_scan_completed(local, info->aborted);
set_bit(SCAN_COMPLETED, &local->scanning);
if (info->aborted)
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 19f14c907d74..011880d633b4 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -67,12 +67,10 @@
static const struct rhashtable_params sta_rht_params = {
.nelem_hint = 3, /* start small */
- .insecure_elasticity = true, /* Disable chain-length checks. */
.automatic_shrinking = true,
.head_offset = offsetof(struct sta_info, hash_node),
.key_offset = offsetof(struct sta_info, addr),
.key_len = ETH_ALEN,
- .hashfn = sta_addr_hash,
.max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
};
@@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = {
static int sta_info_hash_del(struct ieee80211_local *local,
struct sta_info *sta)
{
- return rhashtable_remove_fast(&local->sta_hash, &sta->hash_node,
- sta_rht_params);
+ return rhltable_remove(&local->sta_hash, &sta->hash_node,
+ sta_rht_params);
}
static void __cleanup_single_sta(struct sta_info *sta)
@@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta)
sta_info_free(local, sta);
}
+struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
+ const u8 *addr)
+{
+ return rhltable_lookup(&local->sta_hash, addr, sta_rht_params);
+}
+
/* protected by RCU */
struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
const u8 *addr)
{
struct ieee80211_local *local = sdata->local;
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
- const struct bucket_table *tbl;
rcu_read_lock();
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, addr, sta, tmp) {
+ for_each_sta_info(local, addr, sta, tmp) {
if (sta->sdata == sdata) {
rcu_read_unlock();
/* this is safe as the caller must already hold
@@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
const u8 *addr)
{
struct ieee80211_local *local = sdata->local;
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
- const struct bucket_table *tbl;
rcu_read_lock();
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, addr, sta, tmp) {
+ for_each_sta_info(local, addr, sta, tmp) {
if (sta->sdata == sdata ||
(sta->sdata->bss && sta->sdata->bss == sdata->bss)) {
rcu_read_unlock();
@@ -263,8 +261,8 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
static int sta_info_hash_add(struct ieee80211_local *local,
struct sta_info *sta)
{
- return rhashtable_insert_fast(&local->sta_hash, &sta->hash_node,
- sta_rht_params);
+ return rhltable_insert(&local->sta_hash, &sta->hash_node,
+ sta_rht_params);
}
static void sta_deliver_ps_frames(struct work_struct *wk)
@@ -340,6 +338,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
memcpy(sta->addr, addr, ETH_ALEN);
memcpy(sta->sta.addr, addr, ETH_ALEN);
+ sta->sta.max_rx_aggregation_subframes =
+ local->hw.max_rx_aggregation_subframes;
+
sta->local = local;
sta->sdata = sdata;
sta->rx_stats.last_rx = jiffies;
@@ -450,9 +451,9 @@ static int sta_info_insert_check(struct sta_info *sta)
is_multicast_ether_addr(sta->sta.addr)))
return -EINVAL;
- /* Strictly speaking this isn't necessary as we hold the mutex, but
- * the rhashtable code can't really deal with that distinction. We
- * do require the mutex for correctness though.
+ /* The RCU read lock is required by rhashtable due to
+ * asynchronous resize/rehash. We also require the mutex
+ * for correctness.
*/
rcu_read_lock();
lockdep_assert_held(&sdata->local->sta_mtx);
@@ -687,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
}
/* No need to do anything if the driver does all */
- if (ieee80211_hw_check(&local->hw, AP_LINK_PS))
+ if (!local->ops->set_tim)
return;
if (sta->dead)
@@ -1040,16 +1041,11 @@ static void sta_info_cleanup(unsigned long data)
round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL));
}
-u32 sta_addr_hash(const void *key, u32 length, u32 seed)
-{
- return jhash(key, ETH_ALEN, seed);
-}
-
int sta_info_init(struct ieee80211_local *local)
{
int err;
- err = rhashtable_init(&local->sta_hash, &sta_rht_params);
+ err = rhltable_init(&local->sta_hash, &sta_rht_params);
if (err)
return err;
@@ -1065,7 +1061,7 @@ int sta_info_init(struct ieee80211_local *local)
void sta_info_stop(struct ieee80211_local *local)
{
del_timer_sync(&local->sta_cleanup);
- rhashtable_destroy(&local->sta_hash);
+ rhltable_destroy(&local->sta_hash);
}
@@ -1135,17 +1131,14 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
const u8 *localaddr)
{
struct ieee80211_local *local = hw_to_local(hw);
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
- const struct bucket_table *tbl;
-
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
/*
* Just return a random station if localaddr is NULL
* ... first in list.
*/
- for_each_sta_info(local, tbl, addr, sta, tmp) {
+ for_each_sta_info(local, addr, sta, tmp) {
if (localaddr &&
!ether_addr_equal(sta->sdata->vif.addr, localaddr))
continue;
@@ -1616,7 +1609,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
sta_info_recalc_tim(sta);
} else {
- unsigned long tids = sta->txq_buffered_tids & driver_release_tids;
int tid;
/*
@@ -1648,7 +1640,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
- if (!(tids & BIT(tid)) || txqi->tin.backlog_packets)
+ if (!(driver_release_tids & BIT(tid)) ||
+ txqi->tin.backlog_packets)
continue;
sta_info_recalc_tim(sta);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 0556be3e3628..ed5fcb984a01 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -230,6 +230,8 @@ struct tid_ampdu_rx {
* @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the
* driver requested to close until the work for it runs
* @agg_session_valid: bitmap indicating which TID has a rx BA session open on
+ * @unexpected_agg: bitmap indicating which TID already sent a delBA due to
+ * unexpected aggregation related frames outside a session
* @work: work struct for starting/stopping aggregation
* @tid_tx: aggregation info for Tx per TID
* @tid_start_tx: sessions where start was requested
@@ -244,6 +246,7 @@ struct sta_ampdu_mlme {
unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
+ unsigned long unexpected_agg[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
/* tx */
struct work_struct work;
struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS];
@@ -452,7 +455,7 @@ struct sta_info {
/* General information, mostly static */
struct list_head list, free_list;
struct rcu_head rcu_head;
- struct rhash_head hash_node;
+ struct rhlist_head hash_node;
u8 addr[ETH_ALEN];
struct ieee80211_local *local;
struct ieee80211_sub_if_data *sdata;
@@ -635,6 +638,9 @@ rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid)
*/
#define STA_INFO_CLEANUP_INTERVAL (10 * HZ)
+struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
+ const u8 *addr);
+
/*
* Get a STA info, must be under RCU read lock.
*/
@@ -644,17 +650,9 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
const u8 *addr);
-u32 sta_addr_hash(const void *key, u32 length, u32 seed);
-
-#define _sta_bucket_idx(_tbl, _a) \
- rht_bucket_index(_tbl, sta_addr_hash(_a, ETH_ALEN, (_tbl)->hash_rnd))
-
-#define for_each_sta_info(local, tbl, _addr, _sta, _tmp) \
- rht_for_each_entry_rcu(_sta, _tmp, tbl, \
- _sta_bucket_idx(tbl, _addr), \
- hash_node) \
- /* compare address and run code only if it matches */ \
- if (ether_addr_equal(_sta->addr, (_addr)))
+#define for_each_sta_info(local, _addr, _sta, _tmp) \
+ rhl_for_each_entry_rcu(_sta, _tmp, \
+ sta_info_hash_lookup(local, _addr), hash_node)
/*
* Get STA info by index, BROKEN!
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index a2a68269675d..ddf71c648cab 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -557,6 +557,12 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
static void ieee80211_lost_packet(struct sta_info *sta,
struct ieee80211_tx_info *info)
{
+ /* If driver relies on its own algorithm for station kickout, skip
+ * mac80211 packet loss mechanism.
+ */
+ if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK))
+ return;
+
/* This packet was aggregated but doesn't carry status info */
if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
!(info->flags & IEEE80211_TX_STAT_AMPDU))
@@ -709,7 +715,7 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
if (!ieee80211_sdata_running(sdata))
continue;
- if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) &&
+ if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) &&
!send_to_cooked)
continue;
@@ -740,8 +746,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
__le16 fc;
struct ieee80211_supported_band *sband;
+ struct rhlist_head *tmp;
struct sta_info *sta;
- struct rhash_head *tmp;
int retry_count;
int rates_idx;
bool send_to_cooked;
@@ -749,7 +755,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
struct ieee80211_bar *bar;
int shift = 0;
int tid = IEEE80211_NUM_TIDS;
- const struct bucket_table *tbl;
rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
@@ -758,9 +763,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
sband = local->hw.wiphy->bands[info->band];
fc = hdr->frame_control;
- tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
-
- for_each_sta_info(local, tbl, hdr->addr1, sta, tmp) {
+ for_each_sta_info(local, hdr->addr1, sta, tmp) {
/* skip wrong virtual interface */
if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr))
continue;
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index b5d28f14b9cf..afca7d103684 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -333,10 +333,11 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
if (!uc.center_freq1)
return;
- /* proceed to downgrade the chandef until usable or the same */
+ /* proceed to downgrade the chandef until usable or the same as AP BW */
while (uc.width > max_width ||
- !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc,
- sdata->wdev.iftype))
+ (uc.width > sta->tdls_chandef.width &&
+ !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc,
+ sdata->wdev.iftype)))
ieee80211_chandef_downgrade(&uc);
if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) {
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746dfea57..1ff08be90a98 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -796,6 +796,36 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
return ret;
}
+static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *pubsta,
+ struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_txq *txq = NULL;
+
+ if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
+ (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
+ return NULL;
+
+ if (!ieee80211_is_data(hdr->frame_control))
+ return NULL;
+
+ if (pubsta) {
+ u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
+
+ txq = pubsta->txq[tid];
+ } else if (vif) {
+ txq = vif->txq;
+ }
+
+ if (!txq)
+ return NULL;
+
+ return to_txq_info(txq);
+}
+
static ieee80211_tx_result debug_noinline
ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
{
@@ -853,7 +883,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
tx->sta->tx_stats.msdu[tid]++;
- if (!tx->sta->sta.txq[0])
+ if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta,
+ tx->skb))
hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
return TX_CONTINUE;
@@ -1243,36 +1274,6 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
return TX_CONTINUE;
}
-static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
- struct ieee80211_vif *vif,
- struct ieee80211_sta *pubsta,
- struct sk_buff *skb)
-{
- struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
- struct ieee80211_txq *txq = NULL;
-
- if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
- (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
- return NULL;
-
- if (!ieee80211_is_data(hdr->frame_control))
- return NULL;
-
- if (pubsta) {
- u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
-
- txq = pubsta->txq[tid];
- } else if (vif) {
- txq = vif->txq;
- }
-
- if (!txq)
- return NULL;
-
- return to_txq_info(txq);
-}
-
static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb)
{
IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time();
@@ -1343,7 +1344,7 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq,
local = container_of(fq, struct ieee80211_local, fq);
txqi = container_of(tin, struct txq_info, tin);
cparams = &local->cparams;
- cstats = &local->cstats;
+ cstats = &txqi->cstats;
if (flow == &txqi->def_flow)
cvars = &txqi->def_cvars;
@@ -1403,6 +1404,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
fq_tin_init(&txqi->tin);
fq_flow_init(&txqi->def_flow);
codel_vars_init(&txqi->def_cvars);
+ codel_stats_init(&txqi->cstats);
txqi->txq.vif = &sdata->vif;
@@ -1441,7 +1443,6 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
return ret;
codel_params_init(&local->cparams);
- codel_stats_init(&local->cstats);
local->cparams.interval = MS2TIME(100);
local->cparams.target = MS2TIME(20);
local->cparams.ecn = true;
@@ -1514,8 +1515,12 @@ out:
spin_unlock_bh(&fq->lock);
if (skb && skb_has_frag_list(skb) &&
- !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
- skb_linearize(skb);
+ !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) {
+ if (skb_linearize(skb)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ return NULL;
+ }
+ }
return skb;
}
@@ -1643,7 +1648,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
switch (sdata->vif.type) {
case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) {
+ if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
vif = &sdata->vif;
break;
}
@@ -2263,15 +2268,9 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_STATION:
if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) {
sta = sta_info_get(sdata, skb->data);
- if (sta) {
- bool tdls_peer, tdls_auth;
-
- tdls_peer = test_sta_flag(sta,
- WLAN_STA_TDLS_PEER);
- tdls_auth = test_sta_flag(sta,
- WLAN_STA_TDLS_PEER_AUTH);
-
- if (tdls_peer && tdls_auth) {
+ if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+ if (test_sta_flag(sta,
+ WLAN_STA_TDLS_PEER_AUTH)) {
*sta_out = sta;
return 0;
}
@@ -2283,8 +2282,7 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
* after a TDLS sta is removed due to being
* unreachable.
*/
- if (tdls_peer && !tdls_auth &&
- !ieee80211_is_tdls_setup(skb))
+ if (!ieee80211_is_tdls_setup(skb))
return -EINVAL;
}
@@ -3243,7 +3241,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
*ieee80211_get_qos_ctl(hdr) = tid;
- if (!sta->sta.txq[0])
+ if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb))
hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
} else {
info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 42bf0b6685e8..b6865d884487 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -598,7 +598,7 @@ static void __iterate_interfaces(struct ieee80211_local *local,
list_for_each_entry_rcu(sdata, &local->interfaces, list) {
switch (sdata->vif.type) {
case NL80211_IFTYPE_MONITOR:
- if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
continue;
break;
case NL80211_IFTYPE_AP_VLAN:
@@ -2555,7 +2555,6 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
if (need_basic && basic_rates & BIT(i))
basic = 0x80;
- rate = sband->bitrates[i].bitrate;
rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
5 * (1 << shift));
*pos++ = basic | (u8) rate;
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 7079cd32a7ad..06019dba4b10 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -663,6 +663,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
/* TODO check this */
SET_NETDEV_DEV(ndev, &local->phy->dev);
+ dev_net_set(ndev, wpan_phy_net(local->hw.phy));
sdata = netdev_priv(ndev);
ndev->ieee802154_ptr = &sdata->wpan_dev;
memcpy(sdata->name, ndev->name, IFNAMSIZ);
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index 446e1300383e..4dcf6e18563a 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -101,11 +101,16 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
sdata->dev->stats.rx_bytes += skb->len;
switch (mac_cb(skb)->type) {
+ case IEEE802154_FC_TYPE_BEACON:
+ case IEEE802154_FC_TYPE_ACK:
+ case IEEE802154_FC_TYPE_MAC_CMD:
+ goto fail;
+
case IEEE802154_FC_TYPE_DATA:
return ieee802154_deliver_skb(skb);
default:
- pr_warn("ieee802154: bad frame received (type = %d)\n",
- mac_cb(skb)->type);
+ pr_warn_ratelimited("ieee802154: bad frame received "
+ "(type = %d)\n", mac_cb(skb)->type);
goto fail;
}
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 72fc514ec676..fa6715db4581 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -231,19 +231,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
static LIST_HEAD(nf_hook_list);
-int nf_register_hook(struct nf_hook_ops *reg)
+static int _nf_register_hook(struct nf_hook_ops *reg)
{
struct net *net, *last;
int ret;
- rtnl_lock();
for_each_net(net) {
ret = nf_register_net_hook(net, reg);
if (ret && ret != -ENOENT)
goto rollback;
}
list_add_tail(&reg->list, &nf_hook_list);
- rtnl_unlock();
return 0;
rollback:
@@ -253,19 +251,34 @@ rollback:
break;
nf_unregister_net_hook(net, reg);
}
+ return ret;
+}
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+ int ret;
+
+ rtnl_lock();
+ ret = _nf_register_hook(reg);
rtnl_unlock();
+
return ret;
}
EXPORT_SYMBOL(nf_register_hook);
-void nf_unregister_hook(struct nf_hook_ops *reg)
+static void _nf_unregister_hook(struct nf_hook_ops *reg)
{
struct net *net;
- rtnl_lock();
list_del(&reg->list);
for_each_net(net)
nf_unregister_net_hook(net, reg);
+}
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+ rtnl_lock();
+ _nf_unregister_hook(reg);
rtnl_unlock();
}
EXPORT_SYMBOL(nf_unregister_hook);
@@ -289,6 +302,26 @@ err:
}
EXPORT_SYMBOL(nf_register_hooks);
+/* Caller MUST take rtnl_lock() */
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = _nf_register_hook(&reg[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ _nf_unregister_hooks(reg, i);
+ return err;
+}
+EXPORT_SYMBOL(_nf_register_hooks);
+
void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
{
while (n-- > 0)
@@ -296,6 +329,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
}
EXPORT_SYMBOL(nf_unregister_hooks);
+/* Caller MUST take rtnl_lock */
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+ while (n-- > 0)
+ _nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(_nf_unregister_hooks);
+
unsigned int nf_iterate(struct sk_buff *skb,
struct nf_hook_state *state,
struct nf_hook_entry **entryp)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c94ec197845c..ba6a1d421222 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1108,9 +1108,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
if (IS_ERR(ct))
return (struct nf_conntrack_tuple_hash *)ct;
- if (tmpl && nfct_synproxy(tmpl)) {
- nfct_seqadj_ext_add(ct);
- nfct_synproxy_ext_add(ct);
+ if (!nf_ct_add_synproxy(ct, tmpl)) {
+ nf_conntrack_free(ct);
+ return ERR_PTR(-ENOMEM);
}
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 81ae41f85d3a..bbb8f3df79f7 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct,
ct->status |= IPS_DST_NAT;
if (nfct_help(ct))
- nfct_seqadj_ext_add(ct);
+ if (!nfct_seqadj_ext_add(ct))
+ return NF_DROP;
}
if (maniptype == NF_NAT_MANIP_SRC) {
@@ -801,7 +802,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
if (err < 0)
return err;
- return nf_nat_setup_info(ct, &range, manip);
+ return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
}
#else
static int
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 696fe8f6f2f2..ab695f8e2d29 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -239,7 +239,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
break;
case NFT_TRACETYPE_POLICY:
if (nla_put_be32(skb, NFTA_TRACE_POLICY,
- info->basechain->policy))
+ htonl(info->basechain->policy)))
goto nla_put_failure;
break;
}
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 70eb2f6a3b01..d44d89b56127 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -343,12 +343,12 @@ static int nfnl_acct_del(struct net *net, struct sock *nfnl,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const tb[])
{
- char *acct_name;
- struct nf_acct *cur;
+ struct nf_acct *cur, *tmp;
int ret = -ENOENT;
+ char *acct_name;
if (!tb[NFACCT_NAME]) {
- list_for_each_entry(cur, &net->nfnl_acct_list, head)
+ list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head)
nfnl_acct_try_del(cur);
return 0;
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 68216cdc7083..139e0867e56e 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -98,31 +98,28 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
break;
}
- l4proto = nf_ct_l4proto_find_get(l3num, l4num);
-
- /* This protocol is not supportted, skip. */
- if (l4proto->l4proto != l4num) {
- ret = -EOPNOTSUPP;
- goto err_proto_put;
- }
-
if (matching) {
if (nlh->nlmsg_flags & NLM_F_REPLACE) {
/* You cannot replace one timeout policy by another of
* different kind, sorry.
*/
if (matching->l3num != l3num ||
- matching->l4proto->l4proto != l4num) {
- ret = -EINVAL;
- goto err_proto_put;
- }
-
- ret = ctnl_timeout_parse_policy(&matching->data,
- l4proto, net,
- cda[CTA_TIMEOUT_DATA]);
- return ret;
+ matching->l4proto->l4proto != l4num)
+ return -EINVAL;
+
+ return ctnl_timeout_parse_policy(&matching->data,
+ matching->l4proto, net,
+ cda[CTA_TIMEOUT_DATA]);
}
- ret = -EBUSY;
+
+ return -EBUSY;
+ }
+
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ /* This protocol is not supportted, skip. */
+ if (l4proto->l4proto != l4num) {
+ ret = -EOPNOTSUPP;
goto err_proto_put;
}
@@ -305,7 +302,16 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
const struct hlist_nulls_node *nn;
unsigned int last_hsize;
spinlock_t *lock;
- int i;
+ int i, cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
+ untimeout(h, timeout);
+ spin_unlock_bh(&pcpu->lock);
+ }
local_bh_disable();
restart:
@@ -350,12 +356,13 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
- struct ctnl_timeout *cur;
+ struct ctnl_timeout *cur, *tmp;
int ret = -ENOENT;
char *name;
if (!cda[CTA_TIMEOUT_NAME]) {
- list_for_each_entry(cur, &net->nfct_timeout_list, head)
+ list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list,
+ head)
ctnl_timeout_try_del(net, cur);
return 0;
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 14264edf2d77..6c1e0246706e 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -293,10 +293,16 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
}
EXPORT_SYMBOL_GPL(nft_meta_get_init);
-static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx)
+int nft_meta_set_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
{
+ struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
+ if (priv->key != NFT_META_PKTTYPE)
+ return 0;
+
switch (ctx->afi->family) {
case NFPROTO_BRIDGE:
hooks = 1 << NF_BR_PRE_ROUTING;
@@ -310,6 +316,7 @@ static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx)
return nft_chain_validate_hooks(ctx->chain, hooks);
}
+EXPORT_SYMBOL_GPL(nft_meta_set_validate);
int nft_meta_set_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
@@ -329,15 +336,16 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
len = sizeof(u8);
break;
case NFT_META_PKTTYPE:
- err = nft_meta_set_init_pkttype(ctx);
- if (err)
- return err;
len = sizeof(u8);
break;
default:
return -EOPNOTSUPP;
}
+ err = nft_meta_set_validate(ctx, expr, NULL);
+ if (err < 0)
+ return err;
+
priv->sreg = nft_parse_register(tb[NFTA_META_SREG]);
err = nft_validate_register_load(priv->sreg, len);
if (err < 0)
@@ -409,6 +417,7 @@ static const struct nft_expr_ops nft_meta_set_ops = {
.init = nft_meta_set_init,
.destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
+ .validate = nft_meta_set_validate,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 0522fc9bfb0a..c64de3f7379d 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -26,11 +26,27 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
};
EXPORT_SYMBOL_GPL(nft_reject_policy);
+int nft_reject_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT));
+}
+EXPORT_SYMBOL_GPL(nft_reject_validate);
+
int nft_reject_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_reject *priv = nft_expr_priv(expr);
+ int err;
+
+ err = nft_reject_validate(ctx, expr, NULL);
+ if (err < 0)
+ return err;
if (tb[NFTA_REJECT_TYPE] == NULL)
return -EINVAL;
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 759ca5248a3d..e79d9ca2ffee 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -66,7 +66,11 @@ static int nft_reject_inet_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_reject *priv = nft_expr_priv(expr);
- int icmp_code;
+ int icmp_code, err;
+
+ err = nft_reject_validate(ctx, expr, NULL);
+ if (err < 0)
+ return err;
if (tb[NFTA_REJECT_TYPE] == NULL)
return -EINVAL;
@@ -124,6 +128,7 @@ static const struct nft_expr_ops nft_reject_inet_ops = {
.eval = nft_reject_inet_eval,
.init = nft_reject_inet_init,
.dump = nft_reject_inet_dump,
+ .validate = nft_reject_validate,
};
static struct nft_expr_type nft_reject_inet_type __read_mostly = {
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index ef36a56a02c6..4dedb96d1a06 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -68,7 +68,7 @@ match_packet(const struct sk_buff *skb,
++i, offset, sch->type, htons(sch->length),
sch->flags);
#endif
- offset += WORD_ROUND(ntohs(sch->length));
+ offset += SCTP_PAD4(ntohs(sch->length));
pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset);
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 3e3e2534478a..b2f0e986a6f4 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -127,7 +127,6 @@ stop:
goto done;
rhashtable_walk_exit(hti);
- cb->args[2] = 0;
num++;
mc_list:
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index ca91fc33f8a9..863e992dfbc0 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -71,6 +71,8 @@ struct ovs_frag_data {
static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
#define DEFERRED_ACTION_FIFO_SIZE 10
+#define OVS_RECURSION_LIMIT 5
+#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
struct action_fifo {
int head;
int tail;
@@ -78,7 +80,12 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
};
+struct recirc_keys {
+ struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
+};
+
static struct action_fifo __percpu *action_fifos;
+static struct recirc_keys __percpu *recirc_keys;
static DEFINE_PER_CPU(int, exec_actions_level);
static void action_fifo_init(struct action_fifo *fifo)
@@ -246,20 +253,24 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
int err;
err = skb_vlan_pop(skb);
- if (skb_vlan_tag_present(skb))
+ if (skb_vlan_tag_present(skb)) {
invalidate_flow_key(key);
- else
- key->eth.tci = 0;
+ } else {
+ key->eth.vlan.tci = 0;
+ key->eth.vlan.tpid = 0;
+ }
return err;
}
static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
const struct ovs_action_push_vlan *vlan)
{
- if (skb_vlan_tag_present(skb))
+ if (skb_vlan_tag_present(skb)) {
invalidate_flow_key(key);
- else
- key->eth.tci = vlan->vlan_tci;
+ } else {
+ key->eth.vlan.tci = vlan->vlan_tci;
+ key->eth.vlan.tpid = vlan->vlan_tpid;
+ }
return skb_vlan_push(skb, vlan->vlan_tpid,
ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
}
@@ -1016,6 +1027,7 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
const struct nlattr *a, int rem)
{
struct deferred_action *da;
+ int level;
if (!is_flow_key_valid(key)) {
int err;
@@ -1039,6 +1051,18 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
return 0;
}
+ level = this_cpu_read(exec_actions_level);
+ if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
+ struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
+ struct sw_flow_key *recirc_key = &rks->key[level - 1];
+
+ *recirc_key = *key;
+ recirc_key->recirc_id = nla_get_u32(a);
+ ovs_dp_process_packet(skb, recirc_key);
+
+ return 0;
+ }
+
da = add_deferred_actions(skb, key, NULL);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
@@ -1205,11 +1229,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_actions *acts,
struct sw_flow_key *key)
{
- static const int ovs_recursion_limit = 5;
int err, level;
level = __this_cpu_inc_return(exec_actions_level);
- if (unlikely(level > ovs_recursion_limit)) {
+ if (unlikely(level > OVS_RECURSION_LIMIT)) {
net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
ovs_dp_name(dp));
kfree_skb(skb);
@@ -1234,10 +1257,17 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
+ recirc_keys = alloc_percpu(struct recirc_keys);
+ if (!recirc_keys) {
+ free_percpu(action_fifos);
+ return -ENOMEM;
+ }
+
return 0;
}
void action_fifos_exit(void)
{
free_percpu(action_fifos);
+ free_percpu(recirc_keys);
}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 524c0fd3078e..4d67ea856067 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -928,7 +928,6 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
- struct sw_flow_key key;
struct sw_flow_actions *acts;
struct sw_flow_match match;
u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
@@ -956,20 +955,24 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
/* Extract key. */
- ovs_match_init(&match, &key, &mask);
+ ovs_match_init(&match, &new_flow->key, false, &mask);
error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
if (error)
goto err_kfree_flow;
- ovs_flow_mask_key(&new_flow->key, &key, true, &mask);
-
/* Extract flow identifier. */
error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
- &key, log);
+ &new_flow->key, log);
if (error)
goto err_kfree_flow;
+ /* unmasked key is needed to match when ufid is not used. */
+ if (ovs_identifier_is_key(&new_flow->id))
+ match.key = new_flow->id.unmasked_key;
+
+ ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask);
+
/* Validate actions. */
error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
&new_flow->key, &acts, log);
@@ -996,7 +999,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
if (ovs_identifier_is_ufid(&new_flow->id))
flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
if (!flow)
- flow = ovs_flow_tbl_lookup(&dp->table, &key);
+ flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key);
if (likely(!flow)) {
rcu_assign_pointer(new_flow->sf_acts, acts);
@@ -1121,7 +1124,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, &mask);
+ ovs_match_init(&match, &key, true, &mask);
error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
} else if (!ufid_present) {
@@ -1238,7 +1241,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, NULL);
+ ovs_match_init(&match, &key, true, NULL);
err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL,
log);
} else if (!ufid_present) {
@@ -1297,7 +1300,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, NULL);
+ ovs_match_init(&match, &key, true, NULL);
err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
NULL, log);
if (unlikely(err))
@@ -2437,3 +2440,7 @@ module_exit(dp_cleanup);
MODULE_DESCRIPTION("Open vSwitch switching datapath");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 0ea128eeeab2..634cc10d6dee 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
+#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -72,32 +73,33 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
{
struct flow_stats *stats;
int node = numa_node_id();
+ int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
- stats = rcu_dereference(flow->stats[node]);
+ stats = rcu_dereference(flow->stats[cpu]);
- /* Check if already have node-specific stats. */
+ /* Check if already have CPU-specific stats. */
if (likely(stats)) {
spin_lock(&stats->lock);
/* Mark if we write on the pre-allocated stats. */
- if (node == 0 && unlikely(flow->stats_last_writer != node))
- flow->stats_last_writer = node;
+ if (cpu == 0 && unlikely(flow->stats_last_writer != cpu))
+ flow->stats_last_writer = cpu;
} else {
stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
spin_lock(&stats->lock);
- /* If the current NUMA-node is the only writer on the
+ /* If the current CPU is the only writer on the
* pre-allocated stats keep using them.
*/
- if (unlikely(flow->stats_last_writer != node)) {
+ if (unlikely(flow->stats_last_writer != cpu)) {
/* A previous locker may have already allocated the
- * stats, so we need to check again. If node-specific
+ * stats, so we need to check again. If CPU-specific
* stats were already allocated, we update the pre-
* allocated stats as we have already locked them.
*/
- if (likely(flow->stats_last_writer != NUMA_NO_NODE)
- && likely(!rcu_access_pointer(flow->stats[node]))) {
- /* Try to allocate node-specific stats. */
+ if (likely(flow->stats_last_writer != -1) &&
+ likely(!rcu_access_pointer(flow->stats[cpu]))) {
+ /* Try to allocate CPU-specific stats. */
struct flow_stats *new_stats;
new_stats =
@@ -114,12 +116,12 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
new_stats->tcp_flags = tcp_flags;
spin_lock_init(&new_stats->lock);
- rcu_assign_pointer(flow->stats[node],
+ rcu_assign_pointer(flow->stats[cpu],
new_stats);
goto unlock;
}
}
- flow->stats_last_writer = node;
+ flow->stats_last_writer = cpu;
}
}
@@ -136,14 +138,15 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
struct ovs_flow_stats *ovs_stats,
unsigned long *used, __be16 *tcp_flags)
{
- int node;
+ int cpu;
*used = 0;
*tcp_flags = 0;
memset(ovs_stats, 0, sizeof(*ovs_stats));
- for_each_node(node) {
- struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]);
+ /* We open code this to make sure cpu 0 is always considered */
+ for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+ struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
if (stats) {
/* Local CPU may write on non-local stats, so we must
@@ -163,10 +166,11 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
/* Called with ovs_mutex. */
void ovs_flow_stats_clear(struct sw_flow *flow)
{
- int node;
+ int cpu;
- for_each_node(node) {
- struct flow_stats *stats = ovsl_dereference(flow->stats[node]);
+ /* We open code this to make sure cpu 0 is always considered */
+ for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+ struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
if (stats) {
spin_lock_bh(&stats->lock);
@@ -302,24 +306,57 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
sizeof(struct icmp6hdr));
}
-static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+/**
+ * Parse vlan tag from vlan header.
+ * Returns ERROR on memory error.
+ * Returns 0 if it encounters a non-vlan or incomplete packet.
+ * Returns 1 after successfully parsing vlan tag.
+ */
+static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
{
- struct qtag_prefix {
- __be16 eth_type; /* ETH_P_8021Q */
- __be16 tci;
- };
- struct qtag_prefix *qp;
+ struct vlan_head *vh = (struct vlan_head *)skb->data;
+
+ if (likely(!eth_type_vlan(vh->tpid)))
+ return 0;
- if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)))
+ if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16)))
return 0;
- if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
- sizeof(__be16))))
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) +
+ sizeof(__be16))))
return -ENOMEM;
- qp = (struct qtag_prefix *) skb->data;
- key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT);
- __skb_pull(skb, sizeof(struct qtag_prefix));
+ vh = (struct vlan_head *)skb->data;
+ key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
+ key_vh->tpid = vh->tpid;
+
+ __skb_pull(skb, sizeof(struct vlan_head));
+ return 1;
+}
+
+static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ int res;
+
+ key->eth.vlan.tci = 0;
+ key->eth.vlan.tpid = 0;
+ key->eth.cvlan.tci = 0;
+ key->eth.cvlan.tpid = 0;
+
+ if (likely(skb_vlan_tag_present(skb))) {
+ key->eth.vlan.tci = htons(skb->vlan_tci);
+ key->eth.vlan.tpid = skb->vlan_proto;
+ } else {
+ /* Parse outer vlan tag in the non-accelerated case. */
+ res = parse_vlan_tag(skb, &key->eth.vlan);
+ if (res <= 0)
+ return res;
+ }
+
+ /* Parse inner vlan tag. */
+ res = parse_vlan_tag(skb, &key->eth.cvlan);
+ if (res <= 0)
+ return res;
return 0;
}
@@ -480,12 +517,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
* update skb->csum here.
*/
- key->eth.tci = 0;
- if (skb_vlan_tag_present(skb))
- key->eth.tci = htons(skb->vlan_tci);
- else if (eth->h_proto == htons(ETH_P_8021Q))
- if (unlikely(parse_vlan(skb, key)))
- return -ENOMEM;
+ if (unlikely(parse_vlan(skb, key)))
+ return -ENOMEM;
key->eth.type = parse_ethertype(skb);
if (unlikely(key->eth.type == htons(0)))
@@ -734,8 +767,6 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
{
int err;
- memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE);
-
/* Extract metadata from netlink attributes. */
err = ovs_nla_get_flow_metadata(net, attr, key, log);
if (err)
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 03378e75a67c..ae783f5c6695 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -50,6 +50,11 @@ struct ovs_tunnel_info {
struct metadata_dst *tun_dst;
};
+struct vlan_head {
+ __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/
+ __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
+};
+
#define OVS_SW_FLOW_KEY_METADATA_SIZE \
(offsetof(struct sw_flow_key, recirc_id) + \
FIELD_SIZEOF(struct sw_flow_key, recirc_id))
@@ -69,7 +74,8 @@ struct sw_flow_key {
struct {
u8 src[ETH_ALEN]; /* Ethernet source address. */
u8 dst[ETH_ALEN]; /* Ethernet destination address. */
- __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
+ struct vlan_head vlan;
+ struct vlan_head cvlan;
__be16 type; /* Ethernet frame type. */
} eth;
union {
@@ -172,14 +178,14 @@ struct sw_flow {
struct hlist_node node[2];
u32 hash;
} flow_table, ufid_table;
- int stats_last_writer; /* NUMA-node id of the last writer on
+ int stats_last_writer; /* CPU id of the last writer on
* 'stats[0]'.
*/
struct sw_flow_key key;
struct sw_flow_id id;
struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
- struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one
+ struct flow_stats __rcu *stats[]; /* One for each CPU. First one
* is allocated at flow creation time,
* the rest are allocated on demand
* while holding the 'stats[0].lock'.
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index c78a6a1476fb..ae25ded82b3b 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -808,6 +808,167 @@ int ovs_nla_put_tunnel_info(struct sk_buff *skb,
ip_tunnel_info_af(tun_info));
}
+static int encode_vlan_from_nlattrs(struct sw_flow_match *match,
+ const struct nlattr *a[],
+ bool is_mask, bool inner)
+{
+ __be16 tci = 0;
+ __be16 tpid = 0;
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (a[OVS_KEY_ATTR_ETHERTYPE])
+ tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+ if (likely(!inner)) {
+ SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask);
+ SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask);
+ } else {
+ SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask);
+ SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask);
+ }
+ return 0;
+}
+
+static int validate_vlan_from_nlattrs(const struct sw_flow_match *match,
+ u64 key_attrs, bool inner,
+ const struct nlattr **a, bool log)
+{
+ __be16 tci = 0;
+
+ if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
+ eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) {
+ /* Not a VLAN. */
+ return 0;
+ }
+
+ if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
+ OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ }
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ if (tci) {
+ OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.",
+ (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) {
+ /* Corner case for truncated VLAN header. */
+ OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.",
+ (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ }
+ }
+
+ return 1;
+}
+
+static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match,
+ u64 key_attrs, bool inner,
+ const struct nlattr **a, bool log)
+{
+ __be16 tci = 0;
+ __be16 tpid = 0;
+ bool encap_valid = !!(match->key->eth.vlan.tci &
+ htons(VLAN_TAG_PRESENT));
+ bool i_encap_valid = !!(match->key->eth.cvlan.tci &
+ htons(VLAN_TAG_PRESENT));
+
+ if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) {
+ /* Not a VLAN. */
+ return 0;
+ }
+
+ if ((!inner && !encap_valid) || (inner && !i_encap_valid)) {
+ OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.",
+ (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ }
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (a[OVS_KEY_ATTR_ETHERTYPE])
+ tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+ if (tpid != htons(0xffff)) {
+ OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).",
+ (inner) ? "C-VLAN" : "VLAN", ntohs(tpid));
+ return -EINVAL;
+ }
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.",
+ (inner) ? "C-VLAN" : "VLAN");
+ return -EINVAL;
+ }
+
+ return 1;
+}
+
+static int __parse_vlan_from_nlattrs(struct sw_flow_match *match,
+ u64 *key_attrs, bool inner,
+ const struct nlattr **a, bool is_mask,
+ bool log)
+{
+ int err;
+ const struct nlattr *encap;
+
+ if (!is_mask)
+ err = validate_vlan_from_nlattrs(match, *key_attrs, inner,
+ a, log);
+ else
+ err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner,
+ a, log);
+ if (err <= 0)
+ return err;
+
+ err = encode_vlan_from_nlattrs(match, a, is_mask, inner);
+ if (err)
+ return err;
+
+ *key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ *key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
+ *key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+
+ encap = a[OVS_KEY_ATTR_ENCAP];
+
+ if (!is_mask)
+ err = parse_flow_nlattrs(encap, a, key_attrs, log);
+ else
+ err = parse_flow_mask_nlattrs(encap, a, key_attrs, log);
+
+ return err;
+}
+
+static int parse_vlan_from_nlattrs(struct sw_flow_match *match,
+ u64 *key_attrs, const struct nlattr **a,
+ bool is_mask, bool log)
+{
+ int err;
+ bool encap_valid = false;
+
+ err = __parse_vlan_from_nlattrs(match, key_attrs, false, a,
+ is_mask, log);
+ if (err)
+ return err;
+
+ encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT));
+ if (encap_valid) {
+ err = __parse_vlan_from_nlattrs(match, key_attrs, true, a,
+ is_mask, log);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
u64 *attrs, const struct nlattr **a,
bool is_mask, bool log)
@@ -923,20 +1084,11 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
}
if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
- __be16 tci;
-
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- if (is_mask)
- OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.");
- else
- OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set.");
-
- return -EINVAL;
- }
-
- SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
- attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
+ /* VLAN attribute is always parsed before getting here since it
+ * may occur multiple times.
+ */
+ OVS_NLERR(log, "VLAN attribute unexpected.");
+ return -EINVAL;
}
if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
@@ -1182,49 +1334,18 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match,
bool log)
{
const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
- const struct nlattr *encap;
struct nlattr *newmask = NULL;
u64 key_attrs = 0;
u64 mask_attrs = 0;
- bool encap_valid = false;
int err;
err = parse_flow_nlattrs(nla_key, a, &key_attrs, log);
if (err)
return err;
- if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
- (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
- (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
- __be16 tci;
-
- if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
- (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
- OVS_NLERR(log, "Invalid Vlan frame.");
- return -EINVAL;
- }
-
- key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
- encap = a[OVS_KEY_ATTR_ENCAP];
- key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
- encap_valid = true;
-
- if (tci & htons(VLAN_TAG_PRESENT)) {
- err = parse_flow_nlattrs(encap, a, &key_attrs, log);
- if (err)
- return err;
- } else if (!tci) {
- /* Corner case for truncated 802.1Q header. */
- if (nla_len(encap)) {
- OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute.");
- return -EINVAL;
- }
- } else {
- OVS_NLERR(log, "Encap attr is set for non-VLAN frame");
- return -EINVAL;
- }
- }
+ err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log);
+ if (err)
+ return err;
err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log);
if (err)
@@ -1265,46 +1386,12 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match,
goto free_newmask;
/* Always match on tci. */
- SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
-
- if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) {
- __be16 eth_type = 0;
- __be16 tci = 0;
-
- if (!encap_valid) {
- OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame.");
- err = -EINVAL;
- goto free_newmask;
- }
-
- mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
- if (a[OVS_KEY_ATTR_ETHERTYPE])
- eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
-
- if (eth_type == htons(0xffff)) {
- mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
- encap = a[OVS_KEY_ATTR_ENCAP];
- err = parse_flow_mask_nlattrs(encap, a,
- &mask_attrs, log);
- if (err)
- goto free_newmask;
- } else {
- OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).",
- ntohs(eth_type));
- err = -EINVAL;
- goto free_newmask;
- }
-
- if (a[OVS_KEY_ATTR_VLAN])
- tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true);
+ SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true);
- if (!(tci & htons(VLAN_TAG_PRESENT))) {
- OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).",
- ntohs(tci));
- err = -EINVAL;
- goto free_newmask;
- }
- }
+ err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log);
+ if (err)
+ goto free_newmask;
err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true,
log);
@@ -1410,12 +1497,25 @@ int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr,
return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
}
+static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh,
+ bool is_mask)
+{
+ __be16 eth_type = !is_mask ? vh->tpid : htons(0xffff);
+
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
+ nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci))
+ return -EMSGSIZE;
+ return 0;
+}
+
static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
const struct sw_flow_key *output, bool is_mask,
struct sk_buff *skb)
{
struct ovs_key_ethernet *eth_key;
- struct nlattr *nla, *encap;
+ struct nlattr *nla;
+ struct nlattr *encap = NULL;
+ struct nlattr *in_encap = NULL;
if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id))
goto nla_put_failure;
@@ -1464,17 +1564,21 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
ether_addr_copy(eth_key->eth_src, output->eth.src);
ether_addr_copy(eth_key->eth_dst, output->eth.dst);
- if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
- __be16 eth_type;
- eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
- if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
- nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
+ if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) {
+ if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask))
goto nla_put_failure;
encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
- if (!swkey->eth.tci)
+ if (!swkey->eth.vlan.tci)
goto unencap;
- } else
- encap = NULL;
+
+ if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) {
+ if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask))
+ goto nla_put_failure;
+ in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.cvlan.tci)
+ goto unencap;
+ }
+ }
if (swkey->eth.type == htons(ETH_P_802_2)) {
/*
@@ -1493,6 +1597,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
goto nla_put_failure;
+ if (eth_type_vlan(swkey->eth.type)) {
+ /* There are 3 VLAN tags, we don't know anything about the rest
+ * of the packet, so truncate here.
+ */
+ WARN_ON_ONCE(!(encap && in_encap));
+ goto unencap;
+ }
+
if (swkey->eth.type == htons(ETH_P_IP)) {
struct ovs_key_ipv4 *ipv4_key;
@@ -1619,6 +1731,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
}
unencap:
+ if (in_encap)
+ nla_nest_end(skb, in_encap);
if (encap)
nla_nest_end(skb, encap);
@@ -1882,13 +1996,15 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
void ovs_match_init(struct sw_flow_match *match,
struct sw_flow_key *key,
+ bool reset_key,
struct sw_flow_mask *mask)
{
memset(match, 0, sizeof(*match));
match->key = key;
match->mask = mask;
- memset(key, 0, sizeof(*key));
+ if (reset_key)
+ memset(key, 0, sizeof(*key));
if (mask) {
memset(&mask->key, 0, sizeof(mask->key));
@@ -1935,7 +2051,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
struct nlattr *a;
int err = 0, start, opts_type;
- ovs_match_init(&match, &key, NULL);
+ ovs_match_init(&match, &key, true, NULL);
opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
if (opts_type < 0)
return opts_type;
@@ -2283,7 +2399,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
case OVS_ACTION_ATTR_PUSH_VLAN:
vlan = nla_data(a);
- if (vlan->vlan_tpid != htons(ETH_P_8021Q))
+ if (!eth_type_vlan(vlan->vlan_tpid))
return -EINVAL;
if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
return -EINVAL;
@@ -2388,7 +2504,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
(*sfa)->orig_len = nla_len(attr);
err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type,
- key->eth.tci, log);
+ key->eth.vlan.tci, log);
if (err)
ovs_nla_free_flow_actions(*sfa);
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 47dd142eca1c..45f9769e5aac 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -41,7 +41,8 @@ size_t ovs_tun_key_attr_size(void);
size_t ovs_key_attr_size(void);
void ovs_match_init(struct sw_flow_match *match,
- struct sw_flow_key *key, struct sw_flow_mask *mask);
+ struct sw_flow_key *key, bool reset_key,
+ struct sw_flow_mask *mask);
int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
int attr, bool is_mask, struct sk_buff *);
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d073fff82fdb..ea7a8073fa02 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -32,6 +32,7 @@
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
+#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
@@ -79,17 +80,12 @@ struct sw_flow *ovs_flow_alloc(void)
{
struct sw_flow *flow;
struct flow_stats *stats;
- int node;
- flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+ flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
if (!flow)
return ERR_PTR(-ENOMEM);
- flow->sf_acts = NULL;
- flow->mask = NULL;
- flow->id.unmasked_key = NULL;
- flow->id.ufid_len = 0;
- flow->stats_last_writer = NUMA_NO_NODE;
+ flow->stats_last_writer = -1;
/* Initialize the default stat node. */
stats = kmem_cache_alloc_node(flow_stats_cache,
@@ -102,10 +98,6 @@ struct sw_flow *ovs_flow_alloc(void)
RCU_INIT_POINTER(flow->stats[0], stats);
- for_each_node(node)
- if (node != 0)
- RCU_INIT_POINTER(flow->stats[node], NULL);
-
return flow;
err:
kmem_cache_free(flow_cache, flow);
@@ -142,16 +134,17 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
static void flow_free(struct sw_flow *flow)
{
- int node;
+ int cpu;
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
- for_each_node(node)
- if (flow->stats[node])
+ /* We open code this to make sure cpu 0 is always considered */
+ for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask))
+ if (flow->stats[cpu])
kmem_cache_free(flow_stats_cache,
- (struct flow_stats __force *)flow->stats[node]);
+ (struct flow_stats __force *)flow->stats[cpu]);
kmem_cache_free(flow_cache, flow);
}
@@ -756,7 +749,7 @@ int ovs_flow_init(void)
BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
- + (nr_node_ids
+ + (nr_cpu_ids
* sizeof(struct flow_stats *)),
0, 0, NULL);
if (flow_cache == NULL)
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 6b21fd068d87..8f198437c724 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -485,9 +485,14 @@ static unsigned int packet_length(const struct sk_buff *skb)
{
unsigned int length = skb->len - ETH_HLEN;
- if (skb->protocol == htons(ETH_P_8021Q))
+ if (skb_vlan_tagged(skb))
length -= VLAN_HLEN;
+ /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow
+ * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none
+ * account for 802.1ad. e.g. is_skb_forwardable().
+ */
+
return length;
}
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
index 784c53163b7b..86f8853a038c 100644
--- a/net/rxrpc/Kconfig
+++ b/net/rxrpc/Kconfig
@@ -19,6 +19,20 @@ config AF_RXRPC
See Documentation/networking/rxrpc.txt.
+config AF_RXRPC_IPV6
+ bool "IPv6 support for RxRPC"
+ depends on (IPV6 = m && AF_RXRPC = m) || (IPV6 = y && AF_RXRPC)
+ help
+ Say Y here to allow AF_RXRPC to use IPV6 UDP as well as IPV4 UDP as
+ its network transport.
+
+config AF_RXRPC_INJECT_LOSS
+ bool "Inject packet loss into RxRPC packet stream"
+ depends on AF_RXRPC
+ help
+ Say Y here to inject packet loss by discarding some received and some
+ transmitted packets.
+
config AF_RXRPC_DEBUG
bool "RxRPC dynamic debugging"
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index b66a9e6f8d04..8dbf7bed2cc4 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -45,7 +45,7 @@ u32 rxrpc_epoch;
atomic_t rxrpc_debug_id;
/* count of skbs currently in use */
-atomic_t rxrpc_n_skbs;
+atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
struct workqueue_struct *rxrpc_workqueue;
@@ -106,19 +106,25 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
case AF_INET:
if (srx->transport_len < sizeof(struct sockaddr_in))
return -EINVAL;
- _debug("INET: %x @ %pI4",
- ntohs(srx->transport.sin.sin_port),
- &srx->transport.sin.sin_addr);
tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad);
break;
+#ifdef CONFIG_AF_RXRPC_IPV6
case AF_INET6:
+ if (srx->transport_len < sizeof(struct sockaddr_in6))
+ return -EINVAL;
+ tail = offsetof(struct sockaddr_rxrpc, transport) +
+ sizeof(struct sockaddr_in6);
+ break;
+#endif
+
default:
return -EAFNOSUPPORT;
}
if (tail < len)
memset((void *)srx + tail, 0, len - tail);
+ _debug("INET: %pISp", &srx->transport);
return 0;
}
@@ -155,15 +161,15 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
}
if (rx->srx.srx_service) {
- write_lock_bh(&local->services_lock);
- list_for_each_entry(prx, &local->services, listen_link) {
+ write_lock(&local->services_lock);
+ hlist_for_each_entry(prx, &local->services, listen_link) {
if (prx->srx.srx_service == rx->srx.srx_service)
goto service_in_use;
}
rx->local = local;
- list_add_tail(&rx->listen_link, &local->services);
- write_unlock_bh(&local->services_lock);
+ hlist_add_head_rcu(&rx->listen_link, &local->services);
+ write_unlock(&local->services_lock);
rx->sk.sk_state = RXRPC_SERVER_BOUND;
} else {
@@ -176,7 +182,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
return 0;
service_in_use:
- write_unlock_bh(&local->services_lock);
+ write_unlock(&local->services_lock);
rxrpc_put_local(local);
ret = -EADDRINUSE;
error_unlock:
@@ -193,7 +199,7 @@ static int rxrpc_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
struct rxrpc_sock *rx = rxrpc_sk(sk);
- unsigned int max;
+ unsigned int max, old;
int ret;
_enter("%p,%d", rx, backlog);
@@ -212,9 +218,13 @@ static int rxrpc_listen(struct socket *sock, int backlog)
backlog = max;
else if (backlog < 0 || backlog > max)
break;
+ old = sk->sk_max_ack_backlog;
sk->sk_max_ack_backlog = backlog;
- rx->sk.sk_state = RXRPC_SERVER_LISTENING;
- ret = 0;
+ ret = rxrpc_service_prealloc(rx, GFP_KERNEL);
+ if (ret == 0)
+ rx->sk.sk_state = RXRPC_SERVER_LISTENING;
+ else
+ sk->sk_max_ack_backlog = old;
break;
default:
ret = -EBUSY;
@@ -294,9 +304,8 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call);
void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call)
{
_enter("%d{%d}", call->debug_id, atomic_read(&call->usage));
- rxrpc_remove_user_ID(rxrpc_sk(sock->sk), call);
- rxrpc_purge_queue(&call->knlrecv_queue);
- rxrpc_put_call(call);
+ rxrpc_release_call(rxrpc_sk(sock->sk), call);
+ rxrpc_put_call(call, rxrpc_call_put_kernel);
}
EXPORT_SYMBOL(rxrpc_kernel_end_call);
@@ -304,16 +313,19 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call);
* rxrpc_kernel_new_call_notification - Get notifications of new calls
* @sock: The socket to intercept received messages on
* @notify_new_call: Function to be called when new calls appear
+ * @discard_new_call: Function to discard preallocated calls
*
* Allow a kernel service to be given notifications about new calls.
*/
void rxrpc_kernel_new_call_notification(
struct socket *sock,
- rxrpc_notify_new_call_t notify_new_call)
+ rxrpc_notify_new_call_t notify_new_call,
+ rxrpc_discard_new_call_t discard_new_call)
{
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
rx->notify_new_call = notify_new_call;
+ rx->discard_new_call = discard_new_call;
}
EXPORT_SYMBOL(rxrpc_kernel_new_call_notification);
@@ -395,6 +407,23 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
switch (rx->sk.sk_state) {
case RXRPC_UNBOUND:
+ rx->srx.srx_family = AF_RXRPC;
+ rx->srx.srx_service = 0;
+ rx->srx.transport_type = SOCK_DGRAM;
+ rx->srx.transport.family = rx->family;
+ switch (rx->family) {
+ case AF_INET:
+ rx->srx.transport_len = sizeof(struct sockaddr_in);
+ break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ rx->srx.transport_len = sizeof(struct sockaddr_in6);
+ break;
+#endif
+ default:
+ ret = -EAFNOSUPPORT;
+ goto error_unlock;
+ }
local = rxrpc_lookup_local(&rx->srx);
if (IS_ERR(local)) {
ret = PTR_ERR(local);
@@ -509,15 +538,16 @@ error:
static unsigned int rxrpc_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
- unsigned int mask;
struct sock *sk = sock->sk;
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ unsigned int mask;
sock_poll_wait(file, sk_sleep(sk), wait);
mask = 0;
/* the socket is readable if there are any messages waiting on the Rx
* queue */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!list_empty(&rx->recvmsg_q))
mask |= POLLIN | POLLRDNORM;
/* the socket is writable if there is space to add new data to the
@@ -544,7 +574,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
return -EAFNOSUPPORT;
/* we support transport protocol UDP/UDP6 only */
- if (protocol != PF_INET)
+ if (protocol != PF_INET &&
+ IS_ENABLED(CONFIG_AF_RXRPC_IPV6) && protocol != PF_INET6)
return -EPROTONOSUPPORT;
if (sock->type != SOCK_DGRAM)
@@ -558,6 +589,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
return -ENOMEM;
sock_init_data(sock, sk);
+ sock_set_flag(sk, SOCK_RCU_FREE);
sk->sk_state = RXRPC_UNBOUND;
sk->sk_write_space = rxrpc_write_space;
sk->sk_max_ack_backlog = 0;
@@ -567,9 +599,12 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
rx->family = protocol;
rx->calls = RB_ROOT;
- INIT_LIST_HEAD(&rx->listen_link);
- INIT_LIST_HEAD(&rx->secureq);
- INIT_LIST_HEAD(&rx->acceptq);
+ INIT_HLIST_NODE(&rx->listen_link);
+ spin_lock_init(&rx->incoming_lock);
+ INIT_LIST_HEAD(&rx->sock_calls);
+ INIT_LIST_HEAD(&rx->to_be_accepted);
+ INIT_LIST_HEAD(&rx->recvmsg_q);
+ rwlock_init(&rx->recvmsg_lock);
rwlock_init(&rx->call_lock);
memset(&rx->srx, 0, sizeof(rx->srx));
@@ -578,6 +613,39 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
}
/*
+ * Kill all the calls on a socket and shut it down.
+ */
+static int rxrpc_shutdown(struct socket *sock, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ int ret = 0;
+
+ _enter("%p,%d", sk, flags);
+
+ if (flags != SHUT_RDWR)
+ return -EOPNOTSUPP;
+ if (sk->sk_state == RXRPC_CLOSE)
+ return -ESHUTDOWN;
+
+ lock_sock(sk);
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (sk->sk_state < RXRPC_CLOSE) {
+ sk->sk_state = RXRPC_CLOSE;
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ } else {
+ ret = -ESHUTDOWN;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+ rxrpc_discard_prealloc(rx);
+
+ release_sock(sk);
+ return ret;
+}
+
+/*
* RxRPC socket destructor
*/
static void rxrpc_sock_destructor(struct sock *sk)
@@ -615,13 +683,14 @@ static int rxrpc_release_sock(struct sock *sk)
ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1);
- if (!list_empty(&rx->listen_link)) {
- write_lock_bh(&rx->local->services_lock);
- list_del(&rx->listen_link);
- write_unlock_bh(&rx->local->services_lock);
+ if (!hlist_unhashed(&rx->listen_link)) {
+ write_lock(&rx->local->services_lock);
+ hlist_del_rcu(&rx->listen_link);
+ write_unlock(&rx->local->services_lock);
}
/* try to flush out this socket */
+ rxrpc_discard_prealloc(rx);
rxrpc_release_calls_on_socket(rx);
flush_workqueue(rxrpc_workqueue);
rxrpc_purge_queue(&sk->sk_receive_queue);
@@ -670,7 +739,7 @@ static const struct proto_ops rxrpc_rpc_ops = {
.poll = rxrpc_poll,
.ioctl = sock_no_ioctl,
.listen = rxrpc_listen,
- .shutdown = sock_no_shutdown,
+ .shutdown = rxrpc_shutdown,
.setsockopt = rxrpc_setsockopt,
.getsockopt = sock_no_getsockopt,
.sendmsg = rxrpc_sendmsg,
@@ -798,7 +867,8 @@ static void __exit af_rxrpc_exit(void)
proto_unregister(&rxrpc_proto);
rxrpc_destroy_all_calls();
rxrpc_destroy_all_connections();
- ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0);
+ ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0);
+ ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0);
rxrpc_destroy_all_locals();
remove_proc_entry("rxrpc_conns", init_net.proc_net);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index bb342f5fe7e4..ca96e547cb9a 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -35,8 +35,6 @@ struct rxrpc_crypt {
#define rxrpc_queue_delayed_work(WS,D) \
queue_delayed_work(rxrpc_workqueue, (WS), (D))
-#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor)
-
struct rxrpc_connection;
/*
@@ -66,19 +64,45 @@ enum {
};
/*
+ * Service backlog preallocation.
+ *
+ * This contains circular buffers of preallocated peers, connections and calls
+ * for incoming service calls and their head and tail pointers. This allows
+ * calls to be set up in the data_ready handler, thereby avoiding the need to
+ * shuffle packets around so much.
+ */
+struct rxrpc_backlog {
+ unsigned short peer_backlog_head;
+ unsigned short peer_backlog_tail;
+ unsigned short conn_backlog_head;
+ unsigned short conn_backlog_tail;
+ unsigned short call_backlog_head;
+ unsigned short call_backlog_tail;
+#define RXRPC_BACKLOG_MAX 32
+ struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX];
+ struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX];
+ struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX];
+};
+
+/*
* RxRPC socket definition
*/
struct rxrpc_sock {
/* WARNING: sk has to be the first member */
struct sock sk;
rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */
+ rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */
struct rxrpc_local *local; /* local endpoint */
- struct list_head listen_link; /* link in the local endpoint's listen list */
- struct list_head secureq; /* calls awaiting connection security clearance */
- struct list_head acceptq; /* calls awaiting acceptance */
+ struct hlist_node listen_link; /* link in the local endpoint's listen list */
+ struct rxrpc_backlog *backlog; /* Preallocation for services */
+ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */
+ struct list_head sock_calls; /* List of calls owned by this socket */
+ struct list_head to_be_accepted; /* calls awaiting acceptance */
+ struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */
+ rwlock_t recvmsg_lock; /* Lock for recvmsg_q */
struct key *key; /* security for this socket */
struct key *securities; /* list of server security descriptors */
- struct rb_root calls; /* outstanding calls on this socket */
+ struct rb_root calls; /* User ID -> call mapping */
unsigned long flags;
#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */
rwlock_t call_lock; /* lock for calls */
@@ -117,13 +141,13 @@ struct rxrpc_host_header {
* - max 48 bytes (struct sk_buff::cb)
*/
struct rxrpc_skb_priv {
- struct rxrpc_call *call; /* call with which associated */
- unsigned long resend_at; /* time in jiffies at which to resend */
+ union {
+ u8 nr_jumbo; /* Number of jumbo subpackets */
+ };
union {
unsigned int offset; /* offset into buffer of next read */
int remain; /* amount of space remaining for next write */
u32 error; /* network error code */
- bool need_resend; /* T if needs resending */
};
struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
@@ -157,7 +181,12 @@ struct rxrpc_security {
void *);
/* verify the security on a received packet */
- int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *);
+ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *,
+ unsigned int, unsigned int, rxrpc_seq_t, u16);
+
+ /* Locate the data in a received packet that has been verified. */
+ void (*locate_data)(struct rxrpc_call *, struct sk_buff *,
+ unsigned int *, unsigned int *);
/* issue a challenge */
int (*issue_challenge)(struct rxrpc_connection *);
@@ -187,9 +216,8 @@ struct rxrpc_local {
struct list_head link;
struct socket *socket; /* my UDP socket */
struct work_struct processor;
- struct list_head services; /* services listening on this endpoint */
+ struct hlist_head services; /* services listening on this endpoint */
struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
- struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */
struct sk_buff_head reject_queue; /* packets awaiting rejection */
struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
struct rb_root client_conns; /* Client connections by socket params */
@@ -227,10 +255,12 @@ struct rxrpc_peer {
/* calculated RTT cache */
#define RXRPC_RTT_CACHE_SIZE 32
- suseconds_t rtt; /* current RTT estimate (in uS) */
- unsigned int rtt_point; /* next entry at which to insert */
- unsigned int rtt_usage; /* amount of cache actually used */
- suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */
+ ktime_t rtt_last_req; /* Time of last RTT request */
+ u64 rtt; /* Current RTT estimate (in nS) */
+ u64 rtt_sum; /* Sum of cache contents */
+ u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */
+ u8 rtt_cursor; /* next entry at which to insert */
+ u8 rtt_usage; /* amount of cache actually used */
};
/*
@@ -283,6 +313,7 @@ enum rxrpc_conn_cache_state {
RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */
RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */
RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */
+ RXRPC_CONN__NR_CACHE_STATES
};
/*
@@ -291,6 +322,7 @@ enum rxrpc_conn_cache_state {
enum rxrpc_conn_proto_state {
RXRPC_CONN_UNUSED, /* Connection not yet attempted */
RXRPC_CONN_CLIENT, /* Client connection */
+ RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */
RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */
RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */
RXRPC_CONN_SERVICE, /* Service secured connection */
@@ -345,17 +377,16 @@ struct rxrpc_connection {
unsigned long events;
unsigned long idle_timestamp; /* Time at which last became idle */
spinlock_t state_lock; /* state-change lock */
- enum rxrpc_conn_cache_state cache_state : 8;
- enum rxrpc_conn_proto_state state : 8; /* current state of connection */
+ enum rxrpc_conn_cache_state cache_state;
+ enum rxrpc_conn_proto_state state; /* current state of connection */
u32 local_abort; /* local abort code */
u32 remote_abort; /* remote abort code */
int debug_id; /* debug ID for printks */
atomic_t serial; /* packet serial number counter */
unsigned int hi_serial; /* highest serial number received */
+ u32 security_nonce; /* response re-use preventer */
u8 size_align; /* data size alignment (for security) */
- u8 header_size; /* rxrpc + security header size */
u8 security_size; /* security header size */
- u32 security_nonce; /* response re-use preventer */
u8 security_ix; /* security type */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
};
@@ -365,39 +396,23 @@ struct rxrpc_connection {
*/
enum rxrpc_call_flag {
RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */
- RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */
- RXRPC_CALL_RCVD_LAST, /* all packets received */
- RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */
- RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */
- RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */
RXRPC_CALL_HAS_USERID, /* has a user ID attached */
- RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */
RXRPC_CALL_IS_SERVICE, /* Call is service call */
RXRPC_CALL_EXPOSED, /* The call was exposed to the world */
- RXRPC_CALL_RX_NO_MORE, /* Don't indicate MSG_MORE from recvmsg() */
+ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */
+ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */
+ RXRPC_CALL_PINGING, /* Ping in process */
+ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
};
/*
* Events that can be raised on a call.
*/
enum rxrpc_call_event {
- RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */
- RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */
- RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */
- RXRPC_CALL_EV_RCVD_ERROR, /* network error received */
- RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */
RXRPC_CALL_EV_ACK, /* need to generate ACK */
- RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */
RXRPC_CALL_EV_ABORT, /* need to generate abort */
- RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */
- RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */
+ RXRPC_CALL_EV_TIMER, /* Timer expired */
RXRPC_CALL_EV_RESEND, /* Tx resend required */
- RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */
- RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */
- RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */
- RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */
- RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" message to the app */
- RXRPC_CALL_EV_RELEASE, /* need to release the call's resources */
};
/*
@@ -409,7 +424,7 @@ enum rxrpc_call_state {
RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */
RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
- RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */
+ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */
RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */
RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */
RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */
@@ -417,7 +432,6 @@ enum rxrpc_call_state {
RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */
RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */
RXRPC_CALL_COMPLETE, /* - call complete */
- RXRPC_CALL_DEAD, /* - call is dead */
NR__RXRPC_CALL_STATES
};
@@ -426,7 +440,6 @@ enum rxrpc_call_state {
*/
enum rxrpc_call_completion {
RXRPC_CALL_SUCCEEDED, /* - Normal termination */
- RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */
RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */
RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */
RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */
@@ -435,6 +448,17 @@ enum rxrpc_call_completion {
};
/*
+ * Call Tx congestion management modes.
+ */
+enum rxrpc_congest_mode {
+ RXRPC_CALL_SLOW_START,
+ RXRPC_CALL_CONGEST_AVOIDANCE,
+ RXRPC_CALL_PACKET_LOSS,
+ RXRPC_CALL_FAST_RETRANSMIT,
+ NR__RXRPC_CONGEST_MODES
+};
+
+/*
* RxRPC call definition
* - matched by { connection, call_id }
*/
@@ -442,78 +466,329 @@ struct rxrpc_call {
struct rcu_head rcu;
struct rxrpc_connection *conn; /* connection carrying call */
struct rxrpc_peer *peer; /* Peer record for remote address */
- struct rxrpc_sock *socket; /* socket responsible */
- struct timer_list lifetimer; /* lifetime remaining on call */
- struct timer_list deadspan; /* reap timer for re-ACK'ing, etc */
- struct timer_list ack_timer; /* ACK generation timer */
- struct timer_list resend_timer; /* Tx resend timer */
- struct work_struct destroyer; /* call destroyer */
- struct work_struct processor; /* packet processor and ACK generator */
+ struct rxrpc_sock __rcu *socket; /* socket responsible */
+ unsigned long ack_at; /* When deferred ACK needs to happen */
+ unsigned long resend_at; /* When next resend needs to happen */
+ unsigned long expire_at; /* When the call times out */
+ struct timer_list timer; /* Combined event timer */
+ struct work_struct processor; /* Event processor */
rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */
struct list_head link; /* link in master call list */
struct list_head chan_wait_link; /* Link in conn->waiting_calls */
struct hlist_node error_link; /* link in error distribution list */
- struct list_head accept_link; /* calls awaiting acceptance */
- struct rb_node sock_node; /* node in socket call tree */
- struct sk_buff_head rx_queue; /* received packets */
- struct sk_buff_head rx_oos_queue; /* packets received out of sequence */
- struct sk_buff_head knlrecv_queue; /* Queue for kernel_recv [TODO: replace this] */
+ struct list_head accept_link; /* Link in rx->acceptq */
+ struct list_head recvmsg_link; /* Link in rx->recvmsg_q */
+ struct list_head sock_link; /* Link in rx->sock_calls */
+ struct rb_node sock_node; /* Node in rx->calls */
struct sk_buff *tx_pending; /* Tx socket buffer being filled */
wait_queue_head_t waitq; /* Wait queue for channel or Tx */
__be32 crypto_buf[2]; /* Temporary packet crypto buffer */
unsigned long user_call_ID; /* user-defined call ID */
- unsigned long creation_jif; /* time of call creation */
unsigned long flags;
unsigned long events;
spinlock_t lock;
rwlock_t state_lock; /* lock for state transition */
u32 abort_code; /* Local/remote abort code */
int error; /* Local error incurred */
- enum rxrpc_call_state state : 8; /* current state of call */
- enum rxrpc_call_completion completion : 8; /* Call completion condition */
+ enum rxrpc_call_state state; /* current state of call */
+ enum rxrpc_call_completion completion; /* Call completion condition */
atomic_t usage;
- atomic_t skb_count; /* Outstanding packets on this call */
- atomic_t sequence; /* Tx data packet sequence counter */
u16 service_id; /* service ID */
+ u8 security_ix; /* Security type */
u32 call_id; /* call ID on connection */
u32 cid; /* connection ID plus channel index */
int debug_id; /* debug ID for printks */
-
- /* transmission-phase ACK management */
- u8 acks_head; /* offset into window of first entry */
- u8 acks_tail; /* offset into window of last entry */
- u8 acks_winsz; /* size of un-ACK'd window */
- u8 acks_unacked; /* lowest unacked packet in last ACK received */
- int acks_latest; /* serial number of latest ACK received */
- rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */
- unsigned long *acks_window; /* sent packet window
- * - elements are pointers with LSB set if ACK'd
+ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
+ unsigned short rx_pkt_len; /* Current recvmsg packet len */
+
+ /* Rx/Tx circular buffer, depending on phase.
+ *
+ * In the Rx phase, packets are annotated with 0 or the number of the
+ * segment of a jumbo packet each buffer refers to. There can be up to
+ * 47 segments in a maximum-size UDP packet.
+ *
+ * In the Tx phase, packets are annotated with which buffers have been
+ * acked.
+ */
+#define RXRPC_RXTX_BUFF_SIZE 64
+#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1)
+#define RXRPC_INIT_RX_WINDOW_SIZE 32
+ struct sk_buff **rxtx_buffer;
+ u8 *rxtx_annotations;
+#define RXRPC_TX_ANNO_ACK 0
+#define RXRPC_TX_ANNO_UNACK 1
+#define RXRPC_TX_ANNO_NAK 2
+#define RXRPC_TX_ANNO_RETRANS 3
+#define RXRPC_TX_ANNO_MASK 0x03
+#define RXRPC_TX_ANNO_LAST 0x04
+#define RXRPC_TX_ANNO_RESENT 0x08
+
+#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */
+#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */
+#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */
+ rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but
+ * not hard-ACK'd packet follows this.
+ */
+ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
+
+ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS
+ * is fixed, we keep these numbers in terms of segments (ie. DATA
+ * packets) rather than bytes.
+ */
+#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN
+ u8 cong_cwnd; /* Congestion window size */
+ u8 cong_extra; /* Extra to send for congestion management */
+ u8 cong_ssthresh; /* Slow-start threshold */
+ enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */
+ u8 cong_dup_acks; /* Count of ACKs showing missing packets */
+ u8 cong_cumul_acks; /* Cumulative ACK count */
+ ktime_t cong_tstamp; /* Last time cwnd was changed */
+
+ rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not
+ * consumed packet follows this.
*/
+ rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */
+ rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */
+ u8 rx_winsize; /* Size of Rx window */
+ u8 tx_winsize; /* Maximum size of Tx window */
+ bool tx_phase; /* T if transmission phase, F if receive phase */
+ u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */
/* receive-phase ACK management */
- rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */
- rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */
- rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */
- rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */
- rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */
- rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */
- rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
u8 ackr_reason; /* reason to ACK */
u16 ackr_skew; /* skew on packet being ACK'd */
rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */
- atomic_t ackr_not_idle; /* number of packets in Rx queue */
+ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
+ rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */
+ rxrpc_seq_t ackr_seen; /* Highest packet shown seen */
+ rxrpc_serial_t ackr_ping; /* Last ping sent */
+ ktime_t ackr_ping_time; /* Time last ping sent */
- /* received packet records, 1 bit per record */
-#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG)
- unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1];
+ /* transmission-phase ACK management */
+ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
+ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */
+ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
};
+/*
+ * Summary of a new ACK and the changes it made to the Tx buffer packet states.
+ */
+struct rxrpc_ack_summary {
+ u8 ack_reason;
+ u8 nr_acks; /* Number of ACKs in packet */
+ u8 nr_nacks; /* Number of NACKs in packet */
+ u8 nr_new_acks; /* Number of new ACKs in packet */
+ u8 nr_new_nacks; /* Number of new NACKs in packet */
+ u8 nr_rot_new_acks; /* Number of rotated new ACKs */
+ bool new_low_nack; /* T if new low NACK found */
+ bool retrans_timeo; /* T if reTx due to timeout happened */
+ u8 flight_size; /* Number of unreceived transmissions */
+ /* Place to stash values for tracing */
+ enum rxrpc_congest_mode mode:8;
+ u8 cwnd;
+ u8 ssthresh;
+ u8 dup_acks;
+ u8 cumulative_acks;
+};
+
+enum rxrpc_skb_trace {
+ rxrpc_skb_rx_cleaned,
+ rxrpc_skb_rx_freed,
+ rxrpc_skb_rx_got,
+ rxrpc_skb_rx_lost,
+ rxrpc_skb_rx_received,
+ rxrpc_skb_rx_rotated,
+ rxrpc_skb_rx_purged,
+ rxrpc_skb_rx_seen,
+ rxrpc_skb_tx_cleaned,
+ rxrpc_skb_tx_freed,
+ rxrpc_skb_tx_got,
+ rxrpc_skb_tx_lost,
+ rxrpc_skb_tx_new,
+ rxrpc_skb_tx_rotated,
+ rxrpc_skb_tx_seen,
+ rxrpc_skb__nr_trace
+};
+
+extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7];
+
+enum rxrpc_conn_trace {
+ rxrpc_conn_new_client,
+ rxrpc_conn_new_service,
+ rxrpc_conn_queued,
+ rxrpc_conn_seen,
+ rxrpc_conn_got,
+ rxrpc_conn_put_client,
+ rxrpc_conn_put_service,
+ rxrpc_conn__nr_trace
+};
+
+extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4];
+
+enum rxrpc_client_trace {
+ rxrpc_client_activate_chans,
+ rxrpc_client_alloc,
+ rxrpc_client_chan_activate,
+ rxrpc_client_chan_disconnect,
+ rxrpc_client_chan_pass,
+ rxrpc_client_chan_unstarted,
+ rxrpc_client_cleanup,
+ rxrpc_client_count,
+ rxrpc_client_discard,
+ rxrpc_client_duplicate,
+ rxrpc_client_exposed,
+ rxrpc_client_replace,
+ rxrpc_client_to_active,
+ rxrpc_client_to_culled,
+ rxrpc_client_to_idle,
+ rxrpc_client_to_inactive,
+ rxrpc_client_to_waiting,
+ rxrpc_client_uncount,
+ rxrpc_client__nr_trace
+};
+
+extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7];
+extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5];
+
+enum rxrpc_call_trace {
+ rxrpc_call_new_client,
+ rxrpc_call_new_service,
+ rxrpc_call_queued,
+ rxrpc_call_queued_ref,
+ rxrpc_call_seen,
+ rxrpc_call_connected,
+ rxrpc_call_release,
+ rxrpc_call_got,
+ rxrpc_call_got_userid,
+ rxrpc_call_got_kernel,
+ rxrpc_call_put,
+ rxrpc_call_put_userid,
+ rxrpc_call_put_kernel,
+ rxrpc_call_put_noqueue,
+ rxrpc_call_error,
+ rxrpc_call__nr_trace
+};
+
+extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4];
+
+enum rxrpc_transmit_trace {
+ rxrpc_transmit_wait,
+ rxrpc_transmit_queue,
+ rxrpc_transmit_queue_last,
+ rxrpc_transmit_rotate,
+ rxrpc_transmit_rotate_last,
+ rxrpc_transmit_await_reply,
+ rxrpc_transmit_end,
+ rxrpc_transmit__nr_trace
+};
+
+extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4];
+
+enum rxrpc_receive_trace {
+ rxrpc_receive_incoming,
+ rxrpc_receive_queue,
+ rxrpc_receive_queue_last,
+ rxrpc_receive_front,
+ rxrpc_receive_rotate,
+ rxrpc_receive_end,
+ rxrpc_receive__nr_trace
+};
+
+extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4];
+
+enum rxrpc_recvmsg_trace {
+ rxrpc_recvmsg_enter,
+ rxrpc_recvmsg_wait,
+ rxrpc_recvmsg_dequeue,
+ rxrpc_recvmsg_hole,
+ rxrpc_recvmsg_next,
+ rxrpc_recvmsg_cont,
+ rxrpc_recvmsg_full,
+ rxrpc_recvmsg_data_return,
+ rxrpc_recvmsg_terminal,
+ rxrpc_recvmsg_to_be_accepted,
+ rxrpc_recvmsg_return,
+ rxrpc_recvmsg__nr_trace
+};
+
+extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5];
+
+enum rxrpc_rtt_tx_trace {
+ rxrpc_rtt_tx_ping,
+ rxrpc_rtt_tx_data,
+ rxrpc_rtt_tx__nr_trace
+};
+
+extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5];
+
+enum rxrpc_rtt_rx_trace {
+ rxrpc_rtt_rx_ping_response,
+ rxrpc_rtt_rx_requested_ack,
+ rxrpc_rtt_rx__nr_trace
+};
+
+extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
+
+enum rxrpc_timer_trace {
+ rxrpc_timer_begin,
+ rxrpc_timer_init_for_reply,
+ rxrpc_timer_expired,
+ rxrpc_timer_set_for_ack,
+ rxrpc_timer_set_for_resend,
+ rxrpc_timer_set_for_send,
+ rxrpc_timer__nr_trace
+};
+
+extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
+
+enum rxrpc_propose_ack_trace {
+ rxrpc_propose_ack_client_tx_end,
+ rxrpc_propose_ack_input_data,
+ rxrpc_propose_ack_ping_for_lost_ack,
+ rxrpc_propose_ack_ping_for_lost_reply,
+ rxrpc_propose_ack_ping_for_params,
+ rxrpc_propose_ack_respond_to_ack,
+ rxrpc_propose_ack_respond_to_ping,
+ rxrpc_propose_ack_retry_tx,
+ rxrpc_propose_ack_rotate_rx,
+ rxrpc_propose_ack_terminal_ack,
+ rxrpc_propose_ack__nr_trace
+};
+
+enum rxrpc_propose_ack_outcome {
+ rxrpc_propose_ack_use,
+ rxrpc_propose_ack_update,
+ rxrpc_propose_ack_subsume,
+ rxrpc_propose_ack__nr_outcomes
+};
+
+extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8];
+extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes];
+
+enum rxrpc_congest_change {
+ rxrpc_cong_begin_retransmission,
+ rxrpc_cong_cleared_nacks,
+ rxrpc_cong_new_low_nack,
+ rxrpc_cong_no_change,
+ rxrpc_cong_progress,
+ rxrpc_cong_retransmit_again,
+ rxrpc_cong_rtt_window_end,
+ rxrpc_cong_saw_nack,
+ rxrpc_congest__nr_change
+};
+
+extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
+extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
+
+extern const char *const rxrpc_pkts[];
+extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
+
#include <trace/events/rxrpc.h>
/*
* af_rxrpc.c
*/
-extern atomic_t rxrpc_n_skbs;
+extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
extern u32 rxrpc_epoch;
extern atomic_t rxrpc_debug_id;
extern struct workqueue_struct *rxrpc_workqueue;
@@ -521,6 +796,11 @@ extern struct workqueue_struct *rxrpc_workqueue;
/*
* call_accept.c
*/
+int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t);
+void rxrpc_discard_prealloc(struct rxrpc_sock *);
+struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *,
+ struct rxrpc_connection *,
+ struct sk_buff *);
void rxrpc_accept_incoming_calls(struct rxrpc_local *);
struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long,
rxrpc_notify_rx_t);
@@ -529,8 +809,9 @@ int rxrpc_reject_call(struct rxrpc_sock *);
/*
* call_event.c
*/
-void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool);
-void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool);
+void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace);
+void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool,
+ enum rxrpc_propose_ack_trace);
void rxrpc_process_call(struct work_struct *);
/*
@@ -539,26 +820,26 @@ void rxrpc_process_call(struct work_struct *);
extern const char *const rxrpc_call_states[];
extern const char *const rxrpc_call_completions[];
extern unsigned int rxrpc_max_call_lifetime;
-extern unsigned int rxrpc_dead_call_expiry;
extern struct kmem_cache *rxrpc_call_jar;
extern struct list_head rxrpc_calls;
extern rwlock_t rxrpc_call_lock;
struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
+struct rxrpc_call *rxrpc_alloc_call(gfp_t);
struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
struct rxrpc_conn_parameters *,
struct sockaddr_rxrpc *,
unsigned long, gfp_t);
-struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
- struct rxrpc_connection *,
- struct sk_buff *);
-void rxrpc_release_call(struct rxrpc_call *);
+void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *,
+ struct sk_buff *);
+void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *);
void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
+bool __rxrpc_queue_call(struct rxrpc_call *);
+bool rxrpc_queue_call(struct rxrpc_call *);
void rxrpc_see_call(struct rxrpc_call *);
-void rxrpc_get_call(struct rxrpc_call *);
-void rxrpc_put_call(struct rxrpc_call *);
-void rxrpc_get_call_for_skb(struct rxrpc_call *, struct sk_buff *);
-void rxrpc_put_call_for_skb(struct rxrpc_call *, struct sk_buff *);
+void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace);
+void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace);
+void rxrpc_cleanup_call(struct rxrpc_call *);
void __exit rxrpc_destroy_all_calls(void);
static inline bool rxrpc_is_service_call(const struct rxrpc_call *call)
@@ -584,6 +865,7 @@ static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call,
call->error = error;
call->completion = compl,
call->state = RXRPC_CALL_COMPLETE;
+ wake_up(&call->waitq);
return true;
}
return false;
@@ -594,7 +876,7 @@ static inline bool rxrpc_set_call_completion(struct rxrpc_call *call,
u32 abort_code,
int error)
{
- int ret;
+ bool ret;
write_lock_bh(&call->state_lock);
ret = __rxrpc_set_call_completion(call, compl, abort_code, error);
@@ -605,40 +887,41 @@ static inline bool rxrpc_set_call_completion(struct rxrpc_call *call,
/*
* Record that a call successfully completed.
*/
-static inline void __rxrpc_call_completed(struct rxrpc_call *call)
+static inline bool __rxrpc_call_completed(struct rxrpc_call *call)
{
- __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0);
+ return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0);
}
-static inline void rxrpc_call_completed(struct rxrpc_call *call)
+static inline bool rxrpc_call_completed(struct rxrpc_call *call)
{
+ bool ret;
+
write_lock_bh(&call->state_lock);
- __rxrpc_call_completed(call);
+ ret = __rxrpc_call_completed(call);
write_unlock_bh(&call->state_lock);
+ return ret;
}
/*
* Record that a call is locally aborted.
*/
-static inline bool __rxrpc_abort_call(struct rxrpc_call *call,
+static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call,
+ rxrpc_seq_t seq,
u32 abort_code, int error)
{
- if (__rxrpc_set_call_completion(call,
- RXRPC_CALL_LOCALLY_ABORTED,
- abort_code, error)) {
- set_bit(RXRPC_CALL_EV_ABORT, &call->events);
- return true;
- }
- return false;
+ trace_rxrpc_abort(why, call->cid, call->call_id, seq,
+ abort_code, error);
+ return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED,
+ abort_code, error);
}
-static inline bool rxrpc_abort_call(struct rxrpc_call *call,
- u32 abort_code, int error)
+static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call,
+ rxrpc_seq_t seq, u32 abort_code, int error)
{
bool ret;
write_lock_bh(&call->state_lock);
- ret = __rxrpc_abort_call(call, abort_code, error);
+ ret = __rxrpc_abort_call(why, call, seq, abort_code, error);
write_unlock_bh(&call->state_lock);
return ret;
}
@@ -664,8 +947,6 @@ void __exit rxrpc_destroy_all_client_connections(void);
* conn_event.c
*/
void rxrpc_process_connection(struct work_struct *);
-void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *);
-void rxrpc_reject_packets(struct rxrpc_local *);
/*
* conn_object.c
@@ -682,7 +963,11 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *);
void rxrpc_disconnect_call(struct rxrpc_call *);
void rxrpc_kill_connection(struct rxrpc_connection *);
-void __rxrpc_put_connection(struct rxrpc_connection *);
+bool rxrpc_queue_conn(struct rxrpc_connection *);
+void rxrpc_see_connection(struct rxrpc_connection *);
+void rxrpc_get_connection(struct rxrpc_connection *);
+struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *);
+void rxrpc_put_service_conn(struct rxrpc_connection *);
void __exit rxrpc_destroy_all_connections(void);
static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn)
@@ -695,39 +980,15 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn)
return !rxrpc_conn_is_client(conn);
}
-static inline void rxrpc_get_connection(struct rxrpc_connection *conn)
-{
- atomic_inc(&conn->usage);
-}
-
-static inline
-struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
-{
- return atomic_inc_not_zero(&conn->usage) ? conn : NULL;
-}
-
static inline void rxrpc_put_connection(struct rxrpc_connection *conn)
{
if (!conn)
return;
- if (rxrpc_conn_is_client(conn)) {
- if (atomic_dec_and_test(&conn->usage))
- rxrpc_put_client_conn(conn);
- } else {
- if (atomic_dec_return(&conn->usage) == 1)
- __rxrpc_put_connection(conn);
- }
-}
-
-
-static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn)
-{
- if (!rxrpc_get_connection_maybe(conn))
- return false;
- if (!rxrpc_queue_work(&conn->processor))
- rxrpc_put_connection(conn);
- return true;
+ if (rxrpc_conn_is_client(conn))
+ rxrpc_put_client_conn(conn);
+ else
+ rxrpc_put_service_conn(conn);
}
/*
@@ -735,17 +996,14 @@ static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn)
*/
struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *,
struct sk_buff *);
-struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *,
- struct sockaddr_rxrpc *,
- struct sk_buff *);
+struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t);
+void rxrpc_new_incoming_connection(struct rxrpc_connection *, struct sk_buff *);
void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
/*
* input.c
*/
void rxrpc_data_ready(struct sock *);
-int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool);
-void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *);
/*
* insecure.c
@@ -809,21 +1067,22 @@ extern unsigned int rxrpc_rx_mtu;
extern unsigned int rxrpc_rx_jumbo_max;
extern unsigned int rxrpc_resend_timeout;
-extern const char *const rxrpc_pkts[];
extern const s8 rxrpc_ack_priority[];
-extern const char *rxrpc_acks(u8 reason);
-
/*
* output.c
*/
-int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *);
+int rxrpc_send_call_packet(struct rxrpc_call *, u8);
+int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *);
+void rxrpc_reject_packets(struct rxrpc_local *);
/*
* peer_event.c
*/
void rxrpc_error_report(struct sock *);
void rxrpc_peer_error_distributor(struct work_struct *);
+void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
+ rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
/*
* peer_object.c
@@ -833,6 +1092,8 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *,
struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
struct sockaddr_rxrpc *, gfp_t);
struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
+struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *,
+ struct rxrpc_peer *);
static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
{
@@ -862,7 +1123,7 @@ extern const struct file_operations rxrpc_connection_seq_fops;
/*
* recvmsg.c
*/
-void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *);
+void rxrpc_notify_socket(struct rxrpc_call *);
int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
/*
@@ -879,7 +1140,7 @@ int __init rxrpc_init_security(void);
void rxrpc_exit_security(void);
int rxrpc_init_client_conn_security(struct rxrpc_connection *);
int rxrpc_init_server_conn_security(struct rxrpc_connection *);
-
+
/*
* sendmsg.c
*/
@@ -890,10 +1151,11 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
*/
void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
void rxrpc_packet_destructor(struct sk_buff *);
-void rxrpc_new_skb(struct sk_buff *);
-void rxrpc_see_skb(struct sk_buff *);
-void rxrpc_get_skb(struct sk_buff *);
-void rxrpc_free_skb(struct sk_buff *);
+void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_purge_queue(struct sk_buff_head *);
/*
@@ -912,6 +1174,23 @@ static inline void rxrpc_sysctl_exit(void) {}
*/
int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
+static inline bool before(u32 seq1, u32 seq2)
+{
+ return (s32)(seq1 - seq2) < 0;
+}
+static inline bool before_eq(u32 seq1, u32 seq2)
+{
+ return (s32)(seq1 - seq2) <= 0;
+}
+static inline bool after(u32 seq1, u32 seq2)
+{
+ return (s32)(seq1 - seq2) > 0;
+}
+static inline bool after_eq(u32 seq1, u32 seq2)
+{
+ return (s32)(seq1 - seq2) >= 0;
+}
+
/*
* debug tracing
*/
@@ -994,11 +1273,12 @@ do { \
#define ASSERTCMP(X, OP, Y) \
do { \
- unsigned long _x = (unsigned long)(X); \
- unsigned long _y = (unsigned long)(Y); \
+ __typeof__(X) _x = (X); \
+ __typeof__(Y) _y = (__typeof__(X))(Y); \
if (unlikely(!(_x OP _y))) { \
- pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
- _x, _x, #OP, _y, _y); \
+ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
+ (unsigned long)_x, (unsigned long)_x, #OP, \
+ (unsigned long)_y, (unsigned long)_y); \
BUG(); \
} \
} while (0)
@@ -1013,11 +1293,12 @@ do { \
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
- unsigned long _x = (unsigned long)(X); \
- unsigned long _y = (unsigned long)(Y); \
+ __typeof__(X) _x = (X); \
+ __typeof__(Y) _y = (__typeof__(X))(Y); \
if (unlikely((C) && !(_x OP _y))) { \
pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \
- _x, _x, #OP, _y, _y); \
+ (unsigned long)_x, (unsigned long)_x, #OP, \
+ (unsigned long)_y, (unsigned long)_y); \
BUG(); \
} \
} while (0)
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 68a439e30df1..a8d39d7cf42c 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -20,265 +20,409 @@
#include <linux/in6.h>
#include <linux/icmp.h>
#include <linux/gfp.h>
+#include <linux/circ_buf.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <net/ip.h>
#include "ar-internal.h"
/*
- * generate a connection-level abort
+ * Preallocate a single service call, connection and peer and, if possible,
+ * give them a user ID and attach the user's side of the ID to them.
*/
-static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
- struct rxrpc_wire_header *whdr)
+static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
+ struct rxrpc_backlog *b,
+ rxrpc_notify_rx_t notify_rx,
+ rxrpc_user_attach_call_t user_attach_call,
+ unsigned long user_call_ID, gfp_t gfp)
{
- struct msghdr msg;
- struct kvec iov[1];
- size_t len;
- int ret;
+ const void *here = __builtin_return_address(0);
+ struct rxrpc_call *call;
+ int max, tmp;
+ unsigned int size = RXRPC_BACKLOG_MAX;
+ unsigned int head, tail, call_head, call_tail;
+
+ max = rx->sk.sk_max_ack_backlog;
+ tmp = rx->sk.sk_ack_backlog;
+ if (tmp >= max) {
+ _leave(" = -ENOBUFS [full %u]", max);
+ return -ENOBUFS;
+ }
+ max -= tmp;
+
+ /* We don't need more conns and peers than we have calls, but on the
+ * other hand, we shouldn't ever use more peers than conns or conns
+ * than calls.
+ */
+ call_head = b->call_backlog_head;
+ call_tail = READ_ONCE(b->call_backlog_tail);
+ tmp = CIRC_CNT(call_head, call_tail, size);
+ if (tmp >= max) {
+ _leave(" = -ENOBUFS [enough %u]", tmp);
+ return -ENOBUFS;
+ }
+ max = tmp + 1;
+
+ head = b->peer_backlog_head;
+ tail = READ_ONCE(b->peer_backlog_tail);
+ if (CIRC_CNT(head, tail, size) < max) {
+ struct rxrpc_peer *peer = rxrpc_alloc_peer(rx->local, gfp);
+ if (!peer)
+ return -ENOMEM;
+ b->peer_backlog[head] = peer;
+ smp_store_release(&b->peer_backlog_head,
+ (head + 1) & (size - 1));
+ }
- _enter("%d,,", local->debug_id);
+ head = b->conn_backlog_head;
+ tail = READ_ONCE(b->conn_backlog_tail);
+ if (CIRC_CNT(head, tail, size) < max) {
+ struct rxrpc_connection *conn;
- whdr->type = RXRPC_PACKET_TYPE_BUSY;
- whdr->serial = htonl(1);
+ conn = rxrpc_prealloc_service_connection(gfp);
+ if (!conn)
+ return -ENOMEM;
+ b->conn_backlog[head] = conn;
+ smp_store_release(&b->conn_backlog_head,
+ (head + 1) & (size - 1));
- msg.msg_name = &srx->transport.sin;
- msg.msg_namelen = sizeof(srx->transport.sin);
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = 0;
+ trace_rxrpc_conn(conn, rxrpc_conn_new_service,
+ atomic_read(&conn->usage), here);
+ }
- iov[0].iov_base = whdr;
- iov[0].iov_len = sizeof(*whdr);
+ /* Now it gets complicated, because calls get registered with the
+ * socket here, particularly if a user ID is preassigned by the user.
+ */
+ call = rxrpc_alloc_call(gfp);
+ if (!call)
+ return -ENOMEM;
+ call->flags |= (1 << RXRPC_CALL_IS_SERVICE);
+ call->state = RXRPC_CALL_SERVER_PREALLOC;
- len = iov[0].iov_len;
+ trace_rxrpc_call(call, rxrpc_call_new_service,
+ atomic_read(&call->usage),
+ here, (const void *)user_call_ID);
- _proto("Tx BUSY %%1");
+ write_lock(&rx->call_lock);
+ if (user_attach_call) {
+ struct rxrpc_call *xcall;
+ struct rb_node *parent, **pp;
+
+ /* Check the user ID isn't already in use */
+ pp = &rx->calls.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ xcall = rb_entry(parent, struct rxrpc_call, sock_node);
+ if (user_call_ID < call->user_call_ID)
+ pp = &(*pp)->rb_left;
+ else if (user_call_ID > call->user_call_ID)
+ pp = &(*pp)->rb_right;
+ else
+ goto id_in_use;
+ }
- ret = kernel_sendmsg(local->socket, &msg, iov, 1, len);
- if (ret < 0) {
- _leave(" = -EAGAIN [sendmsg failed: %d]", ret);
- return -EAGAIN;
+ call->user_call_ID = user_call_ID;
+ call->notify_rx = notify_rx;
+ rxrpc_get_call(call, rxrpc_call_got_kernel);
+ user_attach_call(call, user_call_ID);
+ rxrpc_get_call(call, rxrpc_call_got_userid);
+ rb_link_node(&call->sock_node, parent, pp);
+ rb_insert_color(&call->sock_node, &rx->calls);
+ set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
}
- _leave(" = 0");
+ list_add(&call->sock_link, &rx->sock_calls);
+
+ write_unlock(&rx->call_lock);
+
+ write_lock(&rxrpc_call_lock);
+ list_add_tail(&call->link, &rxrpc_calls);
+ write_unlock(&rxrpc_call_lock);
+
+ b->call_backlog[call_head] = call;
+ smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1));
+ _leave(" = 0 [%d -> %lx]", call->debug_id, user_call_ID);
return 0;
+
+id_in_use:
+ write_unlock(&rx->call_lock);
+ rxrpc_cleanup_call(call);
+ _leave(" = -EBADSLT");
+ return -EBADSLT;
}
/*
- * accept an incoming call that needs peer, transport and/or connection setting
- * up
+ * Preallocate sufficient service connections, calls and peers to cover the
+ * entire backlog of a socket. When a new call comes in, if we don't have
+ * sufficient of each available, the call gets rejected as busy or ignored.
+ *
+ * The backlog is replenished when a connection is accepted or rejected.
*/
-static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
- struct rxrpc_sock *rx,
- struct sk_buff *skb,
- struct sockaddr_rxrpc *srx)
+int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp)
{
- struct rxrpc_connection *conn;
- struct rxrpc_skb_priv *sp, *nsp;
- struct rxrpc_call *call;
- struct sk_buff *notification;
- int ret;
+ struct rxrpc_backlog *b = rx->backlog;
- _enter("");
+ if (!b) {
+ b = kzalloc(sizeof(struct rxrpc_backlog), gfp);
+ if (!b)
+ return -ENOMEM;
+ rx->backlog = b;
+ }
+
+ if (rx->discard_new_call)
+ return 0;
+
+ while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0)
+ ;
- sp = rxrpc_skb(skb);
+ return 0;
+}
- /* get a notification message to send to the server app */
- notification = alloc_skb(0, GFP_NOFS);
- if (!notification) {
- _debug("no memory");
- ret = -ENOMEM;
- goto error_nofree;
+/*
+ * Discard the preallocation on a service.
+ */
+void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
+{
+ struct rxrpc_backlog *b = rx->backlog;
+ unsigned int size = RXRPC_BACKLOG_MAX, head, tail;
+
+ if (!b)
+ return;
+ rx->backlog = NULL;
+
+ /* Make sure that there aren't any incoming calls in progress before we
+ * clear the preallocation buffers.
+ */
+ spin_lock_bh(&rx->incoming_lock);
+ spin_unlock_bh(&rx->incoming_lock);
+
+ head = b->peer_backlog_head;
+ tail = b->peer_backlog_tail;
+ while (CIRC_CNT(head, tail, size) > 0) {
+ struct rxrpc_peer *peer = b->peer_backlog[tail];
+ kfree(peer);
+ tail = (tail + 1) & (size - 1);
}
- rxrpc_new_skb(notification);
- notification->mark = RXRPC_SKB_MARK_NEW_CALL;
-
- conn = rxrpc_incoming_connection(local, srx, skb);
- if (IS_ERR(conn)) {
- _debug("no conn");
- ret = PTR_ERR(conn);
- goto error;
+
+ head = b->conn_backlog_head;
+ tail = b->conn_backlog_tail;
+ while (CIRC_CNT(head, tail, size) > 0) {
+ struct rxrpc_connection *conn = b->conn_backlog[tail];
+ write_lock(&rxrpc_connection_lock);
+ list_del(&conn->link);
+ list_del(&conn->proc_link);
+ write_unlock(&rxrpc_connection_lock);
+ kfree(conn);
+ tail = (tail + 1) & (size - 1);
}
- call = rxrpc_incoming_call(rx, conn, skb);
- rxrpc_put_connection(conn);
- if (IS_ERR(call)) {
- _debug("no call");
- ret = PTR_ERR(call);
- goto error;
+ head = b->call_backlog_head;
+ tail = b->call_backlog_tail;
+ while (CIRC_CNT(head, tail, size) > 0) {
+ struct rxrpc_call *call = b->call_backlog[tail];
+ if (rx->discard_new_call) {
+ _debug("discard %lx", call->user_call_ID);
+ rx->discard_new_call(call, call->user_call_ID);
+ rxrpc_put_call(call, rxrpc_call_put_kernel);
+ }
+ rxrpc_call_completed(call);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ tail = (tail + 1) & (size - 1);
}
- /* attach the call to the socket */
- read_lock_bh(&local->services_lock);
- if (rx->sk.sk_state == RXRPC_CLOSE)
- goto invalid_service;
+ kfree(b);
+}
- write_lock(&rx->call_lock);
- if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) {
- rxrpc_get_call(call);
-
- spin_lock(&call->conn->state_lock);
- if (sp->hdr.securityIndex > 0 &&
- call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) {
- _debug("await conn sec");
- list_add_tail(&call->accept_link, &rx->secureq);
- call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING;
- set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events);
- rxrpc_queue_conn(call->conn);
- } else {
- _debug("conn ready");
- call->state = RXRPC_CALL_SERVER_ACCEPTING;
- list_add_tail(&call->accept_link, &rx->acceptq);
- rxrpc_get_call_for_skb(call, notification);
- nsp = rxrpc_skb(notification);
- nsp->call = call;
-
- ASSERTCMP(atomic_read(&call->usage), >=, 3);
-
- _debug("notify");
- spin_lock(&call->lock);
- ret = rxrpc_queue_rcv_skb(call, notification, true,
- false);
- spin_unlock(&call->lock);
- notification = NULL;
- BUG_ON(ret < 0);
+/*
+ * Allocate a new incoming call from the prealloc pool, along with a connection
+ * and a peer as necessary.
+ */
+static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
+ struct rxrpc_local *local,
+ struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ struct rxrpc_backlog *b = rx->backlog;
+ struct rxrpc_peer *peer, *xpeer;
+ struct rxrpc_call *call;
+ unsigned short call_head, conn_head, peer_head;
+ unsigned short call_tail, conn_tail, peer_tail;
+ unsigned short call_count, conn_count;
+
+ /* #calls >= #conns >= #peers must hold true. */
+ call_head = smp_load_acquire(&b->call_backlog_head);
+ call_tail = b->call_backlog_tail;
+ call_count = CIRC_CNT(call_head, call_tail, RXRPC_BACKLOG_MAX);
+ conn_head = smp_load_acquire(&b->conn_backlog_head);
+ conn_tail = b->conn_backlog_tail;
+ conn_count = CIRC_CNT(conn_head, conn_tail, RXRPC_BACKLOG_MAX);
+ ASSERTCMP(conn_count, >=, call_count);
+ peer_head = smp_load_acquire(&b->peer_backlog_head);
+ peer_tail = b->peer_backlog_tail;
+ ASSERTCMP(CIRC_CNT(peer_head, peer_tail, RXRPC_BACKLOG_MAX), >=,
+ conn_count);
+
+ if (call_count == 0)
+ return NULL;
+
+ if (!conn) {
+ /* No connection. We're going to need a peer to start off
+ * with. If one doesn't yet exist, use a spare from the
+ * preallocation set. We dump the address into the spare in
+ * anticipation - and to save on stack space.
+ */
+ xpeer = b->peer_backlog[peer_tail];
+ if (rxrpc_extract_addr_from_skb(&xpeer->srx, skb) < 0)
+ return NULL;
+
+ peer = rxrpc_lookup_incoming_peer(local, xpeer);
+ if (peer == xpeer) {
+ b->peer_backlog[peer_tail] = NULL;
+ smp_store_release(&b->peer_backlog_tail,
+ (peer_tail + 1) &
+ (RXRPC_BACKLOG_MAX - 1));
}
- spin_unlock(&call->conn->state_lock);
- _debug("queued");
+ /* Now allocate and set up the connection */
+ conn = b->conn_backlog[conn_tail];
+ b->conn_backlog[conn_tail] = NULL;
+ smp_store_release(&b->conn_backlog_tail,
+ (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
+ rxrpc_get_local(local);
+ conn->params.local = local;
+ conn->params.peer = peer;
+ rxrpc_see_connection(conn);
+ rxrpc_new_incoming_connection(conn, skb);
+ } else {
+ rxrpc_get_connection(conn);
}
- write_unlock(&rx->call_lock);
- _debug("process");
- rxrpc_fast_process_packet(call, skb);
+ /* And now we can allocate and set up a new call */
+ call = b->call_backlog[call_tail];
+ b->call_backlog[call_tail] = NULL;
+ smp_store_release(&b->call_backlog_tail,
+ (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
- _debug("done");
- read_unlock_bh(&local->services_lock);
- rxrpc_free_skb(notification);
- rxrpc_put_call(call);
- _leave(" = 0");
- return 0;
-
-invalid_service:
- _debug("invalid");
- read_unlock_bh(&local->services_lock);
-
- read_lock_bh(&call->state_lock);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
- rxrpc_get_call(call);
- rxrpc_queue_call(call);
- }
- read_unlock_bh(&call->state_lock);
- rxrpc_put_call(call);
- ret = -ECONNREFUSED;
-error:
- rxrpc_free_skb(notification);
-error_nofree:
- _leave(" = %d", ret);
- return ret;
+ rxrpc_see_call(call);
+ call->conn = conn;
+ call->peer = rxrpc_get_peer(conn->params.peer);
+ return call;
}
/*
- * accept incoming calls that need peer, transport and/or connection setting up
- * - the packets we get are all incoming client DATA packets that have seq == 1
+ * Set up a new incoming call. Called in BH context with the RCU read lock
+ * held.
+ *
+ * If this is for a kernel service, when we allocate the call, it will have
+ * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the
+ * retainer ref obtained from the backlog buffer. Prealloc calls for userspace
+ * services only have the ref from the backlog buffer. We want to pass this
+ * ref to non-BH context to dispose of.
+ *
+ * If we want to report an error, we mark the skb with the packet type and
+ * abort code and return NULL.
*/
-void rxrpc_accept_incoming_calls(struct rxrpc_local *local)
+struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
+ struct rxrpc_connection *conn,
+ struct sk_buff *skb)
{
- struct rxrpc_skb_priv *sp;
- struct sockaddr_rxrpc srx;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_sock *rx;
- struct rxrpc_wire_header whdr;
- struct sk_buff *skb;
- int ret;
-
- _enter("%d", local->debug_id);
+ struct rxrpc_call *call;
- skb = skb_dequeue(&local->accept_queue);
- if (!skb) {
- _leave("\n");
- return;
- }
+ _enter("");
- _net("incoming call skb %p", skb);
-
- rxrpc_see_skb(skb);
- sp = rxrpc_skb(skb);
-
- /* Set up a response packet header in case we need it */
- whdr.epoch = htonl(sp->hdr.epoch);
- whdr.cid = htonl(sp->hdr.cid);
- whdr.callNumber = htonl(sp->hdr.callNumber);
- whdr.seq = htonl(sp->hdr.seq);
- whdr.serial = 0;
- whdr.flags = 0;
- whdr.type = 0;
- whdr.userStatus = 0;
- whdr.securityIndex = sp->hdr.securityIndex;
- whdr._rsvd = 0;
- whdr.serviceId = htons(sp->hdr.serviceId);
-
- if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
- goto drop;
-
- /* get the socket providing the service */
- read_lock_bh(&local->services_lock);
- list_for_each_entry(rx, &local->services, listen_link) {
- if (rx->srx.srx_service == sp->hdr.serviceId &&
- rx->sk.sk_state != RXRPC_CLOSE)
+ /* Get the socket providing the service */
+ hlist_for_each_entry_rcu_bh(rx, &local->services, listen_link) {
+ if (rx->srx.srx_service == sp->hdr.serviceId)
goto found_service;
}
- read_unlock_bh(&local->services_lock);
- goto invalid_service;
+
+ trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ RX_INVALID_OPERATION, EOPNOTSUPP);
+ skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
+ skb->priority = RX_INVALID_OPERATION;
+ _leave(" = NULL [service]");
+ return NULL;
found_service:
- _debug("found service %hd", rx->srx.srx_service);
- if (sk_acceptq_is_full(&rx->sk))
- goto backlog_full;
- sk_acceptq_added(&rx->sk);
- sock_hold(&rx->sk);
- read_unlock_bh(&local->services_lock);
-
- ret = rxrpc_accept_incoming_call(local, rx, skb, &srx);
- if (ret < 0)
- sk_acceptq_removed(&rx->sk);
- sock_put(&rx->sk);
- switch (ret) {
- case -ECONNRESET: /* old calls are ignored */
- case -ECONNABORTED: /* aborted calls are reaborted or ignored */
- case 0:
- return;
- case -ECONNREFUSED:
- goto invalid_service;
- case -EBUSY:
- goto busy;
- case -EKEYREJECTED:
- goto security_mismatch;
+ spin_lock(&rx->incoming_lock);
+ if (rx->sk.sk_state == RXRPC_CLOSE) {
+ trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber,
+ sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
+ skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
+ skb->priority = RX_INVALID_OPERATION;
+ _leave(" = NULL [close]");
+ call = NULL;
+ goto out;
+ }
+
+ call = rxrpc_alloc_incoming_call(rx, local, conn, skb);
+ if (!call) {
+ skb->mark = RXRPC_SKB_MARK_BUSY;
+ _leave(" = NULL [busy]");
+ call = NULL;
+ goto out;
+ }
+
+ trace_rxrpc_receive(call, rxrpc_receive_incoming,
+ sp->hdr.serial, sp->hdr.seq);
+
+ /* Make the call live. */
+ rxrpc_incoming_call(rx, call, skb);
+ conn = call->conn;
+
+ if (rx->notify_new_call)
+ rx->notify_new_call(&rx->sk, call, call->user_call_ID);
+ else
+ sk_acceptq_added(&rx->sk);
+
+ spin_lock(&conn->state_lock);
+ switch (conn->state) {
+ case RXRPC_CONN_SERVICE_UNSECURED:
+ conn->state = RXRPC_CONN_SERVICE_CHALLENGING;
+ set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events);
+ rxrpc_queue_conn(call->conn);
+ break;
+
+ case RXRPC_CONN_SERVICE:
+ write_lock(&call->state_lock);
+ if (rx->discard_new_call)
+ call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
+ else
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ write_unlock(&call->state_lock);
+ break;
+
+ case RXRPC_CONN_REMOTELY_ABORTED:
+ rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
+ conn->remote_abort, ECONNABORTED);
+ break;
+ case RXRPC_CONN_LOCALLY_ABORTED:
+ rxrpc_abort_call("CON", call, sp->hdr.seq,
+ conn->local_abort, ECONNABORTED);
+ break;
default:
BUG();
}
+ spin_unlock(&conn->state_lock);
-backlog_full:
- read_unlock_bh(&local->services_lock);
-busy:
- rxrpc_busy(local, &srx, &whdr);
- rxrpc_free_skb(skb);
- return;
+ if (call->state == RXRPC_CALL_SERVER_ACCEPTING)
+ rxrpc_notify_socket(call);
-drop:
- rxrpc_free_skb(skb);
- return;
+ /* We have to discard the prealloc queue's ref here and rely on a
+ * combination of the RCU read lock and refs held either by the socket
+ * (recvmsg queue, to-be-accepted queue or user ID tree) or the kernel
+ * service to prevent the call from being deallocated too early.
+ */
+ rxrpc_put_call(call, rxrpc_call_put);
-invalid_service:
- skb->priority = RX_INVALID_OPERATION;
- rxrpc_reject_packet(local, skb);
- return;
-
- /* can't change connection security type mid-flow */
-security_mismatch:
- skb->priority = RX_PROTOCOL_ERROR;
- rxrpc_reject_packet(local, skb);
- return;
+ _leave(" = %p{%d}", call, call->debug_id);
+out:
+ spin_unlock(&rx->incoming_lock);
+ return call;
}
/*
@@ -299,12 +443,13 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
write_lock(&rx->call_lock);
- ret = -ENODATA;
- if (list_empty(&rx->acceptq))
- goto out;
+ if (list_empty(&rx->to_be_accepted)) {
+ write_unlock(&rx->call_lock);
+ kleave(" = -ENODATA [empty]");
+ return ERR_PTR(-ENODATA);
+ }
/* check the user ID isn't already in use */
- ret = -EBADSLT;
pp = &rx->calls.rb_node;
parent = NULL;
while (*pp) {
@@ -316,11 +461,14 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
else if (user_call_ID > call->user_call_ID)
pp = &(*pp)->rb_right;
else
- goto out;
+ goto id_in_use;
}
- /* dequeue the first call and check it's still valid */
- call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+ /* Dequeue the first call and check it's still valid. We gain
+ * responsibility for the queue's reference.
+ */
+ call = list_entry(rx->to_be_accepted.next,
+ struct rxrpc_call, accept_link);
list_del_init(&call->accept_link);
sk_acceptq_removed(&rx->sk);
rxrpc_see_call(call);
@@ -333,9 +481,6 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
case RXRPC_CALL_COMPLETE:
ret = call->error;
goto out_release;
- case RXRPC_CALL_DEAD:
- ret = -ETIME;
- goto out_discard;
default:
BUG();
}
@@ -343,33 +488,32 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
/* formalise the acceptance */
call->notify_rx = notify_rx;
call->user_call_ID = user_call_ID;
+ rxrpc_get_call(call, rxrpc_call_got_userid);
rb_link_node(&call->sock_node, parent, pp);
rb_insert_color(&call->sock_node, &rx->calls);
if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags))
BUG();
- if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events))
- BUG();
- rxrpc_queue_call(call);
- rxrpc_get_call(call);
write_unlock_bh(&call->state_lock);
write_unlock(&rx->call_lock);
+ rxrpc_notify_socket(call);
+ rxrpc_service_prealloc(rx, GFP_KERNEL);
_leave(" = %p{%d}", call, call->debug_id);
return call;
- /* if the call is already dying or dead, then we leave the socket's ref
- * on it to be released by rxrpc_dead_call_expired() as induced by
- * rxrpc_release_call() */
out_release:
_debug("release %p", call);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- rxrpc_queue_call(call);
-out_discard:
write_unlock_bh(&call->state_lock);
- _debug("discard %p", call);
-out:
write_unlock(&rx->call_lock);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ goto out;
+
+id_in_use:
+ ret = -EBADSLT;
+ write_unlock(&rx->call_lock);
+out:
+ rxrpc_service_prealloc(rx, GFP_KERNEL);
_leave(" = %d", ret);
return ERR_PTR(ret);
}
@@ -381,6 +525,7 @@ out:
int rxrpc_reject_call(struct rxrpc_sock *rx)
{
struct rxrpc_call *call;
+ bool abort = false;
int ret;
_enter("");
@@ -389,12 +534,16 @@ int rxrpc_reject_call(struct rxrpc_sock *rx)
write_lock(&rx->call_lock);
- ret = -ENODATA;
- if (list_empty(&rx->acceptq))
- goto out;
+ if (list_empty(&rx->to_be_accepted)) {
+ write_unlock(&rx->call_lock);
+ return -ENODATA;
+ }
- /* dequeue the first call and check it's still valid */
- call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+ /* Dequeue the first call and check it's still valid. We gain
+ * responsibility for the queue's reference.
+ */
+ call = list_entry(rx->to_be_accepted.next,
+ struct rxrpc_call, accept_link);
list_del_init(&call->accept_link);
sk_acceptq_removed(&rx->sk);
rxrpc_see_call(call);
@@ -402,76 +551,56 @@ int rxrpc_reject_call(struct rxrpc_sock *rx)
write_lock_bh(&call->state_lock);
switch (call->state) {
case RXRPC_CALL_SERVER_ACCEPTING:
- __rxrpc_set_call_completion(call, RXRPC_CALL_SERVER_BUSY,
- 0, ECONNABORTED);
- if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events))
- rxrpc_queue_call(call);
- ret = 0;
- goto out_release;
+ __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, ECONNABORTED);
+ abort = true;
+ /* fall through */
case RXRPC_CALL_COMPLETE:
ret = call->error;
- goto out_release;
- case RXRPC_CALL_DEAD:
- ret = -ETIME;
goto out_discard;
default:
BUG();
}
- /* if the call is already dying or dead, then we leave the socket's ref
- * on it to be released by rxrpc_dead_call_expired() as induced by
- * rxrpc_release_call() */
-out_release:
- _debug("release %p", call);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- rxrpc_queue_call(call);
out_discard:
write_unlock_bh(&call->state_lock);
- _debug("discard %p", call);
-out:
write_unlock(&rx->call_lock);
+ if (abort) {
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ }
+ rxrpc_service_prealloc(rx, GFP_KERNEL);
_leave(" = %d", ret);
return ret;
}
-/**
- * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call
- * @sock: The socket on which the impending call is waiting
- * @user_call_ID: The tag to attach to the call
- * @notify_rx: Where to send notifications instead of socket queue
+/*
+ * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls
+ * @sock: The socket on which to preallocate
+ * @notify_rx: Event notification function for the call
+ * @user_attach_call: Func to attach call to user_call_ID
+ * @user_call_ID: The tag to attach to the preallocated call
+ * @gfp: The allocation conditions.
*
- * Allow a kernel service to accept an incoming call, assuming the incoming
- * call is still valid. The caller should immediately trigger their own
- * notification as there must be data waiting.
- */
-struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock,
- unsigned long user_call_ID,
- rxrpc_notify_rx_t notify_rx)
-{
- struct rxrpc_call *call;
-
- _enter(",%lx", user_call_ID);
- call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID, notify_rx);
- _leave(" = %p", call);
- return call;
-}
-EXPORT_SYMBOL(rxrpc_kernel_accept_call);
-
-/**
- * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call
- * @sock: The socket on which the impending call is waiting
+ * Charge up the socket with preallocated calls, each with a user ID. A
+ * function should be provided to effect the attachment from the user's side.
+ * The user is given a ref to hold on the call.
*
- * Allow a kernel service to reject an incoming call with a BUSY message,
- * assuming the incoming call is still valid.
+ * Note that the call may be come connected before this function returns.
*/
-int rxrpc_kernel_reject_call(struct socket *sock)
+int rxrpc_kernel_charge_accept(struct socket *sock,
+ rxrpc_notify_rx_t notify_rx,
+ rxrpc_user_attach_call_t user_attach_call,
+ unsigned long user_call_ID, gfp_t gfp)
{
- int ret;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ struct rxrpc_backlog *b = rx->backlog;
- _enter("");
- ret = rxrpc_reject_call(rxrpc_sk(sock->sk));
- _leave(" = %d", ret);
- return ret;
+ if (sock->sk->sk_state == RXRPC_CLOSE)
+ return -ESHUTDOWN;
+
+ return rxrpc_service_prealloc_one(rx, b, notify_rx,
+ user_attach_call, user_call_ID,
+ gfp);
}
-EXPORT_SYMBOL(rxrpc_kernel_reject_call);
+EXPORT_SYMBOL(rxrpc_kernel_charge_accept);
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 4754c7fb6242..0e8478012212 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -22,1270 +22,336 @@
#include "ar-internal.h"
/*
- * propose an ACK be sent
+ * Set the timer
*/
-void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
- u16 skew, u32 serial, bool immediate)
+void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why)
{
- unsigned long expiry;
- s8 prior = rxrpc_ack_priority[ack_reason];
+ unsigned long t, now = jiffies;
- ASSERTCMP(prior, >, 0);
+ read_lock_bh(&call->state_lock);
- _enter("{%d},%s,%%%x,%u",
- call->debug_id, rxrpc_acks(ack_reason), serial, immediate);
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ t = call->expire_at;
+ if (time_before_eq(t, now))
+ goto out;
- if (prior < rxrpc_ack_priority[call->ackr_reason]) {
- if (immediate)
- goto cancel_timer;
- return;
- }
+ if (time_after(call->resend_at, now) &&
+ time_before(call->resend_at, t))
+ t = call->resend_at;
- /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
- * numbers */
- if (prior == rxrpc_ack_priority[call->ackr_reason]) {
- if (prior <= 4) {
- call->ackr_skew = skew;
- call->ackr_serial = serial;
- }
- if (immediate)
- goto cancel_timer;
- return;
- }
-
- call->ackr_reason = ack_reason;
- call->ackr_serial = serial;
+ if (time_after(call->ack_at, now) &&
+ time_before(call->ack_at, t))
+ t = call->ack_at;
- switch (ack_reason) {
- case RXRPC_ACK_DELAY:
- _debug("run delay timer");
- expiry = rxrpc_soft_ack_delay;
- goto run_timer;
-
- case RXRPC_ACK_IDLE:
- if (!immediate) {
- _debug("run defer timer");
- expiry = rxrpc_idle_ack_delay;
- goto run_timer;
- }
- goto cancel_timer;
-
- case RXRPC_ACK_REQUESTED:
- expiry = rxrpc_requested_ack_delay;
- if (!expiry)
- goto cancel_timer;
- if (!immediate || serial == 1) {
- _debug("run defer timer");
- goto run_timer;
+ if (call->timer.expires != t || !timer_pending(&call->timer)) {
+ mod_timer(&call->timer, t);
+ trace_rxrpc_timer(call, why, now);
}
-
- default:
- _debug("immediate ACK");
- goto cancel_timer;
}
-run_timer:
- expiry += jiffies;
- if (!timer_pending(&call->ack_timer) ||
- time_after(call->ack_timer.expires, expiry))
- mod_timer(&call->ack_timer, expiry);
- return;
-
-cancel_timer:
- _debug("cancel timer %%%u", serial);
- try_to_del_timer_sync(&call->ack_timer);
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
- rxrpc_queue_call(call);
+out:
read_unlock_bh(&call->state_lock);
}
/*
- * propose an ACK be sent, locking the call structure
+ * propose an ACK be sent
*/
-void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
- u16 skew, u32 serial, bool immediate)
+static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+ u16 skew, u32 serial, bool immediate,
+ bool background,
+ enum rxrpc_propose_ack_trace why)
{
+ enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use;
+ unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay;
s8 prior = rxrpc_ack_priority[ack_reason];
- if (prior > rxrpc_ack_priority[call->ackr_reason]) {
- spin_lock_bh(&call->lock);
- __rxrpc_propose_ACK(call, ack_reason, skew, serial, immediate);
- spin_unlock_bh(&call->lock);
- }
-}
-
-/*
- * set the resend timer
- */
-static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
- unsigned long resend_at)
-{
- read_lock_bh(&call->state_lock);
- if (call->state == RXRPC_CALL_COMPLETE)
- resend = 0;
-
- if (resend & 1) {
- _debug("SET RESEND");
- set_bit(RXRPC_CALL_EV_RESEND, &call->events);
- }
-
- if (resend & 2) {
- _debug("MODIFY RESEND TIMER");
- set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- mod_timer(&call->resend_timer, resend_at);
- } else {
- _debug("KILL RESEND TIMER");
- del_timer_sync(&call->resend_timer);
- clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- }
- read_unlock_bh(&call->state_lock);
-}
-
-/*
- * resend packets
- */
-static void rxrpc_resend(struct rxrpc_call *call)
-{
- struct rxrpc_wire_header *whdr;
- struct rxrpc_skb_priv *sp;
- struct sk_buff *txb;
- unsigned long *p_txb, resend_at;
- bool stop;
- int loop;
- u8 resend;
-
- _enter("{%d,%d,%d,%d},",
- call->acks_hard, call->acks_unacked,
- atomic_read(&call->sequence),
- CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
-
- stop = false;
- resend = 0;
- resend_at = 0;
-
- for (loop = call->acks_tail;
- loop != call->acks_head || stop;
- loop = (loop + 1) & (call->acks_winsz - 1)
- ) {
- p_txb = call->acks_window + loop;
- smp_read_barrier_depends();
- if (*p_txb & 1)
- continue;
-
- txb = (struct sk_buff *) *p_txb;
- sp = rxrpc_skb(txb);
-
- if (sp->need_resend) {
- sp->need_resend = false;
-
- /* each Tx packet has a new serial number */
- sp->hdr.serial = atomic_inc_return(&call->conn->serial);
-
- whdr = (struct rxrpc_wire_header *)txb->head;
- whdr->serial = htonl(sp->hdr.serial);
-
- _proto("Tx DATA %%%u { #%d }",
- sp->hdr.serial, sp->hdr.seq);
- if (rxrpc_send_data_packet(call->conn, txb) < 0) {
- stop = true;
- sp->resend_at = jiffies + 3;
- } else {
- if (rxrpc_is_client_call(call))
- rxrpc_expose_client_call(call);
- sp->resend_at =
- jiffies + rxrpc_resend_timeout;
- }
- }
-
- if (time_after_eq(jiffies + 1, sp->resend_at)) {
- sp->need_resend = true;
- resend |= 1;
- } else if (resend & 2) {
- if (time_before(sp->resend_at, resend_at))
- resend_at = sp->resend_at;
- } else {
- resend_at = sp->resend_at;
- resend |= 2;
- }
- }
-
- rxrpc_set_resend(call, resend, resend_at);
- _leave("");
-}
-
-/*
- * handle resend timer expiry
- */
-static void rxrpc_resend_timer(struct rxrpc_call *call)
-{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *txb;
- unsigned long *p_txb, resend_at;
- int loop;
- u8 resend;
-
- _enter("%d,%d,%d",
- call->acks_tail, call->acks_unacked, call->acks_head);
-
- if (call->state == RXRPC_CALL_COMPLETE)
- return;
-
- resend = 0;
- resend_at = 0;
-
- for (loop = call->acks_unacked;
- loop != call->acks_head;
- loop = (loop + 1) & (call->acks_winsz - 1)
- ) {
- p_txb = call->acks_window + loop;
- smp_read_barrier_depends();
- txb = (struct sk_buff *) (*p_txb & ~1);
- sp = rxrpc_skb(txb);
-
- ASSERT(!(*p_txb & 1));
-
- if (sp->need_resend) {
- ;
- } else if (time_after_eq(jiffies + 1, sp->resend_at)) {
- sp->need_resend = true;
- resend |= 1;
- } else if (resend & 2) {
- if (time_before(sp->resend_at, resend_at))
- resend_at = sp->resend_at;
- } else {
- resend_at = sp->resend_at;
- resend |= 2;
- }
- }
-
- rxrpc_set_resend(call, resend, resend_at);
- _leave("");
-}
-
-/*
- * process soft ACKs of our transmitted packets
- * - these indicate packets the peer has or has not received, but hasn't yet
- * given to the consumer, and so can still be discarded and re-requested
- */
-static int rxrpc_process_soft_ACKs(struct rxrpc_call *call,
- struct rxrpc_ackpacket *ack,
- struct sk_buff *skb)
-{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *txb;
- unsigned long *p_txb, resend_at;
- int loop;
- u8 sacks[RXRPC_MAXACKS], resend;
-
- _enter("{%d,%d},{%d},",
- call->acks_hard,
- CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz),
- ack->nAcks);
-
- if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0)
- goto protocol_error;
-
- resend = 0;
- resend_at = 0;
- for (loop = 0; loop < ack->nAcks; loop++) {
- p_txb = call->acks_window;
- p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1);
- smp_read_barrier_depends();
- txb = (struct sk_buff *) (*p_txb & ~1);
- sp = rxrpc_skb(txb);
-
- switch (sacks[loop]) {
- case RXRPC_ACK_TYPE_ACK:
- sp->need_resend = false;
- *p_txb |= 1;
- break;
- case RXRPC_ACK_TYPE_NACK:
- sp->need_resend = true;
- *p_txb &= ~1;
- resend = 1;
- break;
- default:
- _debug("Unsupported ACK type %d", sacks[loop]);
- goto protocol_error;
- }
- }
-
- smp_mb();
- call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1);
-
- /* anything not explicitly ACK'd is implicitly NACK'd, but may just not
- * have been received or processed yet by the far end */
- for (loop = call->acks_unacked;
- loop != call->acks_head;
- loop = (loop + 1) & (call->acks_winsz - 1)
- ) {
- p_txb = call->acks_window + loop;
- smp_read_barrier_depends();
- txb = (struct sk_buff *) (*p_txb & ~1);
- sp = rxrpc_skb(txb);
-
- if (*p_txb & 1) {
- /* packet must have been discarded */
- sp->need_resend = true;
- *p_txb &= ~1;
- resend |= 1;
- } else if (sp->need_resend) {
- ;
- } else if (time_after_eq(jiffies + 1, sp->resend_at)) {
- sp->need_resend = true;
- resend |= 1;
- } else if (resend & 2) {
- if (time_before(sp->resend_at, resend_at))
- resend_at = sp->resend_at;
- } else {
- resend_at = sp->resend_at;
- resend |= 2;
+ /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
+ * numbers, but we don't alter the timeout.
+ */
+ _debug("prior %u %u vs %u %u",
+ ack_reason, prior,
+ call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]);
+ if (ack_reason == call->ackr_reason) {
+ if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) {
+ outcome = rxrpc_propose_ack_update;
+ call->ackr_serial = serial;
+ call->ackr_skew = skew;
}
+ if (!immediate)
+ goto trace;
+ } else if (prior > rxrpc_ack_priority[call->ackr_reason]) {
+ call->ackr_reason = ack_reason;
+ call->ackr_serial = serial;
+ call->ackr_skew = skew;
+ } else {
+ outcome = rxrpc_propose_ack_subsume;
}
- rxrpc_set_resend(call, resend, resend_at);
- _leave(" = 0");
- return 0;
-
-protocol_error:
- _leave(" = -EPROTO");
- return -EPROTO;
-}
-
-/*
- * discard hard-ACK'd packets from the Tx window
- */
-static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
-{
- unsigned long _skb;
- int tail = call->acks_tail, old_tail;
- int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
+ switch (ack_reason) {
+ case RXRPC_ACK_REQUESTED:
+ if (rxrpc_requested_ack_delay < expiry)
+ expiry = rxrpc_requested_ack_delay;
+ if (serial == 1)
+ immediate = false;
+ break;
- _enter("{%u,%u},%u", call->acks_hard, win, hard);
+ case RXRPC_ACK_DELAY:
+ if (rxrpc_soft_ack_delay < expiry)
+ expiry = rxrpc_soft_ack_delay;
+ break;
- ASSERTCMP(hard - call->acks_hard, <=, win);
+ case RXRPC_ACK_PING:
+ case RXRPC_ACK_IDLE:
+ if (rxrpc_idle_ack_delay < expiry)
+ expiry = rxrpc_idle_ack_delay;
+ break;
- while (call->acks_hard < hard) {
- smp_read_barrier_depends();
- _skb = call->acks_window[tail] & ~1;
- rxrpc_free_skb((struct sk_buff *) _skb);
- old_tail = tail;
- tail = (tail + 1) & (call->acks_winsz - 1);
- call->acks_tail = tail;
- if (call->acks_unacked == old_tail)
- call->acks_unacked = tail;
- call->acks_hard++;
+ default:
+ immediate = true;
+ break;
}
- wake_up(&call->waitq);
-}
-
-/*
- * clear the Tx window in the event of a failure
- */
-static void rxrpc_clear_tx_window(struct rxrpc_call *call)
-{
- rxrpc_rotate_tx_window(call, atomic_read(&call->sequence));
-}
-
-/*
- * drain the out of sequence received packet queue into the packet Rx queue
- */
-static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
-{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
- bool terminal;
- int ret;
-
- _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos);
-
- spin_lock_bh(&call->lock);
-
- ret = -ECONNRESET;
- if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
- goto socket_unavailable;
-
- skb = skb_dequeue(&call->rx_oos_queue);
- if (skb) {
- rxrpc_see_skb(skb);
- sp = rxrpc_skb(skb);
-
- _debug("drain OOS packet %d [%d]",
- sp->hdr.seq, call->rx_first_oos);
-
- if (sp->hdr.seq != call->rx_first_oos) {
- skb_queue_head(&call->rx_oos_queue, skb);
- call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
- _debug("requeue %p {%u}", skb, call->rx_first_oos);
- } else {
- skb->mark = RXRPC_SKB_MARK_DATA;
- terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) &&
- !(sp->hdr.flags & RXRPC_CLIENT_INITIATED));
- ret = rxrpc_queue_rcv_skb(call, skb, true, terminal);
- BUG_ON(ret < 0);
- _debug("drain #%u", call->rx_data_post);
- call->rx_data_post++;
-
- /* find out what the next packet is */
- skb = skb_peek(&call->rx_oos_queue);
- rxrpc_see_skb(skb);
- if (skb)
- call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
- else
- call->rx_first_oos = 0;
- _debug("peek %p {%u}", skb, call->rx_first_oos);
+ now = jiffies;
+ if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
+ _debug("already scheduled");
+ } else if (immediate || expiry == 0) {
+ _debug("immediate ACK %lx", call->events);
+ if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events) &&
+ background)
+ rxrpc_queue_call(call);
+ } else {
+ ack_at = now + expiry;
+ _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now);
+ if (time_before(ack_at, call->ack_at)) {
+ call->ack_at = ack_at;
+ rxrpc_set_timer(call, rxrpc_timer_set_for_ack);
}
}
- ret = 0;
-socket_unavailable:
- spin_unlock_bh(&call->lock);
- _leave(" = %d", ret);
- return ret;
+trace:
+ trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate,
+ background, outcome);
}
/*
- * insert an out of sequence packet into the buffer
+ * propose an ACK be sent, locking the call structure
*/
-static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
- struct sk_buff *skb)
+void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+ u16 skew, u32 serial, bool immediate, bool background,
+ enum rxrpc_propose_ack_trace why)
{
- struct rxrpc_skb_priv *sp, *psp;
- struct sk_buff *p;
- u32 seq;
-
- sp = rxrpc_skb(skb);
- seq = sp->hdr.seq;
- _enter(",,{%u}", seq);
-
- skb->destructor = rxrpc_packet_destructor;
- ASSERTCMP(sp->call, ==, NULL);
- sp->call = call;
- rxrpc_get_call_for_skb(call, skb);
-
- /* insert into the buffer in sequence order */
spin_lock_bh(&call->lock);
-
- skb_queue_walk(&call->rx_oos_queue, p) {
- psp = rxrpc_skb(p);
- if (psp->hdr.seq > seq) {
- _debug("insert oos #%u before #%u", seq, psp->hdr.seq);
- skb_insert(p, skb, &call->rx_oos_queue);
- goto inserted;
- }
- }
-
- _debug("append oos #%u", seq);
- skb_queue_tail(&call->rx_oos_queue, skb);
-inserted:
-
- /* we might now have a new front to the queue */
- if (call->rx_first_oos == 0 || seq < call->rx_first_oos)
- call->rx_first_oos = seq;
-
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE &&
- call->rx_data_post == call->rx_first_oos) {
- _debug("drain rx oos now");
- set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
- }
- read_unlock(&call->state_lock);
-
+ __rxrpc_propose_ACK(call, ack_reason, skew, serial,
+ immediate, background, why);
spin_unlock_bh(&call->lock);
- _leave(" [stored #%u]", call->rx_first_oos);
-}
-
-/*
- * clear the Tx window on final ACK reception
- */
-static void rxrpc_zap_tx_window(struct rxrpc_call *call)
-{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
- unsigned long _skb, *acks_window;
- u8 winsz = call->acks_winsz;
- int tail;
-
- acks_window = call->acks_window;
- call->acks_window = NULL;
-
- while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) {
- tail = call->acks_tail;
- smp_read_barrier_depends();
- _skb = acks_window[tail] & ~1;
- smp_mb();
- call->acks_tail = (call->acks_tail + 1) & (winsz - 1);
-
- skb = (struct sk_buff *) _skb;
- sp = rxrpc_skb(skb);
- _debug("+++ clear Tx %u", sp->hdr.seq);
- rxrpc_free_skb(skb);
- }
-
- kfree(acks_window);
}
/*
- * process the extra information that may be appended to an ACK packet
+ * Handle congestion being detected by the retransmit timeout.
*/
-static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
- unsigned int latest, int nAcks)
+static void rxrpc_congestion_timeout(struct rxrpc_call *call)
{
- struct rxrpc_ackinfo ackinfo;
- struct rxrpc_peer *peer;
- unsigned int mtu;
-
- if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) {
- _leave(" [no ackinfo]");
- return;
- }
-
- _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
- latest,
- ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU),
- ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max));
-
- mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU));
-
- peer = call->peer;
- if (mtu < peer->maxdata) {
- spin_lock_bh(&peer->lock);
- peer->maxdata = mtu;
- peer->mtu = mtu + peer->hdrsize;
- spin_unlock_bh(&peer->lock);
- _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
- }
+ set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags);
}
/*
- * process packets in the reception queue
+ * Perform retransmission of NAK'd and unack'd packets.
*/
-static int rxrpc_process_rx_queue(struct rxrpc_call *call,
- u32 *_abort_code)
+static void rxrpc_resend(struct rxrpc_call *call)
{
- struct rxrpc_ackpacket ack;
struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
- bool post_ACK;
- int latest;
- u32 hard, tx;
-
- _enter("");
-
-process_further:
- skb = skb_dequeue(&call->rx_queue);
- if (!skb)
- return -EAGAIN;
-
- rxrpc_see_skb(skb);
- _net("deferred skb %p", skb);
-
- sp = rxrpc_skb(skb);
+ rxrpc_seq_t cursor, seq, top;
+ ktime_t now = ktime_get_real(), max_age, oldest, resend_at, ack_ts;
+ int ix;
+ u8 annotation, anno_type, retrans = 0, unacked = 0;
- _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state);
+ _enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
- post_ACK = false;
+ max_age = ktime_sub_ms(now, rxrpc_resend_timeout);
- switch (sp->hdr.type) {
- /* data packets that wind up here have been received out of
- * order, need security processing or are jumbo packets */
- case RXRPC_PACKET_TYPE_DATA:
- _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
+ spin_lock_bh(&call->lock);
- /* secured packets must be verified and possibly decrypted */
- if (call->conn->security->verify_packet(call, skb,
- _abort_code) < 0)
- goto protocol_error;
+ cursor = call->tx_hard_ack;
+ top = call->tx_top;
+ ASSERT(before_eq(cursor, top));
+ if (cursor == top)
+ goto out_unlock;
+
+ /* Scan the packet list without dropping the lock and decide which of
+ * the packets in the Tx buffer we're going to resend and what the new
+ * resend timeout will be.
+ */
+ oldest = now;
+ for (seq = cursor + 1; before_eq(seq, top); seq++) {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ annotation &= ~RXRPC_TX_ANNO_MASK;
+ if (anno_type == RXRPC_TX_ANNO_ACK)
+ continue;
- rxrpc_insert_oos_packet(call, skb);
- goto process_further;
+ skb = call->rxtx_buffer[ix];
+ rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
+ sp = rxrpc_skb(skb);
- /* partial ACK to process */
- case RXRPC_PACKET_TYPE_ACK:
- if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) {
- _debug("extraction failure");
- goto protocol_error;
+ if (anno_type == RXRPC_TX_ANNO_UNACK) {
+ if (ktime_after(skb->tstamp, max_age)) {
+ if (ktime_before(skb->tstamp, oldest))
+ oldest = skb->tstamp;
+ continue;
+ }
+ if (!(annotation & RXRPC_TX_ANNO_RESENT))
+ unacked++;
}
- if (!skb_pull(skb, sizeof(ack)))
- BUG();
-
- latest = sp->hdr.serial;
- hard = ntohl(ack.firstPacket);
- tx = atomic_read(&call->sequence);
- _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
- latest,
- ntohs(ack.maxSkew),
- hard,
- ntohl(ack.previousPacket),
- ntohl(ack.serial),
- rxrpc_acks(ack.reason),
- ack.nAcks);
-
- rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks);
+ /* Okay, we need to retransmit a packet. */
+ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation;
+ retrans++;
+ trace_rxrpc_retransmit(call, seq, annotation | anno_type,
+ ktime_to_ns(ktime_sub(skb->tstamp, max_age)));
+ }
+
+ resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout);
+ call->resend_at = jiffies +
+ nsecs_to_jiffies(ktime_to_ns(ktime_sub(resend_at, now))) +
+ 1; /* We have to make sure that the calculated jiffies value
+ * falls at or after the nsec value, or we shall loop
+ * ceaselessly because the timer times out, but we haven't
+ * reached the nsec timeout yet.
+ */
+
+ if (unacked)
+ rxrpc_congestion_timeout(call);
+
+ /* If there was nothing that needed retransmission then it's likely
+ * that an ACK got lost somewhere. Send a ping to find out instead of
+ * retransmitting data.
+ */
+ if (!retrans) {
+ rxrpc_set_timer(call, rxrpc_timer_set_for_resend);
+ spin_unlock_bh(&call->lock);
+ ack_ts = ktime_sub(now, call->acks_latest_ts);
+ if (ktime_to_ns(ack_ts) < call->peer->rtt)
+ goto out;
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false,
+ rxrpc_propose_ack_ping_for_lost_ack);
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+ goto out;
+ }
+
+ /* Now go through the Tx window and perform the retransmissions. We
+ * have to drop the lock for each send. If an ACK comes in whilst the
+ * lock is dropped, it may clear some of the retransmission markers for
+ * packets that it soft-ACKs.
+ */
+ for (seq = cursor + 1; before_eq(seq, top); seq++) {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ if (anno_type != RXRPC_TX_ANNO_RETRANS)
+ continue;
- if (ack.reason == RXRPC_ACK_PING) {
- _proto("Rx ACK %%%u PING Request", latest);
- rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
- skb->priority, sp->hdr.serial, true);
- }
+ skb = call->rxtx_buffer[ix];
+ rxrpc_get_skb(skb, rxrpc_skb_tx_got);
+ spin_unlock_bh(&call->lock);
- /* discard any out-of-order or duplicate ACKs */
- if (latest - call->acks_latest <= 0) {
- _debug("discard ACK %d <= %d",
- latest, call->acks_latest);
- goto discard;
+ if (rxrpc_send_data_packet(call, skb) < 0) {
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ return;
}
- call->acks_latest = latest;
- if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
- call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY &&
- call->state != RXRPC_CALL_SERVER_SEND_REPLY &&
- call->state != RXRPC_CALL_SERVER_AWAIT_ACK)
- goto discard;
+ if (rxrpc_is_client_call(call))
+ rxrpc_expose_client_call(call);
- _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state);
-
- if (hard > 0) {
- if (hard - 1 > tx) {
- _debug("hard-ACK'd packet %d not transmitted"
- " (%d top)",
- hard - 1, tx);
- goto protocol_error;
- }
-
- if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY ||
- call->state == RXRPC_CALL_SERVER_AWAIT_ACK) &&
- hard > tx) {
- call->acks_hard = tx;
- goto all_acked;
- }
-
- smp_rmb();
- rxrpc_rotate_tx_window(call, hard - 1);
- }
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ spin_lock_bh(&call->lock);
- if (ack.nAcks > 0) {
- if (hard - 1 + ack.nAcks > tx) {
- _debug("soft-ACK'd packet %d+%d not"
- " transmitted (%d top)",
- hard - 1, ack.nAcks, tx);
- goto protocol_error;
+ /* We need to clear the retransmit state, but there are two
+ * things we need to be aware of: A new ACK/NAK might have been
+ * received and the packet might have been hard-ACK'd (in which
+ * case it will no longer be in the buffer).
+ */
+ if (after(seq, call->tx_hard_ack)) {
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ if (anno_type == RXRPC_TX_ANNO_RETRANS ||
+ anno_type == RXRPC_TX_ANNO_NAK) {
+ annotation &= ~RXRPC_TX_ANNO_MASK;
+ annotation |= RXRPC_TX_ANNO_UNACK;
}
-
- if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0)
- goto protocol_error;
+ annotation |= RXRPC_TX_ANNO_RESENT;
+ call->rxtx_annotations[ix] = annotation;
}
- goto discard;
-
- /* complete ACK to process */
- case RXRPC_PACKET_TYPE_ACKALL:
- goto all_acked;
-
- /* abort and busy are handled elsewhere */
- case RXRPC_PACKET_TYPE_BUSY:
- case RXRPC_PACKET_TYPE_ABORT:
- BUG();
-
- /* connection level events - also handled elsewhere */
- case RXRPC_PACKET_TYPE_CHALLENGE:
- case RXRPC_PACKET_TYPE_RESPONSE:
- case RXRPC_PACKET_TYPE_DEBUG:
- BUG();
- }
-
- /* if we've had a hard ACK that covers all the packets we've sent, then
- * that ends that phase of the operation */
-all_acked:
- write_lock_bh(&call->state_lock);
- _debug("ack all %d", call->state);
-
- switch (call->state) {
- case RXRPC_CALL_CLIENT_AWAIT_REPLY:
- call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
- break;
- case RXRPC_CALL_SERVER_AWAIT_ACK:
- _debug("srv complete");
- __rxrpc_call_completed(call);
- post_ACK = true;
- break;
- case RXRPC_CALL_CLIENT_SEND_REQUEST:
- case RXRPC_CALL_SERVER_RECV_REQUEST:
- goto protocol_error_unlock; /* can't occur yet */
- default:
- write_unlock_bh(&call->state_lock);
- goto discard; /* assume packet left over from earlier phase */
- }
-
- write_unlock_bh(&call->state_lock);
-
- /* if all the packets we sent are hard-ACK'd, then we can discard
- * whatever we've got left */
- _debug("clear Tx %d",
- CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
- del_timer_sync(&call->resend_timer);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
-
- if (call->acks_window)
- rxrpc_zap_tx_window(call);
-
- if (post_ACK) {
- /* post the final ACK message for userspace to pick up */
- _debug("post ACK");
- skb->mark = RXRPC_SKB_MARK_FINAL_ACK;
- sp->call = call;
- rxrpc_get_call_for_skb(call, skb);
- spin_lock_bh(&call->lock);
- if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0)
- BUG();
- spin_unlock_bh(&call->lock);
- goto process_further;
+ if (after(call->tx_hard_ack, seq))
+ seq = call->tx_hard_ack;
}
-discard:
- rxrpc_free_skb(skb);
- goto process_further;
-
-protocol_error_unlock:
- write_unlock_bh(&call->state_lock);
-protocol_error:
- rxrpc_free_skb(skb);
- _leave(" = -EPROTO");
- return -EPROTO;
-}
-
-/*
- * post a message to the socket Rx queue for recvmsg() to pick up
- */
-static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error,
- bool fatal)
-{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
- int ret;
-
- _enter("{%d,%lx},%u,%u,%d",
- call->debug_id, call->flags, mark, error, fatal);
-
- /* remove timers and things for fatal messages */
- if (fatal) {
- del_timer_sync(&call->resend_timer);
- del_timer_sync(&call->ack_timer);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- }
-
- if (mark != RXRPC_SKB_MARK_NEW_CALL &&
- !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
- _leave("[no userid]");
- return 0;
- }
-
- if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
- skb = alloc_skb(0, GFP_NOFS);
- if (!skb)
- return -ENOMEM;
-
- rxrpc_new_skb(skb);
-
- skb->mark = mark;
-
- sp = rxrpc_skb(skb);
- memset(sp, 0, sizeof(*sp));
- sp->error = error;
- sp->call = call;
- rxrpc_get_call_for_skb(call, skb);
-
- spin_lock_bh(&call->lock);
- ret = rxrpc_queue_rcv_skb(call, skb, true, fatal);
- spin_unlock_bh(&call->lock);
- BUG_ON(ret < 0);
- }
-
- return 0;
+out_unlock:
+ spin_unlock_bh(&call->lock);
+out:
+ _leave("");
}
/*
- * handle background processing of incoming call packets and ACK / abort
- * generation
+ * Handle retransmission and deferred ACK/abort generation.
*/
void rxrpc_process_call(struct work_struct *work)
{
struct rxrpc_call *call =
container_of(work, struct rxrpc_call, processor);
- struct rxrpc_wire_header whdr;
- struct rxrpc_ackpacket ack;
- struct rxrpc_ackinfo ackinfo;
- struct msghdr msg;
- struct kvec iov[5];
- enum rxrpc_call_event genbit;
- unsigned long bits;
- __be32 data, pad;
- size_t len;
- int loop, nbit, ioc, ret, mtu;
- u32 serial, abort_code = RX_PROTOCOL_ERROR;
- u8 *acks = NULL;
+ unsigned long now;
rxrpc_see_call(call);
//printk("\n--------------------\n");
- _enter("{%d,%s,%lx} [%lu]",
- call->debug_id, rxrpc_call_states[call->state], call->events,
- (jiffies - call->creation_jif) / (HZ / 10));
-
- if (!call->conn)
- goto skip_msg_init;
-
- /* there's a good chance we're going to have to send a message, so set
- * one up in advance */
- msg.msg_name = &call->peer->srx.transport;
- msg.msg_namelen = call->peer->srx.transport_len;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = 0;
+ _enter("{%d,%s,%lx}",
+ call->debug_id, rxrpc_call_states[call->state], call->events);
- whdr.epoch = htonl(call->conn->proto.epoch);
- whdr.cid = htonl(call->cid);
- whdr.callNumber = htonl(call->call_id);
- whdr.seq = 0;
- whdr.type = RXRPC_PACKET_TYPE_ACK;
- whdr.flags = call->conn->out_clientflag;
- whdr.userStatus = 0;
- whdr.securityIndex = call->conn->security_ix;
- whdr._rsvd = 0;
- whdr.serviceId = htons(call->service_id);
-
- memset(iov, 0, sizeof(iov));
- iov[0].iov_base = &whdr;
- iov[0].iov_len = sizeof(whdr);
-skip_msg_init:
-
- /* deal with events of a final nature */
- if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) {
- enum rxrpc_skb_mark mark;
-
- clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
- clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
- clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
-
- if (call->completion == RXRPC_CALL_NETWORK_ERROR) {
- mark = RXRPC_SKB_MARK_NET_ERROR;
- _debug("post net error %d", call->error);
- } else {
- mark = RXRPC_SKB_MARK_LOCAL_ERROR;
- _debug("post net local error %d", call->error);
- }
-
- if (rxrpc_post_message(call, mark, call->error, true) < 0)
- goto no_mem;
- clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
- goto kill_ACKs;
+recheck_state:
+ if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+ goto recheck_state;
}
- if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) {
- ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
-
- clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
- clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
-
- _debug("post conn abort");
-
- if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
- call->error, true) < 0)
- goto no_mem;
- clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
- goto kill_ACKs;
+ if (call->state == RXRPC_CALL_COMPLETE) {
+ del_timer_sync(&call->timer);
+ goto out_put;
}
- if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) {
- whdr.type = RXRPC_PACKET_TYPE_BUSY;
- genbit = RXRPC_CALL_EV_REJECT_BUSY;
- goto send_message;
+ now = jiffies;
+ if (time_after_eq(now, call->expire_at)) {
+ rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME);
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ goto recheck_state;
}
- if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
- ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
-
- if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
- call->error, true) < 0)
- goto no_mem;
- whdr.type = RXRPC_PACKET_TYPE_ABORT;
- data = htonl(call->abort_code);
- iov[1].iov_base = &data;
- iov[1].iov_len = sizeof(data);
- genbit = RXRPC_CALL_EV_ABORT;
- goto send_message;
- }
-
- if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) {
- genbit = RXRPC_CALL_EV_ACK_FINAL;
-
- ack.bufferSpace = htons(8);
- ack.maxSkew = 0;
- ack.serial = 0;
- ack.reason = RXRPC_ACK_IDLE;
- ack.nAcks = 0;
- call->ackr_reason = 0;
-
- spin_lock_bh(&call->lock);
- ack.serial = htonl(call->ackr_serial);
- ack.previousPacket = htonl(call->ackr_prev_seq);
- ack.firstPacket = htonl(call->rx_data_eaten + 1);
- spin_unlock_bh(&call->lock);
-
- pad = 0;
-
- iov[1].iov_base = &ack;
- iov[1].iov_len = sizeof(ack);
- iov[2].iov_base = &pad;
- iov[2].iov_len = 3;
- iov[3].iov_base = &ackinfo;
- iov[3].iov_len = sizeof(ackinfo);
- goto send_ACK;
- }
-
- if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) |
- (1 << RXRPC_CALL_EV_RCVD_ABORT))
- ) {
- u32 mark;
-
- if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events))
- mark = RXRPC_SKB_MARK_REMOTE_ABORT;
- else
- mark = RXRPC_SKB_MARK_BUSY;
-
- _debug("post abort/busy");
- rxrpc_clear_tx_window(call);
- if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0)
- goto no_mem;
-
- clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
- clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
- goto kill_ACKs;
- }
-
- if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) {
- _debug("do implicit ackall");
- rxrpc_clear_tx_window(call);
- }
-
- if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) {
- rxrpc_abort_call(call, RX_CALL_TIMEOUT, ETIME);
-
- _debug("post timeout");
- if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
- ETIME, true) < 0)
- goto no_mem;
-
- clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
- goto kill_ACKs;
- }
-
- /* deal with assorted inbound messages */
- if (!skb_queue_empty(&call->rx_queue)) {
- ret = rxrpc_process_rx_queue(call, &abort_code);
- switch (ret) {
- case 0:
- case -EAGAIN:
- break;
- case -ENOMEM:
- goto no_mem;
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EPROTO:
- rxrpc_abort_call(call, abort_code, -ret);
- goto kill_ACKs;
+ if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) ||
+ time_after_eq(now, call->ack_at)) {
+ call->ack_at = call->expire_at;
+ if (call->ackr_reason) {
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+ goto recheck_state;
}
}
- /* handle resending */
- if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
- rxrpc_resend_timer(call);
- if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events))
+ if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) ||
+ time_after_eq(now, call->resend_at)) {
rxrpc_resend(call);
-
- /* consider sending an ordinary ACK */
- if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
- _debug("send ACK: window: %d - %d { %lx }",
- call->rx_data_eaten, call->ackr_win_top,
- call->ackr_window[0]);
-
- if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST &&
- call->ackr_reason != RXRPC_ACK_PING_RESPONSE) {
- /* ACK by sending reply DATA packet in this state */
- clear_bit(RXRPC_CALL_EV_ACK, &call->events);
- goto maybe_reschedule;
- }
-
- genbit = RXRPC_CALL_EV_ACK;
-
- acks = kzalloc(call->ackr_win_top - call->rx_data_eaten,
- GFP_NOFS);
- if (!acks)
- goto no_mem;
-
- //hdr.flags = RXRPC_SLOW_START_OK;
- ack.bufferSpace = htons(8);
- ack.maxSkew = 0;
-
- spin_lock_bh(&call->lock);
- ack.reason = call->ackr_reason;
- ack.serial = htonl(call->ackr_serial);
- ack.previousPacket = htonl(call->ackr_prev_seq);
- ack.firstPacket = htonl(call->rx_data_eaten + 1);
-
- ack.nAcks = 0;
- for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
- nbit = loop * BITS_PER_LONG;
- for (bits = call->ackr_window[loop]; bits; bits >>= 1
- ) {
- _debug("- l=%d n=%d b=%lx", loop, nbit, bits);
- if (bits & 1) {
- acks[nbit] = RXRPC_ACK_TYPE_ACK;
- ack.nAcks = nbit + 1;
- }
- nbit++;
- }
- }
- call->ackr_reason = 0;
- spin_unlock_bh(&call->lock);
-
- pad = 0;
-
- iov[1].iov_base = &ack;
- iov[1].iov_len = sizeof(ack);
- iov[2].iov_base = acks;
- iov[2].iov_len = ack.nAcks;
- iov[3].iov_base = &pad;
- iov[3].iov_len = 3;
- iov[4].iov_base = &ackinfo;
- iov[4].iov_len = sizeof(ackinfo);
-
- switch (ack.reason) {
- case RXRPC_ACK_REQUESTED:
- case RXRPC_ACK_DUPLICATE:
- case RXRPC_ACK_OUT_OF_SEQUENCE:
- case RXRPC_ACK_EXCEEDS_WINDOW:
- case RXRPC_ACK_NOSPACE:
- case RXRPC_ACK_PING:
- case RXRPC_ACK_PING_RESPONSE:
- goto send_ACK_with_skew;
- case RXRPC_ACK_DELAY:
- case RXRPC_ACK_IDLE:
- goto send_ACK;
- }
+ goto recheck_state;
}
- /* handle completion of security negotiations on an incoming
- * connection */
- if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) {
- _debug("secured");
- spin_lock_bh(&call->lock);
-
- if (call->state == RXRPC_CALL_SERVER_SECURING) {
- _debug("securing");
- write_lock(&call->socket->call_lock);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
- _debug("not released");
- call->state = RXRPC_CALL_SERVER_ACCEPTING;
- list_move_tail(&call->accept_link,
- &call->socket->acceptq);
- }
- write_unlock(&call->socket->call_lock);
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE)
- set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
- read_unlock(&call->state_lock);
- }
-
- spin_unlock_bh(&call->lock);
- if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events))
- goto maybe_reschedule;
- }
-
- /* post a notification of an acceptable connection to the app */
- if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) {
- _debug("post accept");
- if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL,
- 0, false) < 0)
- goto no_mem;
- clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
- goto maybe_reschedule;
- }
-
- /* handle incoming call acceptance */
- if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) {
- _debug("accepted");
- ASSERTCMP(call->rx_data_post, ==, 0);
- call->rx_data_post = 1;
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE)
- set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
- read_unlock_bh(&call->state_lock);
- }
-
- /* drain the out of sequence received packet queue into the packet Rx
- * queue */
- if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) {
- while (call->rx_data_post == call->rx_first_oos)
- if (rxrpc_drain_rx_oos_queue(call) < 0)
- break;
- goto maybe_reschedule;
- }
-
- if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
- rxrpc_release_call(call);
- clear_bit(RXRPC_CALL_EV_RELEASE, &call->events);
- }
+ rxrpc_set_timer(call, rxrpc_timer_set_for_resend);
/* other events may have been raised since we started checking */
- goto maybe_reschedule;
-
-send_ACK_with_skew:
- ack.maxSkew = htons(call->ackr_skew);
-send_ACK:
- mtu = call->peer->if_mtu;
- mtu -= call->peer->hdrsize;
- ackinfo.maxMTU = htonl(mtu);
- ackinfo.rwind = htonl(rxrpc_rx_window_size);
-
- /* permit the peer to send us jumbo packets if it wants to */
- ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
- ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max);
-
- serial = atomic_inc_return(&call->conn->serial);
- whdr.serial = htonl(serial);
- _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
- serial,
- ntohs(ack.maxSkew),
- ntohl(ack.firstPacket),
- ntohl(ack.previousPacket),
- ntohl(ack.serial),
- rxrpc_acks(ack.reason),
- ack.nAcks);
-
- del_timer_sync(&call->ack_timer);
- if (ack.nAcks > 0)
- set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags);
- goto send_message_2;
-
-send_message:
- _debug("send message");
-
- serial = atomic_inc_return(&call->conn->serial);
- whdr.serial = htonl(serial);
- _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial);
-send_message_2:
-
- len = iov[0].iov_len;
- ioc = 1;
- if (iov[4].iov_len) {
- ioc = 5;
- len += iov[4].iov_len;
- len += iov[3].iov_len;
- len += iov[2].iov_len;
- len += iov[1].iov_len;
- } else if (iov[3].iov_len) {
- ioc = 4;
- len += iov[3].iov_len;
- len += iov[2].iov_len;
- len += iov[1].iov_len;
- } else if (iov[2].iov_len) {
- ioc = 3;
- len += iov[2].iov_len;
- len += iov[1].iov_len;
- } else if (iov[1].iov_len) {
- ioc = 2;
- len += iov[1].iov_len;
- }
-
- ret = kernel_sendmsg(call->conn->params.local->socket,
- &msg, iov, ioc, len);
- if (ret < 0) {
- _debug("sendmsg failed: %d", ret);
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD)
- rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
- goto error;
- }
-
- switch (genbit) {
- case RXRPC_CALL_EV_ABORT:
- clear_bit(genbit, &call->events);
- clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
- goto kill_ACKs;
-
- case RXRPC_CALL_EV_ACK_FINAL:
- rxrpc_call_completed(call);
- goto kill_ACKs;
-
- default:
- clear_bit(genbit, &call->events);
- switch (call->state) {
- case RXRPC_CALL_CLIENT_AWAIT_REPLY:
- case RXRPC_CALL_CLIENT_RECV_REPLY:
- case RXRPC_CALL_SERVER_RECV_REQUEST:
- case RXRPC_CALL_SERVER_ACK_REQUEST:
- _debug("start ACK timer");
- rxrpc_propose_ACK(call, RXRPC_ACK_DELAY,
- call->ackr_skew, call->ackr_serial,
- false);
- default:
- break;
- }
- goto maybe_reschedule;
- }
-
-kill_ACKs:
- del_timer_sync(&call->ack_timer);
- if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events))
- rxrpc_put_call(call);
- clear_bit(RXRPC_CALL_EV_ACK, &call->events);
-
-maybe_reschedule:
- if (call->events || !skb_queue_empty(&call->rx_queue)) {
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD)
- rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
- }
-
- /* don't leave aborted connections on the accept queue */
- if (call->state >= RXRPC_CALL_COMPLETE &&
- !list_empty(&call->accept_link)) {
- _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
- call, call->events, call->flags, call->conn->proto.cid);
-
- read_lock_bh(&call->state_lock);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
- }
-
-error:
- kfree(acks);
-
- /* because we don't want two CPUs both processing the work item for one
- * call at the same time, we use a flag to note when it's busy; however
- * this means there's a race between clearing the flag and setting the
- * work pending bit and the work item being processed again */
- if (call->events && !work_pending(&call->processor)) {
- _debug("jumpstart %x", call->conn->proto.cid);
- rxrpc_queue_call(call);
+ if (call->events && call->state < RXRPC_CALL_COMPLETE) {
+ __rxrpc_queue_call(call);
+ goto out;
}
+out_put:
+ rxrpc_put_call(call, rxrpc_call_put);
+out:
_leave("");
- return;
-
-no_mem:
- _debug("out of memory");
- goto maybe_reschedule;
}
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 65691742199b..d4b3293b78fa 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -24,18 +24,13 @@
*/
unsigned int rxrpc_max_call_lifetime = 60 * HZ;
-/*
- * Time till dead call expires after last use (in jiffies).
- */
-unsigned int rxrpc_dead_call_expiry = 2 * HZ;
-
const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
[RXRPC_CALL_UNINITIALISED] = "Uninit ",
[RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn",
[RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq",
[RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl",
[RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl",
- [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK",
+ [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc",
[RXRPC_CALL_SERVER_SECURING] = "SvSecure",
[RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept",
[RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq",
@@ -43,27 +38,49 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
[RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl",
[RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK",
[RXRPC_CALL_COMPLETE] = "Complete",
- [RXRPC_CALL_DEAD] = "Dead ",
};
const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
[RXRPC_CALL_SUCCEEDED] = "Complete",
- [RXRPC_CALL_SERVER_BUSY] = "SvBusy ",
[RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort",
[RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort",
[RXRPC_CALL_LOCAL_ERROR] = "LocError",
[RXRPC_CALL_NETWORK_ERROR] = "NetError",
};
+const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = {
+ [rxrpc_call_new_client] = "NWc",
+ [rxrpc_call_new_service] = "NWs",
+ [rxrpc_call_queued] = "QUE",
+ [rxrpc_call_queued_ref] = "QUR",
+ [rxrpc_call_connected] = "CON",
+ [rxrpc_call_release] = "RLS",
+ [rxrpc_call_seen] = "SEE",
+ [rxrpc_call_got] = "GOT",
+ [rxrpc_call_got_userid] = "Gus",
+ [rxrpc_call_got_kernel] = "Gke",
+ [rxrpc_call_put] = "PUT",
+ [rxrpc_call_put_userid] = "Pus",
+ [rxrpc_call_put_kernel] = "Pke",
+ [rxrpc_call_put_noqueue] = "PNQ",
+ [rxrpc_call_error] = "*E*",
+};
+
struct kmem_cache *rxrpc_call_jar;
LIST_HEAD(rxrpc_calls);
DEFINE_RWLOCK(rxrpc_call_lock);
-static void rxrpc_destroy_call(struct work_struct *work);
-static void rxrpc_call_life_expired(unsigned long _call);
-static void rxrpc_dead_call_expired(unsigned long _call);
-static void rxrpc_ack_time_expired(unsigned long _call);
-static void rxrpc_resend_time_expired(unsigned long _call);
+static void rxrpc_call_timer_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *)_call;
+
+ _enter("%d", call->debug_id);
+
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies);
+ rxrpc_queue_call(call);
+ }
+}
/*
* find an extant server call
@@ -96,7 +113,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx,
return NULL;
found_extant_call:
- rxrpc_get_call(call);
+ rxrpc_get_call(call, rxrpc_call_got);
read_unlock(&rx->call_lock);
_leave(" = %p [%d]", call, atomic_read(&call->usage));
return call;
@@ -105,7 +122,7 @@ found_extant_call:
/*
* allocate a new call
*/
-static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
+struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
{
struct rxrpc_call *call;
@@ -113,30 +130,24 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
if (!call)
return NULL;
- call->acks_winsz = 16;
- call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long),
+ call->rxtx_buffer = kcalloc(RXRPC_RXTX_BUFF_SIZE,
+ sizeof(struct sk_buff *),
gfp);
- if (!call->acks_window) {
- kmem_cache_free(rxrpc_call_jar, call);
- return NULL;
- }
+ if (!call->rxtx_buffer)
+ goto nomem;
+
+ call->rxtx_annotations = kcalloc(RXRPC_RXTX_BUFF_SIZE, sizeof(u8), gfp);
+ if (!call->rxtx_annotations)
+ goto nomem_2;
- setup_timer(&call->lifetimer, &rxrpc_call_life_expired,
- (unsigned long) call);
- setup_timer(&call->deadspan, &rxrpc_dead_call_expired,
- (unsigned long) call);
- setup_timer(&call->ack_timer, &rxrpc_ack_time_expired,
- (unsigned long) call);
- setup_timer(&call->resend_timer, &rxrpc_resend_time_expired,
- (unsigned long) call);
- INIT_WORK(&call->destroyer, &rxrpc_destroy_call);
+ setup_timer(&call->timer, rxrpc_call_timer_expired,
+ (unsigned long)call);
INIT_WORK(&call->processor, &rxrpc_process_call);
INIT_LIST_HEAD(&call->link);
INIT_LIST_HEAD(&call->chan_wait_link);
INIT_LIST_HEAD(&call->accept_link);
- skb_queue_head_init(&call->rx_queue);
- skb_queue_head_init(&call->rx_oos_queue);
- skb_queue_head_init(&call->knlrecv_queue);
+ INIT_LIST_HEAD(&call->recvmsg_link);
+ INIT_LIST_HEAD(&call->sock_link);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->lock);
rwlock_init(&call->state_lock);
@@ -145,65 +156,65 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
memset(&call->sock_node, 0xed, sizeof(call->sock_node));
- call->rx_data_expect = 1;
- call->rx_data_eaten = 0;
- call->rx_first_oos = 0;
- call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size;
- call->creation_jif = jiffies;
+ /* Leave space in the ring to handle a maxed-out jumbo packet */
+ call->rx_winsize = rxrpc_rx_window_size;
+ call->tx_winsize = 16;
+ call->rx_expect_next = 1;
+
+ if (RXRPC_TX_SMSS > 2190)
+ call->cong_cwnd = 2;
+ else if (RXRPC_TX_SMSS > 1095)
+ call->cong_cwnd = 3;
+ else
+ call->cong_cwnd = 4;
+ call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1;
return call;
+
+nomem_2:
+ kfree(call->rxtx_buffer);
+nomem:
+ kmem_cache_free(rxrpc_call_jar, call);
+ return NULL;
}
/*
* Allocate a new client call.
*/
-static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
- struct sockaddr_rxrpc *srx,
+static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx,
gfp_t gfp)
{
struct rxrpc_call *call;
+ ktime_t now;
_enter("");
- ASSERT(rx->local != NULL);
-
call = rxrpc_alloc_call(gfp);
if (!call)
return ERR_PTR(-ENOMEM);
call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
-
- sock_hold(&rx->sk);
- call->socket = rx;
- call->rx_data_post = 1;
call->service_id = srx->srx_service;
+ call->tx_phase = true;
+ now = ktime_get_real();
+ call->acks_latest_ts = now;
+ call->cong_tstamp = now;
_leave(" = %p", call);
return call;
}
/*
- * Begin client call.
+ * Initiate the call ack/resend/expiry timer.
*/
-static int rxrpc_begin_client_call(struct rxrpc_call *call,
- struct rxrpc_conn_parameters *cp,
- struct sockaddr_rxrpc *srx,
- gfp_t gfp)
+static void rxrpc_start_call_timer(struct rxrpc_call *call)
{
- int ret;
-
- /* Set up or get a connection record and set the protocol parameters,
- * including channel number and call ID.
- */
- ret = rxrpc_connect_call(call, cp, srx, gfp);
- if (ret < 0)
- return ret;
-
- spin_lock(&call->conn->params.peer->lock);
- hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets);
- spin_unlock(&call->conn->params.peer->lock);
-
- call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
- add_timer(&call->lifetimer);
- return 0;
+ unsigned long expire_at;
+
+ expire_at = jiffies + rxrpc_max_call_lifetime;
+ call->expire_at = expire_at;
+ call->ack_at = expire_at;
+ call->resend_at = expire_at;
+ call->timer.expires = expire_at + 1;
+ rxrpc_set_timer(call, rxrpc_timer_begin);
}
/*
@@ -223,19 +234,16 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
_enter("%p,%lx", rx, user_call_ID);
- call = rxrpc_alloc_client_call(rx, srx, gfp);
+ call = rxrpc_alloc_client_call(srx, gfp);
if (IS_ERR(call)) {
_leave(" = %ld", PTR_ERR(call));
return call;
}
- trace_rxrpc_call(call, 0, atomic_read(&call->usage), 0, here,
- (const void *)user_call_ID);
+ trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
+ here, (const void *)user_call_ID);
/* Publish the call, even though it is incompletely set up as yet */
- call->user_call_ID = user_call_ID;
- __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
-
write_lock(&rx->call_lock);
pp = &rx->calls.rb_node;
@@ -249,192 +257,138 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
else if (user_call_ID > xcall->user_call_ID)
pp = &(*pp)->rb_right;
else
- goto found_user_ID_now_present;
+ goto error_dup_user_ID;
}
- rxrpc_get_call(call);
-
+ rcu_assign_pointer(call->socket, rx);
+ call->user_call_ID = user_call_ID;
+ __set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+ rxrpc_get_call(call, rxrpc_call_got_userid);
rb_link_node(&call->sock_node, parent, pp);
rb_insert_color(&call->sock_node, &rx->calls);
+ list_add(&call->sock_link, &rx->sock_calls);
+
write_unlock(&rx->call_lock);
- write_lock_bh(&rxrpc_call_lock);
+ write_lock(&rxrpc_call_lock);
list_add_tail(&call->link, &rxrpc_calls);
- write_unlock_bh(&rxrpc_call_lock);
+ write_unlock(&rxrpc_call_lock);
- ret = rxrpc_begin_client_call(call, cp, srx, gfp);
+ /* Set up or get a connection record and set the protocol parameters,
+ * including channel number and call ID.
+ */
+ ret = rxrpc_connect_call(call, cp, srx, gfp);
if (ret < 0)
goto error;
- _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
+ trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage),
+ here, ERR_PTR(ret));
- _leave(" = %p [new]", call);
- return call;
+ spin_lock_bh(&call->conn->params.peer->lock);
+ hlist_add_head(&call->error_link,
+ &call->conn->params.peer->error_targets);
+ spin_unlock_bh(&call->conn->params.peer->lock);
-error:
- write_lock(&rx->call_lock);
- rb_erase(&call->sock_node, &rx->calls);
- write_unlock(&rx->call_lock);
- rxrpc_put_call(call);
+ rxrpc_start_call_timer(call);
- write_lock_bh(&rxrpc_call_lock);
- list_del_init(&call->link);
- write_unlock_bh(&rxrpc_call_lock);
+ _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
- set_bit(RXRPC_CALL_RELEASED, &call->flags);
- call->state = RXRPC_CALL_DEAD;
- rxrpc_put_call(call);
- _leave(" = %d", ret);
- return ERR_PTR(ret);
+ _leave(" = %p [new]", call);
+ return call;
/* We unexpectedly found the user ID in the list after taking
* the call_lock. This shouldn't happen unless the user races
* with itself and tries to add the same user ID twice at the
* same time in different threads.
*/
-found_user_ID_now_present:
+error_dup_user_ID:
write_unlock(&rx->call_lock);
- set_bit(RXRPC_CALL_RELEASED, &call->flags);
- call->state = RXRPC_CALL_DEAD;
- rxrpc_put_call(call);
- _leave(" = -EEXIST [%p]", call);
- return ERR_PTR(-EEXIST);
+ ret = -EEXIST;
+
+error:
+ __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
+ RX_CALL_DEAD, ret);
+ trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage),
+ here, ERR_PTR(ret));
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
}
/*
- * set up an incoming call
- * - called in process context with IRQs enabled
+ * Set up an incoming call. call->conn points to the connection.
+ * This is called in BH context and isn't allowed to fail.
*/
-struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
- struct rxrpc_connection *conn,
- struct sk_buff *skb)
+void rxrpc_incoming_call(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct sk_buff *skb)
{
+ struct rxrpc_connection *conn = call->conn;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_call *call, *candidate;
- const void *here = __builtin_return_address(0);
- u32 call_id, chan;
-
- _enter(",%d", conn->debug_id);
-
- ASSERT(rx != NULL);
-
- candidate = rxrpc_alloc_call(GFP_NOIO);
- if (!candidate)
- return ERR_PTR(-EBUSY);
-
- trace_rxrpc_call(candidate, 1, atomic_read(&candidate->usage),
- 0, here, NULL);
-
- chan = sp->hdr.cid & RXRPC_CHANNELMASK;
- candidate->socket = rx;
- candidate->conn = conn;
- candidate->peer = conn->params.peer;
- candidate->cid = sp->hdr.cid;
- candidate->call_id = sp->hdr.callNumber;
- candidate->rx_data_post = 0;
- candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
- candidate->flags |= (1 << RXRPC_CALL_IS_SERVICE);
- if (conn->security_ix > 0)
- candidate->state = RXRPC_CALL_SERVER_SECURING;
-
- spin_lock(&conn->channel_lock);
-
- /* set the channel for this call */
- call = rcu_dereference_protected(conn->channels[chan].call,
- lockdep_is_held(&conn->channel_lock));
-
- _debug("channel[%u] is %p", candidate->cid & RXRPC_CHANNELMASK, call);
- if (call && call->call_id == sp->hdr.callNumber) {
- /* already set; must've been a duplicate packet */
- _debug("extant call [%d]", call->state);
- ASSERTCMP(call->conn, ==, conn);
-
- read_lock(&call->state_lock);
- switch (call->state) {
- case RXRPC_CALL_LOCALLY_ABORTED:
- if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
- rxrpc_queue_call(call);
- case RXRPC_CALL_REMOTELY_ABORTED:
- read_unlock(&call->state_lock);
- goto aborted_call;
- default:
- rxrpc_get_call(call);
- read_unlock(&call->state_lock);
- goto extant_call;
- }
- }
-
- if (call) {
- /* it seems the channel is still in use from the previous call
- * - ditch the old binding if its call is now complete */
- _debug("CALL: %u { %s }",
- call->debug_id, rxrpc_call_states[call->state]);
-
- if (call->state == RXRPC_CALL_COMPLETE) {
- __rxrpc_disconnect_call(conn, call);
- } else {
- spin_unlock(&conn->channel_lock);
- kmem_cache_free(rxrpc_call_jar, candidate);
- _leave(" = -EBUSY");
- return ERR_PTR(-EBUSY);
- }
- }
-
- /* check the call number isn't duplicate */
- _debug("check dup");
- call_id = sp->hdr.callNumber;
-
- /* We just ignore calls prior to the current call ID. Terminated calls
- * are handled via the connection.
+ u32 chan;
+
+ _enter(",%d", call->conn->debug_id);
+
+ rcu_assign_pointer(call->socket, rx);
+ call->call_id = sp->hdr.callNumber;
+ call->service_id = sp->hdr.serviceId;
+ call->cid = sp->hdr.cid;
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ if (sp->hdr.securityIndex > 0)
+ call->state = RXRPC_CALL_SERVER_SECURING;
+ call->cong_tstamp = skb->tstamp;
+
+ /* Set the channel for this call. We don't get channel_lock as we're
+ * only defending against the data_ready handler (which we're called
+ * from) and the RESPONSE packet parser (which is only really
+ * interested in call_counter and can cope with a disagreement with the
+ * call pointer).
*/
- if (call_id <= conn->channels[chan].call_counter)
- goto old_call; /* TODO: Just drop packet */
-
- /* make the call available */
- _debug("new call");
- call = candidate;
- candidate = NULL;
- conn->channels[chan].call_counter = call_id;
+ chan = sp->hdr.cid & RXRPC_CHANNELMASK;
+ conn->channels[chan].call_counter = call->call_id;
+ conn->channels[chan].call_id = call->call_id;
rcu_assign_pointer(conn->channels[chan].call, call);
- sock_hold(&rx->sk);
- rxrpc_get_connection(conn);
- rxrpc_get_peer(call->peer);
- spin_unlock(&conn->channel_lock);
spin_lock(&conn->params.peer->lock);
hlist_add_head(&call->error_link, &conn->params.peer->error_targets);
spin_unlock(&conn->params.peer->lock);
- write_lock_bh(&rxrpc_call_lock);
- list_add_tail(&call->link, &rxrpc_calls);
- write_unlock_bh(&rxrpc_call_lock);
-
- call->service_id = conn->params.service_id;
-
_net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
- call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
- add_timer(&call->lifetimer);
- _leave(" = %p {%d} [new]", call, call->debug_id);
- return call;
+ rxrpc_start_call_timer(call);
+ _leave("");
+}
-extant_call:
- spin_unlock(&conn->channel_lock);
- kmem_cache_free(rxrpc_call_jar, candidate);
- _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1);
- return call;
+/*
+ * Queue a call's work processor, getting a ref to pass to the work queue.
+ */
+bool rxrpc_queue_call(struct rxrpc_call *call)
+{
+ const void *here = __builtin_return_address(0);
+ int n = __atomic_add_unless(&call->usage, 1, 0);
+ if (n == 0)
+ return false;
+ if (rxrpc_queue_work(&call->processor))
+ trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL);
+ else
+ rxrpc_put_call(call, rxrpc_call_put_noqueue);
+ return true;
+}
-aborted_call:
- spin_unlock(&conn->channel_lock);
- kmem_cache_free(rxrpc_call_jar, candidate);
- _leave(" = -ECONNABORTED");
- return ERR_PTR(-ECONNABORTED);
-
-old_call:
- spin_unlock(&conn->channel_lock);
- kmem_cache_free(rxrpc_call_jar, candidate);
- _leave(" = -ECONNRESET [old]");
- return ERR_PTR(-ECONNRESET);
+/*
+ * Queue a call's work processor, passing the callers ref to the work queue.
+ */
+bool __rxrpc_queue_call(struct rxrpc_call *call)
+{
+ const void *here = __builtin_return_address(0);
+ int n = atomic_read(&call->usage);
+ ASSERTCMP(n, >=, 1);
+ if (rxrpc_queue_work(&call->processor))
+ trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL);
+ else
+ rxrpc_put_call(call, rxrpc_call_put_noqueue);
+ return true;
}
/*
@@ -445,157 +399,88 @@ void rxrpc_see_call(struct rxrpc_call *call)
const void *here = __builtin_return_address(0);
if (call) {
int n = atomic_read(&call->usage);
- int m = atomic_read(&call->skb_count);
- trace_rxrpc_call(call, 2, n, m, here, 0);
+ trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL);
}
}
/*
* Note the addition of a ref on a call.
*/
-void rxrpc_get_call(struct rxrpc_call *call)
+void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
{
const void *here = __builtin_return_address(0);
int n = atomic_inc_return(&call->usage);
- int m = atomic_read(&call->skb_count);
- trace_rxrpc_call(call, 3, n, m, here, 0);
+ trace_rxrpc_call(call, op, n, here, NULL);
}
/*
- * Note the addition of a ref on a call for a socket buffer.
+ * Detach a call from its owning socket.
*/
-void rxrpc_get_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb)
+void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
{
const void *here = __builtin_return_address(0);
- int n = atomic_inc_return(&call->usage);
- int m = atomic_inc_return(&call->skb_count);
-
- trace_rxrpc_call(call, 4, n, m, here, skb);
-}
-
-/*
- * detach a call from a socket and set up for release
- */
-void rxrpc_release_call(struct rxrpc_call *call)
-{
struct rxrpc_connection *conn = call->conn;
- struct rxrpc_sock *rx = call->socket;
+ bool put = false;
+ int i;
- _enter("{%d,%d,%d,%d}",
- call->debug_id, atomic_read(&call->usage),
- atomic_read(&call->ackr_not_idle),
- call->rx_first_oos);
+ _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage));
- rxrpc_see_call(call);
+ trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage),
+ here, (const void *)call->flags);
+
+ ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
spin_lock_bh(&call->lock);
if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags))
BUG();
spin_unlock_bh(&call->lock);
- /* dissociate from the socket
- * - the socket's ref on the call is passed to the death timer
- */
- _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
+ del_timer_sync(&call->timer);
- spin_lock(&conn->params.peer->lock);
- hlist_del_init(&call->error_link);
- spin_unlock(&conn->params.peer->lock);
+ /* Make sure we don't get any more notifications */
+ write_lock_bh(&rx->recvmsg_lock);
- write_lock_bh(&rx->call_lock);
- if (!list_empty(&call->accept_link)) {
+ if (!list_empty(&call->recvmsg_link)) {
_debug("unlinking once-pending call %p { e=%lx f=%lx }",
call, call->events, call->flags);
- ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
- list_del_init(&call->accept_link);
- sk_acceptq_removed(&rx->sk);
- } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
- rb_erase(&call->sock_node, &rx->calls);
- memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
- clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+ list_del(&call->recvmsg_link);
+ put = true;
}
- write_unlock_bh(&rx->call_lock);
- /* free up the channel for reuse */
- write_lock_bh(&call->state_lock);
+ /* list_empty() must return false in rxrpc_notify_socket() */
+ call->recvmsg_link.next = NULL;
+ call->recvmsg_link.prev = NULL;
- if (call->state < RXRPC_CALL_COMPLETE &&
- call->state != RXRPC_CALL_CLIENT_FINAL_ACK) {
- _debug("+++ ABORTING STATE %d +++\n", call->state);
- __rxrpc_abort_call(call, RX_CALL_DEAD, ECONNRESET);
- }
- write_unlock_bh(&call->state_lock);
-
- rxrpc_disconnect_call(call);
-
- /* clean up the Rx queue */
- if (!skb_queue_empty(&call->rx_queue) ||
- !skb_queue_empty(&call->rx_oos_queue)) {
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
-
- _debug("purge Rx queues");
-
- spin_lock_bh(&call->lock);
- while ((skb = skb_dequeue(&call->rx_queue)) ||
- (skb = skb_dequeue(&call->rx_oos_queue))) {
- spin_unlock_bh(&call->lock);
-
- sp = rxrpc_skb(skb);
- _debug("- zap %s %%%u #%u",
- rxrpc_pkts[sp->hdr.type],
- sp->hdr.serial, sp->hdr.seq);
- rxrpc_free_skb(skb);
- spin_lock_bh(&call->lock);
- }
- spin_unlock_bh(&call->lock);
- }
+ write_unlock_bh(&rx->recvmsg_lock);
+ if (put)
+ rxrpc_put_call(call, rxrpc_call_put);
- del_timer_sync(&call->resend_timer);
- del_timer_sync(&call->ack_timer);
- del_timer_sync(&call->lifetimer);
- call->deadspan.expires = jiffies + rxrpc_dead_call_expiry;
- add_timer(&call->deadspan);
+ write_lock(&rx->call_lock);
- _leave("");
-}
+ if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ rb_erase(&call->sock_node, &rx->calls);
+ memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
+ rxrpc_put_call(call, rxrpc_call_put_userid);
+ }
-/*
- * handle a dead call being ready for reaping
- */
-static void rxrpc_dead_call_expired(unsigned long _call)
-{
- struct rxrpc_call *call = (struct rxrpc_call *) _call;
+ list_del(&call->sock_link);
+ write_unlock(&rx->call_lock);
- _enter("{%d}", call->debug_id);
+ _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
- rxrpc_see_call(call);
- write_lock_bh(&call->state_lock);
- call->state = RXRPC_CALL_DEAD;
- write_unlock_bh(&call->state_lock);
- rxrpc_put_call(call);
-}
+ if (conn)
+ rxrpc_disconnect_call(call);
-/*
- * mark a call as to be released, aborting it if it's still in progress
- * - called with softirqs disabled
- */
-static void rxrpc_mark_call_released(struct rxrpc_call *call)
-{
- bool sched = false;
-
- rxrpc_see_call(call);
- write_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD) {
- sched = __rxrpc_abort_call(call, RX_CALL_DEAD, ECONNRESET);
- if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- sched = true;
+ for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) {
+ rxrpc_free_skb(call->rxtx_buffer[i],
+ (call->tx_phase ? rxrpc_skb_tx_cleaned :
+ rxrpc_skb_rx_cleaned));
+ call->rxtx_buffer[i] = NULL;
}
- write_unlock(&call->state_lock);
- if (sched)
- rxrpc_queue_call(call);
+
+ _leave("");
}
/*
@@ -604,70 +489,52 @@ static void rxrpc_mark_call_released(struct rxrpc_call *call)
void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
{
struct rxrpc_call *call;
- struct rb_node *p;
_enter("%p", rx);
- read_lock_bh(&rx->call_lock);
-
- /* kill the not-yet-accepted incoming calls */
- list_for_each_entry(call, &rx->secureq, accept_link) {
- rxrpc_mark_call_released(call);
- }
-
- list_for_each_entry(call, &rx->acceptq, accept_link) {
- rxrpc_mark_call_released(call);
+ while (!list_empty(&rx->to_be_accepted)) {
+ call = list_entry(rx->to_be_accepted.next,
+ struct rxrpc_call, accept_link);
+ list_del(&call->accept_link);
+ rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET);
+ rxrpc_put_call(call, rxrpc_call_put);
}
- /* mark all the calls as no longer wanting incoming packets */
- for (p = rb_first(&rx->calls); p; p = rb_next(p)) {
- call = rb_entry(p, struct rxrpc_call, sock_node);
- rxrpc_mark_call_released(call);
+ while (!list_empty(&rx->sock_calls)) {
+ call = list_entry(rx->sock_calls.next,
+ struct rxrpc_call, sock_link);
+ rxrpc_get_call(call, rxrpc_call_got);
+ rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET);
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+ rxrpc_release_call(rx, call);
+ rxrpc_put_call(call, rxrpc_call_put);
}
- read_unlock_bh(&rx->call_lock);
_leave("");
}
/*
* release a call
*/
-void rxrpc_put_call(struct rxrpc_call *call)
+void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
{
const void *here = __builtin_return_address(0);
- int n, m;
+ int n;
ASSERT(call != NULL);
n = atomic_dec_return(&call->usage);
- m = atomic_read(&call->skb_count);
- trace_rxrpc_call(call, 5, n, m, here, NULL);
+ trace_rxrpc_call(call, op, n, here, NULL);
ASSERTCMP(n, >=, 0);
if (n == 0) {
_debug("call %d dead", call->debug_id);
- WARN_ON(m != 0);
- ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
- rxrpc_queue_work(&call->destroyer);
- }
-}
+ ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
-/*
- * Release a call ref held by a socket buffer.
- */
-void rxrpc_put_call_for_skb(struct rxrpc_call *call, struct sk_buff *skb)
-{
- const void *here = __builtin_return_address(0);
- int n, m;
+ write_lock(&rxrpc_call_lock);
+ list_del_init(&call->link);
+ write_unlock(&rxrpc_call_lock);
- n = atomic_dec_return(&call->usage);
- m = atomic_dec_return(&call->skb_count);
- trace_rxrpc_call(call, 6, n, m, here, skb);
- ASSERTCMP(n, >=, 0);
- if (n == 0) {
- _debug("call %d dead", call->debug_id);
- WARN_ON(m != 0);
- ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
- rxrpc_queue_work(&call->destroyer);
+ rxrpc_cleanup_call(call);
}
}
@@ -678,99 +545,53 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
{
struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
- rxrpc_purge_queue(&call->rx_queue);
- rxrpc_purge_queue(&call->knlrecv_queue);
rxrpc_put_peer(call->peer);
+ kfree(call->rxtx_buffer);
+ kfree(call->rxtx_annotations);
kmem_cache_free(rxrpc_call_jar, call);
}
/*
* clean up a call
*/
-static void rxrpc_cleanup_call(struct rxrpc_call *call)
+void rxrpc_cleanup_call(struct rxrpc_call *call)
{
- _net("DESTROY CALL %d", call->debug_id);
+ int i;
- ASSERT(call->socket);
+ _net("DESTROY CALL %d", call->debug_id);
memset(&call->sock_node, 0xcd, sizeof(call->sock_node));
- del_timer_sync(&call->lifetimer);
- del_timer_sync(&call->deadspan);
- del_timer_sync(&call->ack_timer);
- del_timer_sync(&call->resend_timer);
+ del_timer_sync(&call->timer);
+ ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
- ASSERTCMP(call->events, ==, 0);
- if (work_pending(&call->processor)) {
- _debug("defer destroy");
- rxrpc_queue_work(&call->destroyer);
- return;
- }
-
ASSERTCMP(call->conn, ==, NULL);
- if (call->acks_window) {
- _debug("kill Tx window %d",
- CIRC_CNT(call->acks_head, call->acks_tail,
- call->acks_winsz));
- smp_mb();
- while (CIRC_CNT(call->acks_head, call->acks_tail,
- call->acks_winsz) > 0) {
- struct rxrpc_skb_priv *sp;
- unsigned long _skb;
-
- _skb = call->acks_window[call->acks_tail] & ~1;
- sp = rxrpc_skb((struct sk_buff *)_skb);
- _debug("+++ clear Tx %u", sp->hdr.seq);
- rxrpc_free_skb((struct sk_buff *)_skb);
- call->acks_tail =
- (call->acks_tail + 1) & (call->acks_winsz - 1);
- }
-
- kfree(call->acks_window);
- }
+ /* Clean up the Rx/Tx buffer */
+ for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++)
+ rxrpc_free_skb(call->rxtx_buffer[i],
+ (call->tx_phase ? rxrpc_skb_tx_cleaned :
+ rxrpc_skb_rx_cleaned));
- rxrpc_free_skb(call->tx_pending);
+ rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned);
- rxrpc_purge_queue(&call->rx_queue);
- ASSERT(skb_queue_empty(&call->rx_oos_queue));
- rxrpc_purge_queue(&call->knlrecv_queue);
- sock_put(&call->socket->sk);
call_rcu(&call->rcu, rxrpc_rcu_destroy_call);
}
/*
- * destroy a call
- */
-static void rxrpc_destroy_call(struct work_struct *work)
-{
- struct rxrpc_call *call =
- container_of(work, struct rxrpc_call, destroyer);
-
- _enter("%p{%d,%x,%p}",
- call, atomic_read(&call->usage), call->cid, call->conn);
-
- ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
-
- write_lock_bh(&rxrpc_call_lock);
- list_del_init(&call->link);
- write_unlock_bh(&rxrpc_call_lock);
-
- rxrpc_cleanup_call(call);
- _leave("");
-}
-
-/*
- * preemptively destroy all the call records from a transport endpoint rather
- * than waiting for them to time out
+ * Make sure that all calls are gone.
*/
void __exit rxrpc_destroy_all_calls(void)
{
struct rxrpc_call *call;
_enter("");
- write_lock_bh(&rxrpc_call_lock);
+
+ if (list_empty(&rxrpc_calls))
+ return;
+
+ write_lock(&rxrpc_call_lock);
while (!list_empty(&rxrpc_calls)) {
call = list_entry(rxrpc_calls.next, struct rxrpc_call, link);
@@ -779,87 +600,15 @@ void __exit rxrpc_destroy_all_calls(void)
rxrpc_see_call(call);
list_del_init(&call->link);
- switch (atomic_read(&call->usage)) {
- case 0:
- ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
- break;
- case 1:
- if (del_timer_sync(&call->deadspan) != 0 &&
- call->state != RXRPC_CALL_DEAD)
- rxrpc_dead_call_expired((unsigned long) call);
- if (call->state != RXRPC_CALL_DEAD)
- break;
- default:
- pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n",
- call, atomic_read(&call->usage),
- atomic_read(&call->ackr_not_idle),
- rxrpc_call_states[call->state],
- call->flags, call->events);
- if (!skb_queue_empty(&call->rx_queue))
- pr_err("Rx queue occupied\n");
- if (!skb_queue_empty(&call->rx_oos_queue))
- pr_err("OOS queue occupied\n");
- break;
- }
-
- write_unlock_bh(&rxrpc_call_lock);
+ pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n",
+ call, atomic_read(&call->usage),
+ rxrpc_call_states[call->state],
+ call->flags, call->events);
+
+ write_unlock(&rxrpc_call_lock);
cond_resched();
- write_lock_bh(&rxrpc_call_lock);
+ write_lock(&rxrpc_call_lock);
}
- write_unlock_bh(&rxrpc_call_lock);
- _leave("");
-}
-
-/*
- * handle call lifetime being exceeded
- */
-static void rxrpc_call_life_expired(unsigned long _call)
-{
- struct rxrpc_call *call = (struct rxrpc_call *) _call;
-
- _enter("{%d}", call->debug_id);
-
- rxrpc_see_call(call);
- if (call->state >= RXRPC_CALL_COMPLETE)
- return;
-
- set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
- rxrpc_queue_call(call);
-}
-
-/*
- * handle resend timer expiry
- * - may not take call->state_lock as this can deadlock against del_timer_sync()
- */
-static void rxrpc_resend_time_expired(unsigned long _call)
-{
- struct rxrpc_call *call = (struct rxrpc_call *) _call;
-
- _enter("{%d}", call->debug_id);
-
- rxrpc_see_call(call);
- if (call->state >= RXRPC_CALL_COMPLETE)
- return;
-
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
- rxrpc_queue_call(call);
-}
-
-/*
- * handle ACK timer expiry
- */
-static void rxrpc_ack_time_expired(unsigned long _call)
-{
- struct rxrpc_call *call = (struct rxrpc_call *) _call;
-
- _enter("{%d}", call->debug_id);
-
- rxrpc_see_call(call);
- if (call->state >= RXRPC_CALL_COMPLETE)
- return;
-
- if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
- rxrpc_queue_call(call);
+ write_unlock(&rxrpc_call_lock);
}
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 82de1aeaef21..c76a125df891 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -105,6 +105,14 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *);
static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap,
rxrpc_discard_expired_client_conns);
+const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = {
+ [RXRPC_CONN_CLIENT_INACTIVE] = "Inac",
+ [RXRPC_CONN_CLIENT_WAITING] = "Wait",
+ [RXRPC_CONN_CLIENT_ACTIVE] = "Actv",
+ [RXRPC_CONN_CLIENT_CULLED] = "Cull",
+ [RXRPC_CONN_CLIENT_IDLE] = "Idle",
+};
+
/*
* Get a connection ID and epoch for a client connection from the global pool.
* The connection struct pointer is then recorded in the idr radix tree. The
@@ -220,6 +228,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
rxrpc_get_local(conn->params.local);
key_get(conn->params.key);
+ trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage),
+ __builtin_return_address(0));
+ trace_rxrpc_client(conn, -1, rxrpc_client_alloc);
_leave(" = %p", conn);
return conn;
@@ -348,6 +359,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call,
if (cp->exclusive) {
call->conn = candidate;
+ call->security_ix = candidate->security_ix;
_leave(" = 0 [exclusive %d]", candidate->debug_id);
return 0;
}
@@ -384,6 +396,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call,
rb_replace_node(&conn->client_node,
&candidate->client_node,
&local->client_conns);
+ trace_rxrpc_client(conn, -1, rxrpc_client_replace);
goto candidate_published;
}
}
@@ -395,6 +408,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call,
candidate_published:
set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
call->conn = candidate;
+ call->security_ix = candidate->security_ix;
spin_unlock(&local->client_conns_lock);
_leave(" = 0 [new %d]", candidate->debug_id);
return 0;
@@ -407,11 +421,15 @@ found_extant_conn:
_debug("found conn");
spin_unlock(&local->client_conns_lock);
- rxrpc_put_connection(candidate);
- candidate = NULL;
+ if (candidate) {
+ trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate);
+ rxrpc_put_connection(candidate);
+ candidate = NULL;
+ }
spin_lock(&conn->channel_lock);
call->conn = conn;
+ call->security_ix = conn->security_ix;
list_add(&call->chan_wait_link, &conn->waiting_calls);
spin_unlock(&conn->channel_lock);
_leave(" = 0 [extant %d]", conn->debug_id);
@@ -430,6 +448,7 @@ error:
*/
static void rxrpc_activate_conn(struct rxrpc_connection *conn)
{
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_active);
conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE;
rxrpc_nr_active_client_conns++;
list_move_tail(&conn->cache_link, &rxrpc_active_client_conns);
@@ -459,8 +478,10 @@ static void rxrpc_animate_client_conn(struct rxrpc_connection *conn)
spin_lock(&rxrpc_client_conn_cache_lock);
nr_conns = rxrpc_nr_client_conns;
- if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags))
+ if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
+ trace_rxrpc_client(conn, -1, rxrpc_client_count);
rxrpc_nr_client_conns = nr_conns + 1;
+ }
switch (conn->cache_state) {
case RXRPC_CONN_CLIENT_ACTIVE:
@@ -491,6 +512,7 @@ activate_conn:
wait_for_capacity:
_debug("wait");
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns);
goto out_unlock;
@@ -521,6 +543,8 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
struct rxrpc_call, chan_wait_link);
u32 call_id = chan->call_counter + 1;
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate);
+
write_lock_bh(&call->state_lock);
call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
write_unlock_bh(&call->state_lock);
@@ -560,6 +584,8 @@ static void rxrpc_activate_channels(struct rxrpc_connection *conn)
_enter("%d", conn->debug_id);
+ trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans);
+
if (conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE ||
conn->active_chans == RXRPC_ACTIVE_CHANS_MASK)
return;
@@ -654,10 +680,13 @@ int rxrpc_connect_call(struct rxrpc_call *call,
* had a chance at re-use (the per-connection security negotiation is
* expensive).
*/
-static void rxrpc_expose_client_conn(struct rxrpc_connection *conn)
+static void rxrpc_expose_client_conn(struct rxrpc_connection *conn,
+ unsigned int channel)
{
- if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags))
+ if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
+ trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
rxrpc_get_connection(conn);
+ }
}
/*
@@ -666,9 +695,9 @@ static void rxrpc_expose_client_conn(struct rxrpc_connection *conn)
*/
void rxrpc_expose_client_call(struct rxrpc_call *call)
{
+ unsigned int channel = call->cid & RXRPC_CHANNELMASK;
struct rxrpc_connection *conn = call->conn;
- struct rxrpc_channel *chan =
- &conn->channels[call->cid & RXRPC_CHANNELMASK];
+ struct rxrpc_channel *chan = &conn->channels[channel];
if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
/* Mark the call ID as being used. If the callNumber counter
@@ -679,7 +708,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
chan->call_counter++;
if (chan->call_counter >= INT_MAX)
set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
- rxrpc_expose_client_conn(conn);
+ rxrpc_expose_client_conn(conn, channel);
}
}
@@ -692,6 +721,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
struct rxrpc_connection *conn = call->conn;
struct rxrpc_channel *chan = &conn->channels[channel];
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
call->conn = NULL;
spin_lock(&conn->channel_lock);
@@ -706,6 +736,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags));
list_del_init(&call->chan_wait_link);
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted);
+
/* We must deactivate or idle the connection if it's now
* waiting for nothing.
*/
@@ -718,7 +750,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
}
ASSERTCMP(rcu_access_pointer(chan->call), ==, call);
- ASSERTCMP(atomic_read(&conn->usage), >=, 2);
/* If a client call was exposed to the world, we save the result for
* retransmission.
@@ -737,7 +768,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
/* See if we can pass the channel directly to another call. */
if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE &&
!list_empty(&conn->waiting_calls)) {
- _debug("pass chan");
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
rxrpc_activate_one_channel(conn, channel);
goto out_2;
}
@@ -760,7 +791,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
goto out;
}
- _debug("pass chan 2");
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
rxrpc_activate_one_channel(conn, channel);
goto out;
@@ -792,7 +823,7 @@ idle_connection:
* immediately or moved to the idle list for a short while.
*/
if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
- _debug("make idle");
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_idle);
conn->idle_timestamp = jiffies;
conn->cache_state = RXRPC_CONN_CLIENT_IDLE;
list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns);
@@ -802,7 +833,7 @@ idle_connection:
&rxrpc_client_conn_reap,
rxrpc_conn_idle_client_expiry);
} else {
- _debug("make inactive");
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive);
conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
list_del_init(&conn->cache_link);
}
@@ -815,10 +846,12 @@ idle_connection:
static struct rxrpc_connection *
rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
{
- struct rxrpc_connection *next;
+ struct rxrpc_connection *next = NULL;
struct rxrpc_local *local = conn->params.local;
unsigned int nr_conns;
+ trace_rxrpc_client(conn, -1, rxrpc_client_cleanup);
+
if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) {
spin_lock(&local->client_conns_lock);
if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS,
@@ -831,24 +864,23 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE);
- if (!test_bit(RXRPC_CONN_COUNTED, &conn->flags))
- return NULL;
-
- spin_lock(&rxrpc_client_conn_cache_lock);
- nr_conns = --rxrpc_nr_client_conns;
+ if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
+ trace_rxrpc_client(conn, -1, rxrpc_client_uncount);
+ spin_lock(&rxrpc_client_conn_cache_lock);
+ nr_conns = --rxrpc_nr_client_conns;
+
+ if (nr_conns < rxrpc_max_client_connections &&
+ !list_empty(&rxrpc_waiting_client_conns)) {
+ next = list_entry(rxrpc_waiting_client_conns.next,
+ struct rxrpc_connection, cache_link);
+ rxrpc_get_connection(next);
+ rxrpc_activate_conn(next);
+ }
- next = NULL;
- if (nr_conns < rxrpc_max_client_connections &&
- !list_empty(&rxrpc_waiting_client_conns)) {
- next = list_entry(rxrpc_waiting_client_conns.next,
- struct rxrpc_connection, cache_link);
- rxrpc_get_connection(next);
- rxrpc_activate_conn(next);
+ spin_unlock(&rxrpc_client_conn_cache_lock);
}
- spin_unlock(&rxrpc_client_conn_cache_lock);
rxrpc_kill_connection(conn);
-
if (next)
rxrpc_activate_channels(next);
@@ -863,20 +895,18 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
*/
void rxrpc_put_client_conn(struct rxrpc_connection *conn)
{
- struct rxrpc_connection *next;
+ const void *here = __builtin_return_address(0);
+ int n;
do {
- _enter("%p{u=%d,d=%d}",
- conn, atomic_read(&conn->usage), conn->debug_id);
-
- next = rxrpc_put_one_client_conn(conn);
-
- if (!next)
- break;
- conn = next;
- } while (atomic_dec_and_test(&conn->usage));
-
- _leave("");
+ n = atomic_dec_return(&conn->usage);
+ trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here);
+ if (n > 0)
+ return;
+ ASSERTCMP(n, >=, 0);
+
+ conn = rxrpc_put_one_client_conn(conn);
+ } while (conn);
}
/*
@@ -907,9 +937,11 @@ static void rxrpc_cull_active_client_conns(void)
ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE);
if (list_empty(&conn->waiting_calls)) {
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_culled);
conn->cache_state = RXRPC_CONN_CLIENT_CULLED;
list_del_init(&conn->cache_link);
} else {
+ trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
list_move_tail(&conn->cache_link,
&rxrpc_waiting_client_conns);
@@ -983,7 +1015,7 @@ next:
goto not_yet_expired;
}
- _debug("discard conn %d", conn->debug_id);
+ trace_rxrpc_client(conn, -1, rxrpc_client_discard);
if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags))
BUG();
conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 9db90f4f768d..37609ce89f52 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -15,10 +15,6 @@
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/errqueue.h>
-#include <linux/udp.h>
-#include <linux/in.h>
-#include <linux/in6.h>
-#include <linux/icmp.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <net/ip.h>
@@ -101,6 +97,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
pkt.info.maxMTU = htonl(mtu);
pkt.info.rwind = htonl(rxrpc_rx_window_size);
pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max);
+ pkt.whdr.flags |= RXRPC_SLOW_START_OK;
len += sizeof(pkt.ack) + sizeof(pkt.info);
break;
}
@@ -123,6 +120,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
_proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort);
break;
case RXRPC_PACKET_TYPE_ACK:
+ trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0,
+ RXRPC_ACK_DUPLICATE, 0);
_proto("Tx ACK %%%u [re]", serial);
break;
}
@@ -140,16 +139,10 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
u32 abort_code, int error)
{
struct rxrpc_call *call;
- bool queue;
- int i, bit;
+ int i;
_enter("{%d},%x", conn->debug_id, abort_code);
- if (compl == RXRPC_CALL_LOCALLY_ABORTED)
- bit = RXRPC_CALL_EV_CONN_ABORT;
- else
- bit = RXRPC_CALL_EV_RCVD_ABORT;
-
spin_lock(&conn->channel_lock);
for (i = 0; i < RXRPC_MAXCALLS; i++) {
@@ -157,16 +150,13 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
conn->channels[i].call,
lockdep_is_held(&conn->channel_lock));
if (call) {
- rxrpc_see_call(call);
- write_lock_bh(&call->state_lock);
- if (rxrpc_set_call_completion(call, compl, abort_code,
- error)) {
- set_bit(bit, &call->events);
- queue = true;
- }
- write_unlock_bh(&call->state_lock);
- if (queue)
- rxrpc_queue_call(call);
+ if (compl == RXRPC_CALL_LOCALLY_ABORTED)
+ trace_rxrpc_abort("CON", call->cid,
+ call->call_id, 0,
+ abort_code, error);
+ if (rxrpc_set_call_completion(call, compl,
+ abort_code, error))
+ rxrpc_notify_socket(call);
}
}
@@ -245,17 +235,18 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
/*
* mark a call as being on a now-secured channel
- * - must be called with softirqs disabled
+ * - must be called with BH's disabled.
*/
static void rxrpc_call_is_secure(struct rxrpc_call *call)
{
_enter("%p", call);
if (call) {
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events))
- rxrpc_queue_call(call);
- read_unlock(&call->state_lock);
+ write_lock_bh(&call->state_lock);
+ if (call->state == RXRPC_CALL_SERVER_SECURING) {
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ rxrpc_notify_socket(call);
+ }
+ write_unlock_bh(&call->state_lock);
}
}
@@ -272,7 +263,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
int loop, ret;
if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
- kleave(" = -ECONNABORTED [%u]", conn->state);
+ _leave(" = -ECONNABORTED [%u]", conn->state);
return -ECONNABORTED;
}
@@ -285,14 +276,14 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
return 0;
case RXRPC_PACKET_TYPE_ABORT:
- if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
+ if (skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) < 0)
return -EPROTO;
abort_code = ntohl(wtmp);
_proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
conn->state = RXRPC_CONN_REMOTELY_ABORTED;
- rxrpc_abort_calls(conn, 0, RXRPC_CALL_REMOTELY_ABORTED,
- abort_code);
+ rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
+ abort_code, ECONNABORTED);
return -ECONNABORTED;
case RXRPC_PACKET_TYPE_CHALLENGE:
@@ -317,14 +308,16 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) {
conn->state = RXRPC_CONN_SERVICE;
+ spin_unlock(&conn->state_lock);
for (loop = 0; loop < RXRPC_MAXCALLS; loop++)
rxrpc_call_is_secure(
rcu_dereference_protected(
conn->channels[loop].call,
lockdep_is_held(&conn->channel_lock)));
+ } else {
+ spin_unlock(&conn->state_lock);
}
- spin_unlock(&conn->state_lock);
spin_unlock(&conn->channel_lock);
return 0;
@@ -387,7 +380,7 @@ void rxrpc_process_connection(struct work_struct *work)
u32 abort_code = RX_PROTOCOL_ERROR;
int ret;
- _enter("{%d}", conn->debug_id);
+ rxrpc_see_connection(conn);
if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events))
rxrpc_secure_connection(conn);
@@ -395,7 +388,7 @@ void rxrpc_process_connection(struct work_struct *work)
/* go through the conn-level event packets, releasing the ref on this
* connection that each one has when we've finished with it */
while ((skb = skb_dequeue(&conn->rx_queue))) {
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
ret = rxrpc_process_event(conn, skb, &abort_code);
switch (ret) {
case -EPROTO:
@@ -406,7 +399,7 @@ void rxrpc_process_connection(struct work_struct *work)
goto requeue_and_leave;
case -ECONNABORTED:
default:
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
break;
}
}
@@ -423,92 +416,7 @@ requeue_and_leave:
protocol_error:
if (rxrpc_abort_connection(conn, -ret, abort_code) < 0)
goto requeue_and_leave;
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
_leave(" [EPROTO]");
goto out;
}
-
-/*
- * put a packet up for transport-level abort
- */
-void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
-{
- CHECK_SLAB_OKAY(&local->usage);
-
- skb_queue_tail(&local->reject_queue, skb);
- rxrpc_queue_local(local);
-}
-
-/*
- * reject packets through the local endpoint
- */
-void rxrpc_reject_packets(struct rxrpc_local *local)
-{
- union {
- struct sockaddr sa;
- struct sockaddr_in sin;
- } sa;
- struct rxrpc_skb_priv *sp;
- struct rxrpc_wire_header whdr;
- struct sk_buff *skb;
- struct msghdr msg;
- struct kvec iov[2];
- size_t size;
- __be32 code;
-
- _enter("%d", local->debug_id);
-
- iov[0].iov_base = &whdr;
- iov[0].iov_len = sizeof(whdr);
- iov[1].iov_base = &code;
- iov[1].iov_len = sizeof(code);
- size = sizeof(whdr) + sizeof(code);
-
- msg.msg_name = &sa;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = 0;
-
- memset(&sa, 0, sizeof(sa));
- sa.sa.sa_family = local->srx.transport.family;
- switch (sa.sa.sa_family) {
- case AF_INET:
- msg.msg_namelen = sizeof(sa.sin);
- break;
- default:
- msg.msg_namelen = 0;
- break;
- }
-
- memset(&whdr, 0, sizeof(whdr));
- whdr.type = RXRPC_PACKET_TYPE_ABORT;
-
- while ((skb = skb_dequeue(&local->reject_queue))) {
- rxrpc_see_skb(skb);
- sp = rxrpc_skb(skb);
- switch (sa.sa.sa_family) {
- case AF_INET:
- sa.sin.sin_port = udp_hdr(skb)->source;
- sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
- code = htonl(skb->priority);
-
- whdr.epoch = htonl(sp->hdr.epoch);
- whdr.cid = htonl(sp->hdr.cid);
- whdr.callNumber = htonl(sp->hdr.callNumber);
- whdr.serviceId = htons(sp->hdr.serviceId);
- whdr.flags = sp->hdr.flags;
- whdr.flags ^= RXRPC_CLIENT_INITIATED;
- whdr.flags &= RXRPC_CLIENT_INITIATED;
-
- kernel_sendmsg(local->socket, &msg, iov, 2, size);
- break;
-
- default:
- break;
- }
-
- rxrpc_free_skb(skb);
- }
-
- _leave("");
-}
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 9c6685b97e70..e1e83af47866 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -53,7 +53,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
spin_lock_init(&conn->state_lock);
conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
conn->size_align = 4;
- conn->header_size = sizeof(struct rxrpc_wire_header);
conn->idle_timestamp = jiffies;
}
@@ -134,6 +133,16 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
srx.transport.sin.sin_addr.s_addr)
goto not_found;
break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ if (peer->srx.transport.sin6.sin6_port !=
+ srx.transport.sin6.sin6_port ||
+ memcmp(&peer->srx.transport.sin6.sin6_addr,
+ &srx.transport.sin6.sin6_addr,
+ sizeof(struct in6_addr)) != 0)
+ goto not_found;
+ break;
+#endif
default:
BUG();
}
@@ -169,7 +178,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
chan->last_abort = call->abort_code;
chan->last_type = RXRPC_PACKET_TYPE_ABORT;
} else {
- chan->last_seq = call->rx_data_eaten;
+ chan->last_seq = call->rx_hard_ack;
chan->last_type = RXRPC_PACKET_TYPE_ACK;
}
/* Sync with rxrpc_conn_retransmit(). */
@@ -191,6 +200,10 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
{
struct rxrpc_connection *conn = call->conn;
+ spin_lock_bh(&conn->params.peer->lock);
+ hlist_del_init(&call->error_link);
+ spin_unlock_bh(&conn->params.peer->lock);
+
if (rxrpc_is_client_call(call))
return rxrpc_disconnect_client_call(call);
@@ -232,11 +245,77 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn)
}
/*
- * release a virtual connection
+ * Queue a connection's work processor, getting a ref to pass to the work
+ * queue.
*/
-void __rxrpc_put_connection(struct rxrpc_connection *conn)
+bool rxrpc_queue_conn(struct rxrpc_connection *conn)
{
- rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+ const void *here = __builtin_return_address(0);
+ int n = __atomic_add_unless(&conn->usage, 1, 0);
+ if (n == 0)
+ return false;
+ if (rxrpc_queue_work(&conn->processor))
+ trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here);
+ else
+ rxrpc_put_connection(conn);
+ return true;
+}
+
+/*
+ * Note the re-emergence of a connection.
+ */
+void rxrpc_see_connection(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ if (conn) {
+ int n = atomic_read(&conn->usage);
+
+ trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here);
+ }
+}
+
+/*
+ * Get a ref on a connection.
+ */
+void rxrpc_get_connection(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ int n = atomic_inc_return(&conn->usage);
+
+ trace_rxrpc_conn(conn, rxrpc_conn_got, n, here);
+}
+
+/*
+ * Try to get a ref on a connection.
+ */
+struct rxrpc_connection *
+rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+
+ if (conn) {
+ int n = __atomic_add_unless(&conn->usage, 1, 0);
+ if (n > 0)
+ trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here);
+ else
+ conn = NULL;
+ }
+ return conn;
+}
+
+/*
+ * Release a service connection
+ */
+void rxrpc_put_service_conn(struct rxrpc_connection *conn)
+{
+ const void *here = __builtin_return_address(0);
+ int n;
+
+ n = atomic_dec_return(&conn->usage);
+ trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here);
+ ASSERTCMP(n, >=, 0);
+ if (n == 0)
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
}
/*
@@ -286,6 +365,8 @@ static void rxrpc_connection_reaper(struct work_struct *work)
ASSERTCMP(atomic_read(&conn->usage), >, 0);
if (likely(atomic_read(&conn->usage) > 1))
continue;
+ if (conn->state == RXRPC_CONN_SERVICE_PREALLOC)
+ continue;
idle_timestamp = READ_ONCE(conn->idle_timestamp);
_debug("reap CONN %d { u=%d,t=%ld }",
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
index 316a92107fee..eef551f40dc2 100644
--- a/net/rxrpc/conn_service.c
+++ b/net/rxrpc/conn_service.c
@@ -65,9 +65,8 @@ done:
* Insert a service connection into a peer's tree, thereby making it a target
* for incoming packets.
*/
-static struct rxrpc_connection *
-rxrpc_publish_service_conn(struct rxrpc_peer *peer,
- struct rxrpc_connection *conn)
+static void rxrpc_publish_service_conn(struct rxrpc_peer *peer,
+ struct rxrpc_connection *conn)
{
struct rxrpc_connection *cursor = NULL;
struct rxrpc_conn_proto k = conn->proto;
@@ -96,7 +95,7 @@ conn_published:
set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags);
write_sequnlock_bh(&peer->service_conn_lock);
_leave(" = %d [new]", conn->debug_id);
- return conn;
+ return;
found_extant_conn:
if (atomic_read(&cursor->usage) == 0)
@@ -119,106 +118,58 @@ replace_old_connection:
}
/*
- * get a record of an incoming connection
+ * Preallocate a service connection. The connection is placed on the proc and
+ * reap lists so that we don't have to get the lock from BH context.
*/
-struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local,
- struct sockaddr_rxrpc *srx,
- struct sk_buff *skb)
+struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp)
{
- struct rxrpc_connection *conn;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_peer *peer;
- const char *new = "old";
-
- _enter("");
+ struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp);
- peer = rxrpc_lookup_peer(local, srx, GFP_NOIO);
- if (!peer) {
- _debug("no peer");
- return ERR_PTR(-EBUSY);
- }
+ if (conn) {
+ /* We maintain an extra ref on the connection whilst it is on
+ * the rxrpc_connections list.
+ */
+ conn->state = RXRPC_CONN_SERVICE_PREALLOC;
+ atomic_set(&conn->usage, 2);
- ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED);
-
- rcu_read_lock();
- peer = rxrpc_lookup_peer_rcu(local, srx);
- if (peer) {
- conn = rxrpc_find_service_conn_rcu(peer, skb);
- if (conn) {
- if (sp->hdr.securityIndex != conn->security_ix)
- goto security_mismatch_rcu;
- if (rxrpc_get_connection_maybe(conn))
- goto found_extant_connection_rcu;
-
- /* The conn has expired but we can't remove it without
- * the appropriate lock, so we attempt to replace it
- * when we have a new candidate.
- */
- }
+ write_lock(&rxrpc_connection_lock);
+ list_add_tail(&conn->link, &rxrpc_connections);
+ list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list);
+ write_unlock(&rxrpc_connection_lock);
- if (!rxrpc_get_peer_maybe(peer))
- peer = NULL;
+ trace_rxrpc_conn(conn, rxrpc_conn_new_service,
+ atomic_read(&conn->usage),
+ __builtin_return_address(0));
}
- rcu_read_unlock();
- if (!peer) {
- peer = rxrpc_lookup_peer(local, srx, GFP_NOIO);
- if (!peer)
- goto enomem;
- }
+ return conn;
+}
- /* We don't have a matching record yet. */
- conn = rxrpc_alloc_connection(GFP_NOIO);
- if (!conn)
- goto enomem_peer;
+/*
+ * Set up an incoming connection. This is called in BH context with the RCU
+ * read lock held.
+ */
+void rxrpc_new_incoming_connection(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ _enter("");
conn->proto.epoch = sp->hdr.epoch;
conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK;
- conn->params.local = local;
- conn->params.peer = peer;
conn->params.service_id = sp->hdr.serviceId;
conn->security_ix = sp->hdr.securityIndex;
conn->out_clientflag = 0;
- conn->state = RXRPC_CONN_SERVICE;
- if (conn->params.service_id)
+ if (conn->security_ix)
conn->state = RXRPC_CONN_SERVICE_UNSECURED;
-
- rxrpc_get_local(local);
-
- /* We maintain an extra ref on the connection whilst it is on
- * the rxrpc_connections list.
- */
- atomic_set(&conn->usage, 2);
-
- write_lock(&rxrpc_connection_lock);
- list_add_tail(&conn->link, &rxrpc_connections);
- list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list);
- write_unlock(&rxrpc_connection_lock);
+ else
+ conn->state = RXRPC_CONN_SERVICE;
/* Make the connection a target for incoming packets. */
- rxrpc_publish_service_conn(peer, conn);
-
- new = "new";
-
-success:
- _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid);
- _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
- return conn;
-
-found_extant_connection_rcu:
- rcu_read_unlock();
- goto success;
-
-security_mismatch_rcu:
- rcu_read_unlock();
- _leave(" = -EKEYREJECTED");
- return ERR_PTR(-EKEYREJECTED);
+ rxrpc_publish_service_conn(conn->params.peer, conn);
-enomem_peer:
- rxrpc_put_peer(peer);
-enomem:
- _leave(" = -ENOMEM");
- return ERR_PTR(-ENOMEM);
+ _net("CONNECTION new %d {%x}", conn->debug_id, conn->proto.cid);
}
/*
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 72f016cfaaf5..094720dd1eaf 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1,6 +1,6 @@
/* RxRPC packet reception
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -27,545 +27,908 @@
#include <net/net_namespace.h>
#include "ar-internal.h"
+static void rxrpc_proto_abort(const char *why,
+ struct rxrpc_call *call, rxrpc_seq_t seq)
+{
+ if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, EBADMSG)) {
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+}
+
/*
- * queue a packet for recvmsg to pass to userspace
- * - the caller must hold a lock on call->lock
- * - must not be called with interrupts disabled (sk_filter() disables BH's)
- * - eats the packet whether successful or not
- * - there must be just one reference to the packet, which the caller passes to
- * this function
+ * Do TCP-style congestion management [RFC 5681].
*/
-int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
- bool force, bool terminal)
+static void rxrpc_congestion_management(struct rxrpc_call *call,
+ struct sk_buff *skb,
+ struct rxrpc_ack_summary *summary)
{
- struct rxrpc_skb_priv *sp;
- struct rxrpc_sock *rx = call->socket;
- struct sock *sk;
- int ret;
+ enum rxrpc_congest_change change = rxrpc_cong_no_change;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ unsigned int cumulative_acks = call->cong_cumul_acks;
+ unsigned int cwnd = call->cong_cwnd;
+ bool resend = false;
+
+ summary->flight_size =
+ (call->tx_top - call->tx_hard_ack) - summary->nr_acks;
+
+ if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) {
+ summary->retrans_timeo = true;
+ call->cong_ssthresh = max_t(unsigned int,
+ summary->flight_size / 2, 2);
+ cwnd = 1;
+ if (cwnd > call->cong_ssthresh &&
+ call->cong_mode == RXRPC_CALL_SLOW_START) {
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_tstamp = skb->tstamp;
+ cumulative_acks = 0;
+ }
+ }
- _enter(",,%d,%d", force, terminal);
+ cumulative_acks += summary->nr_new_acks;
+ cumulative_acks += summary->nr_rot_new_acks;
+ if (cumulative_acks > 255)
+ cumulative_acks = 255;
+
+ summary->mode = call->cong_mode;
+ summary->cwnd = call->cong_cwnd;
+ summary->ssthresh = call->cong_ssthresh;
+ summary->cumulative_acks = cumulative_acks;
+ summary->dup_acks = call->cong_dup_acks;
+
+ switch (call->cong_mode) {
+ case RXRPC_CALL_SLOW_START:
+ if (summary->nr_nacks > 0)
+ goto packet_loss_detected;
+ if (summary->cumulative_acks > 0)
+ cwnd += 1;
+ if (cwnd > call->cong_ssthresh) {
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_tstamp = skb->tstamp;
+ }
+ goto out;
- ASSERT(!irqs_disabled());
+ case RXRPC_CALL_CONGEST_AVOIDANCE:
+ if (summary->nr_nacks > 0)
+ goto packet_loss_detected;
- sp = rxrpc_skb(skb);
- ASSERTCMP(sp->call, ==, call);
-
- /* if we've already posted the terminal message for a call, then we
- * don't post any more */
- if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
- _debug("already terminated");
- ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE);
- rxrpc_free_skb(skb);
- return 0;
- }
-
- sk = &rx->sk;
-
- if (!force) {
- /* cast skb->rcvbuf to unsigned... It's pointless, but
- * reduces number of warnings when compiling with -W
- * --ANK */
-// ret = -ENOBUFS;
-// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-// (unsigned int) sk->sk_rcvbuf)
-// goto out;
-
- ret = sk_filter(sk, skb);
- if (ret < 0)
+ /* We analyse the number of packets that get ACK'd per RTT
+ * period and increase the window if we managed to fill it.
+ */
+ if (call->peer->rtt_usage == 0)
goto out;
- }
+ if (ktime_before(skb->tstamp,
+ ktime_add_ns(call->cong_tstamp,
+ call->peer->rtt)))
+ goto out_no_clear_ca;
+ change = rxrpc_cong_rtt_window_end;
+ call->cong_tstamp = skb->tstamp;
+ if (cumulative_acks >= cwnd)
+ cwnd++;
+ goto out;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) &&
- !test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- call->socket->sk.sk_state != RXRPC_CLOSE) {
- skb->destructor = rxrpc_packet_destructor;
- skb->dev = NULL;
- skb->sk = sk;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ case RXRPC_CALL_PACKET_LOSS:
+ if (summary->nr_nacks == 0)
+ goto resume_normality;
- if (terminal) {
- _debug("<<<< TERMINAL MESSAGE >>>>");
- set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags);
+ if (summary->new_low_nack) {
+ change = rxrpc_cong_new_low_nack;
+ call->cong_dup_acks = 1;
+ if (call->cong_extra > 1)
+ call->cong_extra = 1;
+ goto send_extra_data;
}
- /* allow interception by a kernel service */
- if (skb->mark == RXRPC_SKB_MARK_NEW_CALL &&
- rx->notify_new_call) {
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- skb_queue_tail(&call->knlrecv_queue, skb);
- rx->notify_new_call(&rx->sk);
- } else if (call->notify_rx) {
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- skb_queue_tail(&call->knlrecv_queue, skb);
- call->notify_rx(&rx->sk, call, call->user_call_ID);
- } else {
- _net("post skb %p", skb);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- spin_unlock_bh(&sk->sk_receive_queue.lock);
+ call->cong_dup_acks++;
+ if (call->cong_dup_acks < 3)
+ goto send_extra_data;
+
+ change = rxrpc_cong_begin_retransmission;
+ call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT;
+ call->cong_ssthresh = max_t(unsigned int,
+ summary->flight_size / 2, 2);
+ cwnd = call->cong_ssthresh + 3;
+ call->cong_extra = 0;
+ call->cong_dup_acks = 0;
+ resend = true;
+ goto out;
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk);
+ case RXRPC_CALL_FAST_RETRANSMIT:
+ if (!summary->new_low_nack) {
+ if (summary->nr_new_acks == 0)
+ cwnd += 1;
+ call->cong_dup_acks++;
+ if (call->cong_dup_acks == 2) {
+ change = rxrpc_cong_retransmit_again;
+ call->cong_dup_acks = 0;
+ resend = true;
+ }
+ } else {
+ change = rxrpc_cong_progress;
+ cwnd = call->cong_ssthresh;
+ if (summary->nr_nacks == 0)
+ goto resume_normality;
}
- skb = NULL;
- } else {
- spin_unlock_bh(&sk->sk_receive_queue.lock);
+ goto out;
+
+ default:
+ BUG();
+ goto out;
}
- ret = 0;
+resume_normality:
+ change = rxrpc_cong_cleared_nacks;
+ call->cong_dup_acks = 0;
+ call->cong_extra = 0;
+ call->cong_tstamp = skb->tstamp;
+ if (cwnd <= call->cong_ssthresh)
+ call->cong_mode = RXRPC_CALL_SLOW_START;
+ else
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
out:
- rxrpc_free_skb(skb);
+ cumulative_acks = 0;
+out_no_clear_ca:
+ if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1)
+ cwnd = RXRPC_RXTX_BUFF_SIZE - 1;
+ call->cong_cwnd = cwnd;
+ call->cong_cumul_acks = cumulative_acks;
+ trace_rxrpc_congest(call, summary, sp->hdr.serial, change);
+ if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
+ rxrpc_queue_call(call);
+ return;
- _leave(" = %d", ret);
- return ret;
+packet_loss_detected:
+ change = rxrpc_cong_saw_nack;
+ call->cong_mode = RXRPC_CALL_PACKET_LOSS;
+ call->cong_dup_acks = 0;
+ goto send_extra_data;
+
+send_extra_data:
+ /* Send some previously unsent DATA if we have some to advance the ACK
+ * state.
+ */
+ if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
+ RXRPC_TX_ANNO_LAST ||
+ summary->nr_acks != call->tx_top - call->tx_hard_ack) {
+ call->cong_extra++;
+ wake_up(&call->waitq);
+ }
+ goto out_no_clear_ca;
}
/*
- * process a DATA packet, posting the packet to the appropriate queue
- * - eats the packet if successful
+ * Ping the other end to fill our RTT cache and to retrieve the rwind
+ * and MTU parameters.
*/
-static int rxrpc_fast_process_data(struct rxrpc_call *call,
- struct sk_buff *skb, u32 seq)
+static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb,
+ int skew)
{
- struct rxrpc_skb_priv *sp;
- bool terminal;
- int ret, ackbit, ack;
- u32 serial;
- u16 skew;
- u8 flags;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ ktime_t now = skb->tstamp;
- _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq);
+ if (call->peer->rtt_usage < 3 ||
+ ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial,
+ true, true,
+ rxrpc_propose_ack_ping_for_params);
+}
- sp = rxrpc_skb(skb);
- ASSERTCMP(sp->call, ==, NULL);
- flags = sp->hdr.flags;
- serial = sp->hdr.serial;
- skew = skb->priority;
+/*
+ * Apply a hard ACK by advancing the Tx window.
+ */
+static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
+ struct rxrpc_ack_summary *summary)
+{
+ struct sk_buff *skb, *list = NULL;
+ int ix;
+ u8 annotation;
+
+ if (call->acks_lowest_nak == call->tx_hard_ack) {
+ call->acks_lowest_nak = to;
+ } else if (before_eq(call->acks_lowest_nak, to)) {
+ summary->new_low_nack = true;
+ call->acks_lowest_nak = to;
+ }
spin_lock(&call->lock);
- if (call->state > RXRPC_CALL_COMPLETE)
- goto discard;
+ while (before(call->tx_hard_ack, to)) {
+ call->tx_hard_ack++;
+ ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK;
+ skb = call->rxtx_buffer[ix];
+ annotation = call->rxtx_annotations[ix];
+ rxrpc_see_skb(skb, rxrpc_skb_tx_rotated);
+ call->rxtx_buffer[ix] = NULL;
+ call->rxtx_annotations[ix] = 0;
+ skb->next = list;
+ list = skb;
+
+ if (annotation & RXRPC_TX_ANNO_LAST)
+ set_bit(RXRPC_CALL_TX_LAST, &call->flags);
+ if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK)
+ summary->nr_rot_new_acks++;
+ }
- ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post);
- ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv);
- ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten);
+ spin_unlock(&call->lock);
- if (seq < call->rx_data_post) {
- _debug("dup #%u [-%u]", seq, call->rx_data_post);
- ack = RXRPC_ACK_DUPLICATE;
- ret = -ENOBUFS;
- goto discard_and_ack;
- }
+ trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ?
+ rxrpc_transmit_rotate_last :
+ rxrpc_transmit_rotate));
+ wake_up(&call->waitq);
- /* we may already have the packet in the out of sequence queue */
- ackbit = seq - (call->rx_data_eaten + 1);
- ASSERTCMP(ackbit, >=, 0);
- if (__test_and_set_bit(ackbit, call->ackr_window)) {
- _debug("dup oos #%u [%u,%u]",
- seq, call->rx_data_eaten, call->rx_data_post);
- ack = RXRPC_ACK_DUPLICATE;
- goto discard_and_ack;
+ while (list) {
+ skb = list;
+ list = skb->next;
+ skb->next = NULL;
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
}
+}
- if (seq >= call->ackr_win_top) {
- _debug("exceed #%u [%u]", seq, call->ackr_win_top);
- __clear_bit(ackbit, call->ackr_window);
- ack = RXRPC_ACK_EXCEEDS_WINDOW;
- goto discard_and_ack;
- }
+/*
+ * End the transmission phase of a call.
+ *
+ * This occurs when we get an ACKALL packet, the first DATA packet of a reply,
+ * or a final ACK packet.
+ */
+static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
+ const char *abort_why)
+{
- if (seq == call->rx_data_expect) {
- clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags);
- call->rx_data_expect++;
- } else if (seq > call->rx_data_expect) {
- _debug("oos #%u [%u]", seq, call->rx_data_expect);
- call->rx_data_expect = seq + 1;
- if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) {
- ack = RXRPC_ACK_OUT_OF_SEQUENCE;
- goto enqueue_and_ack;
- }
- goto enqueue_packet;
- }
+ ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
- if (seq != call->rx_data_post) {
- _debug("ahead #%u [%u]", seq, call->rx_data_post);
- goto enqueue_packet;
- }
+ write_lock(&call->state_lock);
- if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags))
- goto protocol_error;
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ if (reply_begun)
+ call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+ else
+ call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
+ break;
- /* if the packet need security things doing to it, then it goes down
- * the slow path */
- if (call->conn->security_ix)
- goto enqueue_packet;
+ case RXRPC_CALL_SERVER_AWAIT_ACK:
+ __rxrpc_call_completed(call);
+ rxrpc_notify_socket(call);
+ break;
- sp->call = call;
- rxrpc_get_call_for_skb(call, skb);
- terminal = ((flags & RXRPC_LAST_PACKET) &&
- !(flags & RXRPC_CLIENT_INITIATED));
- ret = rxrpc_queue_rcv_skb(call, skb, false, terminal);
- if (ret < 0) {
- if (ret == -ENOMEM || ret == -ENOBUFS) {
- __clear_bit(ackbit, call->ackr_window);
- ack = RXRPC_ACK_NOSPACE;
- goto discard_and_ack;
- }
- goto out;
+ default:
+ goto bad_state;
}
- skb = NULL;
- sp = NULL;
-
- _debug("post #%u", seq);
- ASSERTCMP(call->rx_data_post, ==, seq);
- call->rx_data_post++;
+ write_unlock(&call->state_lock);
+ if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) {
+ rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, true,
+ rxrpc_propose_ack_client_tx_end);
+ trace_rxrpc_transmit(call, rxrpc_transmit_await_reply);
+ } else {
+ trace_rxrpc_transmit(call, rxrpc_transmit_end);
+ }
+ _leave(" = ok");
+ return true;
+
+bad_state:
+ write_unlock(&call->state_lock);
+ kdebug("end_tx %s", rxrpc_call_states[call->state]);
+ rxrpc_proto_abort(abort_why, call, call->tx_top);
+ return false;
+}
- if (flags & RXRPC_LAST_PACKET)
- set_bit(RXRPC_CALL_RCVD_LAST, &call->flags);
+/*
+ * Begin the reply reception phase of a call.
+ */
+static bool rxrpc_receiving_reply(struct rxrpc_call *call)
+{
+ struct rxrpc_ack_summary summary = { 0 };
+ rxrpc_seq_t top = READ_ONCE(call->tx_top);
+
+ if (call->ackr_reason) {
+ spin_lock_bh(&call->lock);
+ call->ackr_reason = 0;
+ call->resend_at = call->expire_at;
+ call->ack_at = call->expire_at;
+ spin_unlock_bh(&call->lock);
+ rxrpc_set_timer(call, rxrpc_timer_init_for_reply);
+ }
- /* if we've reached an out of sequence packet then we need to drain
- * that queue into the socket Rx queue now */
- if (call->rx_data_post == call->rx_first_oos) {
- _debug("drain rx oos now");
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events))
- rxrpc_queue_call(call);
- read_unlock(&call->state_lock);
+ if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags))
+ rxrpc_rotate_tx_window(call, top, &summary);
+ if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
+ rxrpc_proto_abort("TXL", call, top);
+ return false;
}
+ if (!rxrpc_end_tx_phase(call, true, "ETD"))
+ return false;
+ call->tx_phase = false;
+ return true;
+}
- spin_unlock(&call->lock);
- atomic_inc(&call->ackr_not_idle);
- rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial, false);
- _leave(" = 0 [posted]");
- return 0;
+/*
+ * Scan a jumbo packet to validate its structure and to work out how many
+ * subpackets it contains.
+ *
+ * A jumbo packet is a collection of consecutive packets glued together with
+ * little headers between that indicate how to change the initial header for
+ * each subpacket.
+ *
+ * RXRPC_JUMBO_PACKET must be set on all but the last subpacket - and all but
+ * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any
+ * size.
+ */
+static bool rxrpc_validate_jumbo(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ unsigned int offset = sp->offset;
+ unsigned int len = skb->len;
+ int nr_jumbo = 1;
+ u8 flags = sp->hdr.flags;
-protocol_error:
- ret = -EBADMSG;
-out:
- spin_unlock(&call->lock);
- _leave(" = %d", ret);
- return ret;
+ do {
+ nr_jumbo++;
+ if (len - offset < RXRPC_JUMBO_SUBPKTLEN)
+ goto protocol_error;
+ if (flags & RXRPC_LAST_PACKET)
+ goto protocol_error;
+ offset += RXRPC_JUMBO_DATALEN;
+ if (skb_copy_bits(skb, offset, &flags, 1) < 0)
+ goto protocol_error;
+ offset += sizeof(struct rxrpc_jumbo_header);
+ } while (flags & RXRPC_JUMBO_PACKET);
-discard_and_ack:
- _debug("discard and ACK packet %p", skb);
- __rxrpc_propose_ACK(call, ack, skew, serial, true);
-discard:
- spin_unlock(&call->lock);
- rxrpc_free_skb(skb);
- _leave(" = 0 [discarded]");
- return 0;
+ sp->nr_jumbo = nr_jumbo;
+ return true;
-enqueue_and_ack:
- __rxrpc_propose_ACK(call, ack, skew, serial, true);
-enqueue_packet:
- _net("defer skb %p", skb);
- spin_unlock(&call->lock);
- skb_queue_tail(&call->rx_queue, skb);
- atomic_inc(&call->ackr_not_idle);
- read_lock(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD)
- rxrpc_queue_call(call);
- read_unlock(&call->state_lock);
- _leave(" = 0 [queued]");
- return 0;
+protocol_error:
+ return false;
}
/*
- * assume an implicit ACKALL of the transmission phase of a client socket upon
- * reception of the first reply packet
+ * Handle reception of a duplicate packet.
+ *
+ * We have to take care to avoid an attack here whereby we're given a series of
+ * jumbograms, each with a sequence number one before the preceding one and
+ * filled up to maximum UDP size. If they never send us the first packet in
+ * the sequence, they can cause us to have to hold on to around 2MiB of kernel
+ * space until the call times out.
+ *
+ * We limit the space usage by only accepting three duplicate jumbo packets per
+ * call. After that, we tell the other side we're no longer accepting jumbos
+ * (that information is encoded in the ACK packet).
*/
-static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
+static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq,
+ u8 annotation, bool *_jumbo_bad)
{
- write_lock_bh(&call->state_lock);
-
- switch (call->state) {
- case RXRPC_CALL_CLIENT_AWAIT_REPLY:
- call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
- call->acks_latest = serial;
-
- _debug("implicit ACKALL %%%u", call->acks_latest);
- set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events);
- write_unlock_bh(&call->state_lock);
-
- if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
- clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
- clear_bit(RXRPC_CALL_EV_RESEND, &call->events);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- }
- break;
+ /* Discard normal packets that are duplicates. */
+ if (annotation == 0)
+ return;
- default:
- write_unlock_bh(&call->state_lock);
- break;
+ /* Skip jumbo subpackets that are duplicates. When we've had three or
+ * more partially duplicate jumbo packets, we refuse to take any more
+ * jumbos for this call.
+ */
+ if (!*_jumbo_bad) {
+ call->nr_jumbo_bad++;
+ *_jumbo_bad = true;
}
}
/*
- * post an incoming packet to the nominated call to deal with
- * - must get rid of the sk_buff, either by freeing it or by queuing it
+ * Process a DATA packet, adding the packet to the Rx ring.
*/
-void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
+static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
+ u16 skew)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- __be32 wtmp;
- u32 abort_code;
+ unsigned int offset = sp->offset;
+ unsigned int ix;
+ rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0;
+ rxrpc_seq_t seq = sp->hdr.seq, hard_ack;
+ bool immediate_ack = false, jumbo_bad = false, queued;
+ u16 len;
+ u8 ack = 0, flags, annotation = 0;
- _enter("%p,%p", call, skb);
+ _enter("{%u,%u},{%u,%u}",
+ call->rx_hard_ack, call->rx_top, skb->len, seq);
- ASSERT(!irqs_disabled());
+ _proto("Rx DATA %%%u { #%u f=%02x }",
+ sp->hdr.serial, seq, sp->hdr.flags);
-#if 0 // INJECT RX ERROR
- if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
- static int skip = 0;
- if (++skip == 3) {
- printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n");
- skip = 0;
- goto free_packet;
- }
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ /* Received data implicitly ACKs all of the request packets we sent
+ * when we're acting as a client.
+ */
+ if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
+ call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
+ !rxrpc_receiving_reply(call))
+ return;
+
+ call->ackr_prev_seq = seq;
+
+ hard_ack = READ_ONCE(call->rx_hard_ack);
+ if (after(seq, hard_ack + call->rx_winsize)) {
+ ack = RXRPC_ACK_EXCEEDS_WINDOW;
+ ack_serial = serial;
+ goto ack;
}
-#endif
- /* request ACK generation for any ACK or DATA packet that requests
- * it */
- if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
- _proto("ACK Requested on %%%u", sp->hdr.serial);
- rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED,
- skb->priority, sp->hdr.serial, false);
+ flags = sp->hdr.flags;
+ if (flags & RXRPC_JUMBO_PACKET) {
+ if (call->nr_jumbo_bad > 3) {
+ ack = RXRPC_ACK_NOSPACE;
+ ack_serial = serial;
+ goto ack;
+ }
+ annotation = 1;
}
- switch (sp->hdr.type) {
- case RXRPC_PACKET_TYPE_ABORT:
- _debug("abort");
+next_subpacket:
+ queued = false;
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ len = skb->len;
+ if (flags & RXRPC_JUMBO_PACKET)
+ len = RXRPC_JUMBO_DATALEN;
+
+ if (flags & RXRPC_LAST_PACKET) {
+ if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ seq != call->rx_top)
+ return rxrpc_proto_abort("LSN", call, seq);
+ } else {
+ if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ after_eq(seq, call->rx_top))
+ return rxrpc_proto_abort("LSA", call, seq);
+ }
- if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
- goto protocol_error;
+ if (before_eq(seq, hard_ack)) {
+ ack = RXRPC_ACK_DUPLICATE;
+ ack_serial = serial;
+ goto skip;
+ }
- abort_code = ntohl(wtmp);
- _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
+ if (flags & RXRPC_REQUEST_ACK && !ack) {
+ ack = RXRPC_ACK_REQUESTED;
+ ack_serial = serial;
+ }
- if (__rxrpc_set_call_completion(call,
- RXRPC_CALL_REMOTELY_ABORTED,
- abort_code, ECONNABORTED)) {
- set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
- rxrpc_queue_call(call);
+ if (call->rxtx_buffer[ix]) {
+ rxrpc_input_dup_data(call, seq, annotation, &jumbo_bad);
+ if (ack != RXRPC_ACK_DUPLICATE) {
+ ack = RXRPC_ACK_DUPLICATE;
+ ack_serial = serial;
}
- goto free_packet;
+ immediate_ack = true;
+ goto skip;
+ }
- case RXRPC_PACKET_TYPE_BUSY:
- _proto("Rx BUSY %%%u", sp->hdr.serial);
+ /* Queue the packet. We use a couple of memory barriers here as need
+ * to make sure that rx_top is perceived to be set after the buffer
+ * pointer and that the buffer pointer is set after the annotation and
+ * the skb data.
+ *
+ * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window()
+ * and also rxrpc_fill_out_ack().
+ */
+ rxrpc_get_skb(skb, rxrpc_skb_rx_got);
+ call->rxtx_annotations[ix] = annotation;
+ smp_wmb();
+ call->rxtx_buffer[ix] = skb;
+ if (after(seq, call->rx_top)) {
+ smp_store_release(&call->rx_top, seq);
+ } else if (before(seq, call->rx_top)) {
+ /* Send an immediate ACK if we fill in a hole */
+ if (!ack) {
+ ack = RXRPC_ACK_DELAY;
+ ack_serial = serial;
+ }
+ immediate_ack = true;
+ }
+ if (flags & RXRPC_LAST_PACKET) {
+ set_bit(RXRPC_CALL_RX_LAST, &call->flags);
+ trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq);
+ } else {
+ trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq);
+ }
+ queued = true;
- if (rxrpc_conn_is_service(call->conn))
- goto protocol_error;
+ if (after_eq(seq, call->rx_expect_next)) {
+ if (after(seq, call->rx_expect_next)) {
+ _net("OOS %u > %u", seq, call->rx_expect_next);
+ ack = RXRPC_ACK_OUT_OF_SEQUENCE;
+ ack_serial = serial;
+ }
+ call->rx_expect_next = seq + 1;
+ }
- write_lock_bh(&call->state_lock);
- switch (call->state) {
- case RXRPC_CALL_CLIENT_SEND_REQUEST:
- __rxrpc_set_call_completion(call,
- RXRPC_CALL_SERVER_BUSY,
- 0, EBUSY);
- set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
- rxrpc_queue_call(call);
- case RXRPC_CALL_SERVER_BUSY:
- goto free_packet_unlock;
- default:
- goto protocol_error_locked;
+skip:
+ offset += len;
+ if (flags & RXRPC_JUMBO_PACKET) {
+ if (skb_copy_bits(skb, offset, &flags, 1) < 0)
+ return rxrpc_proto_abort("XJF", call, seq);
+ offset += sizeof(struct rxrpc_jumbo_header);
+ seq++;
+ serial++;
+ annotation++;
+ if (flags & RXRPC_JUMBO_PACKET)
+ annotation |= RXRPC_RX_ANNO_JLAST;
+ if (after(seq, hard_ack + call->rx_winsize)) {
+ ack = RXRPC_ACK_EXCEEDS_WINDOW;
+ ack_serial = serial;
+ if (!jumbo_bad) {
+ call->nr_jumbo_bad++;
+ jumbo_bad = true;
+ }
+ goto ack;
}
- default:
- _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
- goto protocol_error;
+ _proto("Rx DATA Jumbo %%%u", serial);
+ goto next_subpacket;
+ }
- case RXRPC_PACKET_TYPE_DATA:
- _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
+ if (queued && flags & RXRPC_LAST_PACKET && !ack) {
+ ack = RXRPC_ACK_DELAY;
+ ack_serial = serial;
+ }
- if (sp->hdr.seq == 0)
- goto protocol_error;
+ack:
+ if (ack)
+ rxrpc_propose_ACK(call, ack, skew, ack_serial,
+ immediate_ack, true,
+ rxrpc_propose_ack_input_data);
+
+ if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1)
+ rxrpc_notify_socket(call);
+ _leave(" [queued]");
+}
+
+/*
+ * Process a requested ACK.
+ */
+static void rxrpc_input_requested_ack(struct rxrpc_call *call,
+ ktime_t resp_time,
+ rxrpc_serial_t orig_serial,
+ rxrpc_serial_t ack_serial)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ ktime_t sent_at;
+ int ix;
+
+ for (ix = 0; ix < RXRPC_RXTX_BUFF_SIZE; ix++) {
+ skb = call->rxtx_buffer[ix];
+ if (!skb)
+ continue;
+
+ sp = rxrpc_skb(skb);
+ if (sp->hdr.serial != orig_serial)
+ continue;
+ smp_rmb();
+ sent_at = skb->tstamp;
+ goto found;
+ }
+ return;
- call->ackr_prev_seq = sp->hdr.seq;
+found:
+ rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_requested_ack,
+ orig_serial, ack_serial, sent_at, resp_time);
+}
- /* received data implicitly ACKs all of the request packets we
- * sent when we're acting as a client */
- if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
- rxrpc_assume_implicit_ackall(call, sp->hdr.serial);
+/*
+ * Process a ping response.
+ */
+static void rxrpc_input_ping_response(struct rxrpc_call *call,
+ ktime_t resp_time,
+ rxrpc_serial_t orig_serial,
+ rxrpc_serial_t ack_serial)
+{
+ rxrpc_serial_t ping_serial;
+ ktime_t ping_time;
- switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) {
- case 0:
- skb = NULL;
- goto done;
+ ping_time = call->ackr_ping_time;
+ smp_rmb();
+ ping_serial = call->ackr_ping;
- default:
- BUG();
+ if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
+ before(orig_serial, ping_serial))
+ return;
+ clear_bit(RXRPC_CALL_PINGING, &call->flags);
+ if (after(orig_serial, ping_serial))
+ return;
- /* data packet received beyond the last packet */
- case -EBADMSG:
- goto protocol_error;
- }
+ rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_ping_response,
+ orig_serial, ack_serial, ping_time, resp_time);
+}
- case RXRPC_PACKET_TYPE_ACKALL:
- case RXRPC_PACKET_TYPE_ACK:
- /* ACK processing is done in process context */
- read_lock_bh(&call->state_lock);
- if (call->state < RXRPC_CALL_DEAD) {
- skb_queue_tail(&call->rx_queue, skb);
- rxrpc_queue_call(call);
- skb = NULL;
- }
- read_unlock_bh(&call->state_lock);
- goto free_packet;
+/*
+ * Process the extra information that may be appended to an ACK packet
+ */
+static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
+ struct rxrpc_ackinfo *ackinfo)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_peer *peer;
+ unsigned int mtu;
+ u32 rwind = ntohl(ackinfo->rwind);
+
+ _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
+ sp->hdr.serial,
+ ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU),
+ rwind, ntohl(ackinfo->jumbo_max));
+
+ if (rwind > RXRPC_RXTX_BUFF_SIZE - 1)
+ rwind = RXRPC_RXTX_BUFF_SIZE - 1;
+ call->tx_winsize = rwind;
+
+ mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU));
+
+ peer = call->peer;
+ if (mtu < peer->maxdata) {
+ spin_lock_bh(&peer->lock);
+ peer->maxdata = mtu;
+ peer->mtu = mtu + peer->hdrsize;
+ spin_unlock_bh(&peer->lock);
+ _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
}
+}
-protocol_error:
- _debug("protocol error");
- write_lock_bh(&call->state_lock);
-protocol_error_locked:
- if (__rxrpc_abort_call(call, RX_PROTOCOL_ERROR, EPROTO))
- rxrpc_queue_call(call);
-free_packet_unlock:
- write_unlock_bh(&call->state_lock);
-free_packet:
- rxrpc_free_skb(skb);
-done:
- _leave("");
+/*
+ * Process individual soft ACKs.
+ *
+ * Each ACK in the array corresponds to one packet and can be either an ACK or
+ * a NAK. If we get find an explicitly NAK'd packet we resend immediately;
+ * packets that lie beyond the end of the ACK list are scheduled for resend by
+ * the timer on the basis that the peer might just not have processed them at
+ * the time the ACK was sent.
+ */
+static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks,
+ rxrpc_seq_t seq, int nr_acks,
+ struct rxrpc_ack_summary *summary)
+{
+ int ix;
+ u8 annotation, anno_type;
+
+ for (; nr_acks > 0; nr_acks--, seq++) {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ annotation = call->rxtx_annotations[ix];
+ anno_type = annotation & RXRPC_TX_ANNO_MASK;
+ annotation &= ~RXRPC_TX_ANNO_MASK;
+ switch (*acks++) {
+ case RXRPC_ACK_TYPE_ACK:
+ summary->nr_acks++;
+ if (anno_type == RXRPC_TX_ANNO_ACK)
+ continue;
+ summary->nr_new_acks++;
+ call->rxtx_annotations[ix] =
+ RXRPC_TX_ANNO_ACK | annotation;
+ break;
+ case RXRPC_ACK_TYPE_NACK:
+ if (!summary->nr_nacks &&
+ call->acks_lowest_nak != seq) {
+ call->acks_lowest_nak = seq;
+ summary->new_low_nack = true;
+ }
+ summary->nr_nacks++;
+ if (anno_type == RXRPC_TX_ANNO_NAK)
+ continue;
+ summary->nr_new_nacks++;
+ if (anno_type == RXRPC_TX_ANNO_RETRANS)
+ continue;
+ call->rxtx_annotations[ix] =
+ RXRPC_TX_ANNO_NAK | annotation;
+ break;
+ default:
+ return rxrpc_proto_abort("SFT", call, 0);
+ }
+ }
}
/*
- * split up a jumbo data packet
+ * Process an ACK packet.
+ *
+ * ack.firstPacket is the sequence number of the first soft-ACK'd/NAK'd packet
+ * in the ACK array. Anything before that is hard-ACK'd and may be discarded.
+ *
+ * A hard-ACK means that a packet has been processed and may be discarded; a
+ * soft-ACK means that the packet may be discarded and retransmission
+ * requested. A phase is complete when all packets are hard-ACK'd.
*/
-static void rxrpc_process_jumbo_packet(struct rxrpc_call *call,
- struct sk_buff *jumbo)
+static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
+ u16 skew)
{
- struct rxrpc_jumbo_header jhdr;
- struct rxrpc_skb_priv *sp;
- struct sk_buff *part;
+ struct rxrpc_ack_summary summary = { 0 };
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ union {
+ struct rxrpc_ackpacket ack;
+ struct rxrpc_ackinfo info;
+ u8 acks[RXRPC_MAXACKS];
+ } buf;
+ rxrpc_serial_t acked_serial;
+ rxrpc_seq_t first_soft_ack, hard_ack;
+ int nr_acks, offset;
+
+ _enter("");
+
+ if (skb_copy_bits(skb, sp->offset, &buf.ack, sizeof(buf.ack)) < 0) {
+ _debug("extraction failure");
+ return rxrpc_proto_abort("XAK", call, 0);
+ }
+ sp->offset += sizeof(buf.ack);
+
+ acked_serial = ntohl(buf.ack.serial);
+ first_soft_ack = ntohl(buf.ack.firstPacket);
+ hard_ack = first_soft_ack - 1;
+ nr_acks = buf.ack.nAcks;
+ summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ?
+ buf.ack.reason : RXRPC_ACK__INVALID);
+
+ trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks);
+
+ _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
+ sp->hdr.serial,
+ ntohs(buf.ack.maxSkew),
+ first_soft_ack,
+ ntohl(buf.ack.previousPacket),
+ acked_serial,
+ rxrpc_ack_names[summary.ack_reason],
+ buf.ack.nAcks);
+
+ if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE)
+ rxrpc_input_ping_response(call, skb->tstamp, acked_serial,
+ sp->hdr.serial);
+ if (buf.ack.reason == RXRPC_ACK_REQUESTED)
+ rxrpc_input_requested_ack(call, skb->tstamp, acked_serial,
+ sp->hdr.serial);
+
+ if (buf.ack.reason == RXRPC_ACK_PING) {
+ _proto("Rx ACK %%%u PING Request", sp->hdr.serial);
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
+ skew, sp->hdr.serial, true, true,
+ rxrpc_propose_ack_respond_to_ping);
+ } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
+ rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED,
+ skew, sp->hdr.serial, true, true,
+ rxrpc_propose_ack_respond_to_ack);
+ }
- _enter(",{%u,%u}", jumbo->data_len, jumbo->len);
+ offset = sp->offset + nr_acks + 3;
+ if (skb->len >= offset + sizeof(buf.info)) {
+ if (skb_copy_bits(skb, offset, &buf.info, sizeof(buf.info)) < 0)
+ return rxrpc_proto_abort("XAI", call, 0);
+ rxrpc_input_ackinfo(call, skb, &buf.info);
+ }
- sp = rxrpc_skb(jumbo);
+ if (first_soft_ack == 0)
+ return rxrpc_proto_abort("AK0", call, 0);
- do {
- sp->hdr.flags &= ~RXRPC_JUMBO_PACKET;
-
- /* make a clone to represent the first subpacket in what's left
- * of the jumbo packet */
- part = skb_clone(jumbo, GFP_ATOMIC);
- if (!part) {
- /* simply ditch the tail in the event of ENOMEM */
- pskb_trim(jumbo, RXRPC_JUMBO_DATALEN);
- break;
- }
- rxrpc_new_skb(part);
+ /* Ignore ACKs unless we are or have just been transmitting. */
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ case RXRPC_CALL_SERVER_SEND_REPLY:
+ case RXRPC_CALL_SERVER_AWAIT_ACK:
+ break;
+ default:
+ return;
+ }
- pskb_trim(part, RXRPC_JUMBO_DATALEN);
+ /* Discard any out-of-order or duplicate ACKs. */
+ if (before_eq(sp->hdr.serial, call->acks_latest)) {
+ _debug("discard ACK %d <= %d",
+ sp->hdr.serial, call->acks_latest);
+ return;
+ }
+ call->acks_latest_ts = skb->tstamp;
+ call->acks_latest = sp->hdr.serial;
+
+ if (before(hard_ack, call->tx_hard_ack) ||
+ after(hard_ack, call->tx_top))
+ return rxrpc_proto_abort("AKW", call, 0);
+ if (nr_acks > call->tx_top - hard_ack)
+ return rxrpc_proto_abort("AKN", call, 0);
+
+ if (after(hard_ack, call->tx_hard_ack))
+ rxrpc_rotate_tx_window(call, hard_ack, &summary);
+
+ if (nr_acks > 0) {
+ if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0)
+ return rxrpc_proto_abort("XSA", call, 0);
+ rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks,
+ &summary);
+ }
- if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN))
- goto protocol_error;
+ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
+ rxrpc_end_tx_phase(call, false, "ETA");
+ return;
+ }
- if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0)
- goto protocol_error;
- if (!pskb_pull(jumbo, sizeof(jhdr)))
- BUG();
+ if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
+ RXRPC_TX_ANNO_LAST &&
+ summary.nr_acks == call->tx_top - hard_ack)
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial,
+ false, true,
+ rxrpc_propose_ack_ping_for_lost_reply);
- sp->hdr.seq += 1;
- sp->hdr.serial += 1;
- sp->hdr.flags = jhdr.flags;
- sp->hdr._rsvd = ntohs(jhdr._rsvd);
+ return rxrpc_congestion_management(call, skb, &summary);
+}
- _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1);
+/*
+ * Process an ACKALL packet.
+ */
+static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_ack_summary summary = { 0 };
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- rxrpc_fast_process_packet(call, part);
- part = NULL;
+ _proto("Rx ACKALL %%%u", sp->hdr.serial);
- } while (sp->hdr.flags & RXRPC_JUMBO_PACKET);
+ rxrpc_rotate_tx_window(call, call->tx_top, &summary);
+ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags))
+ rxrpc_end_tx_phase(call, false, "ETL");
+}
- rxrpc_fast_process_packet(call, jumbo);
- _leave("");
- return;
+/*
+ * Process an ABORT packet.
+ */
+static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ __be32 wtmp;
+ u32 abort_code = RX_CALL_DEAD;
-protocol_error:
- _debug("protocol error");
- rxrpc_free_skb(part);
- rxrpc_free_skb(jumbo);
- if (rxrpc_abort_call(call, RX_PROTOCOL_ERROR, EPROTO))
- rxrpc_queue_call(call);
- _leave("");
+ _enter("");
+
+ if (skb->len >= 4 &&
+ skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) >= 0)
+ abort_code = ntohl(wtmp);
+
+ _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
+
+ if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
+ abort_code, ECONNABORTED))
+ rxrpc_notify_socket(call);
}
/*
- * post an incoming packet to the appropriate call/socket to deal with
- * - must get rid of the sk_buff, either by freeing it or by queuing it
+ * Process an incoming call packet.
*/
-static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
- struct sk_buff *skb)
+static void rxrpc_input_call_packet(struct rxrpc_call *call,
+ struct sk_buff *skb, u16 skew)
{
- struct rxrpc_skb_priv *sp;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
_enter("%p,%p", call, skb);
- sp = rxrpc_skb(skb);
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_DATA:
+ rxrpc_input_data(call, skb, skew);
+ break;
- _debug("extant call [%d]", call->state);
+ case RXRPC_PACKET_TYPE_ACK:
+ rxrpc_input_ack(call, skb, skew);
+ break;
- read_lock(&call->state_lock);
- switch (call->state) {
- case RXRPC_CALL_DEAD:
- goto dead_call;
-
- case RXRPC_CALL_COMPLETE:
- switch (call->completion) {
- case RXRPC_CALL_LOCALLY_ABORTED:
- if (!test_and_set_bit(RXRPC_CALL_EV_ABORT,
- &call->events)) {
- rxrpc_queue_call(call);
- goto free_unlock;
- }
- default:
- goto dead_call;
- case RXRPC_CALL_SUCCEEDED:
- if (rxrpc_conn_is_service(call->conn))
- goto dead_call;
- goto resend_final_ack;
- }
+ case RXRPC_PACKET_TYPE_BUSY:
+ _proto("Rx BUSY %%%u", sp->hdr.serial);
- case RXRPC_CALL_CLIENT_FINAL_ACK:
- goto resend_final_ack;
+ /* Just ignore BUSY packets from the server; the retry and
+ * lifespan timers will take care of business. BUSY packets
+ * from the client don't make sense.
+ */
+ break;
+
+ case RXRPC_PACKET_TYPE_ABORT:
+ rxrpc_input_abort(call, skb);
+ break;
+
+ case RXRPC_PACKET_TYPE_ACKALL:
+ rxrpc_input_ackall(call, skb);
+ break;
default:
+ _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
break;
}
- read_unlock(&call->state_lock);
- rxrpc_get_call(call);
-
- if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
- sp->hdr.flags & RXRPC_JUMBO_PACKET)
- rxrpc_process_jumbo_packet(call, skb);
- else
- rxrpc_fast_process_packet(call, skb);
-
- rxrpc_put_call(call);
- goto done;
-
-resend_final_ack:
- _debug("final ack again");
- rxrpc_get_call(call);
- set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
- rxrpc_queue_call(call);
- goto free_unlock;
-
-dead_call:
- if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
- skb->priority = RX_CALL_DEAD;
- rxrpc_reject_packet(call->conn->params.local, skb);
- goto unlock;
- }
-free_unlock:
- rxrpc_free_skb(skb);
-unlock:
- read_unlock(&call->state_lock);
-done:
_leave("");
}
@@ -597,6 +960,17 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
}
/*
+ * put a packet up for transport-level abort
+ */
+static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
+{
+ CHECK_SLAB_OKAY(&local->usage);
+
+ skb_queue_tail(&local->reject_queue, skb);
+ rxrpc_queue_local(local);
+}
+
+/*
* Extract the wire header from a packet and translate the byte order.
*/
static noinline
@@ -607,8 +981,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
/* dig out the RxRPC connection details */
if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0)
return -EBADMSG;
- if (!pskb_pull(skb, sizeof(whdr)))
- BUG();
memset(sp, 0, sizeof(*sp));
sp->hdr.epoch = ntohl(whdr.epoch);
@@ -622,6 +994,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
sp->hdr.securityIndex = whdr.securityIndex;
sp->hdr._rsvd = ntohs(whdr._rsvd);
sp->hdr.serviceId = ntohs(whdr.serviceId);
+ sp->offset = sizeof(whdr);
return 0;
}
@@ -633,19 +1006,22 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
* shut down and the local endpoint from going away, thus sk_user_data will not
* be cleared until this function returns.
*/
-void rxrpc_data_ready(struct sock *sk)
+void rxrpc_data_ready(struct sock *udp_sk)
{
struct rxrpc_connection *conn;
+ struct rxrpc_channel *chan;
+ struct rxrpc_call *call;
struct rxrpc_skb_priv *sp;
- struct rxrpc_local *local = sk->sk_user_data;
+ struct rxrpc_local *local = udp_sk->sk_user_data;
struct sk_buff *skb;
+ unsigned int channel;
int ret, skew;
- _enter("%p", sk);
+ _enter("%p", udp_sk);
ASSERT(!irqs_disabled());
- skb = skb_recv_datagram(sk, 0, 1, &ret);
+ skb = skb_recv_datagram(udp_sk, 0, 1, &ret);
if (!skb) {
if (ret == -EAGAIN)
return;
@@ -653,13 +1029,13 @@ void rxrpc_data_ready(struct sock *sk)
return;
}
- rxrpc_new_skb(skb);
+ rxrpc_new_skb(skb, rxrpc_skb_rx_received);
_net("recv skb %p", skb);
/* we'll probably need to checksum it (didn't call sock_recvmsg) */
if (skb_checksum_complete(skb)) {
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
__UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0);
_leave(" [CSUM failed]");
return;
@@ -673,13 +1049,21 @@ void rxrpc_data_ready(struct sock *sk)
skb_orphan(skb);
sp = rxrpc_skb(skb);
- _net("Rx UDP packet from %08x:%04hu",
- ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
-
/* dig out the RxRPC connection details */
if (rxrpc_extract_header(sp, skb) < 0)
goto bad_message;
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+ if ((lose++ & 7) == 7) {
+ trace_rxrpc_rx_lose(sp);
+ rxrpc_lose_skb(skb, rxrpc_skb_rx_lost);
+ return;
+ }
+ }
+
+ trace_rxrpc_rx_packet(sp);
+
_net("Rx RxRPC %s ep=%x call=%x:%x",
sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
@@ -690,110 +1074,125 @@ void rxrpc_data_ready(struct sock *sk)
goto bad_message;
}
- if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) {
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_VERSION:
rxrpc_post_packet_to_local(local, skb);
goto out;
- }
- if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
- (sp->hdr.callNumber == 0 || sp->hdr.seq == 0))
- goto bad_message;
+ case RXRPC_PACKET_TYPE_BUSY:
+ if (sp->hdr.flags & RXRPC_CLIENT_INITIATED)
+ goto discard;
+
+ case RXRPC_PACKET_TYPE_DATA:
+ if (sp->hdr.callNumber == 0)
+ goto bad_message;
+ if (sp->hdr.flags & RXRPC_JUMBO_PACKET &&
+ !rxrpc_validate_jumbo(skb))
+ goto bad_message;
+ break;
+ }
rcu_read_lock();
conn = rxrpc_find_connection_rcu(local, skb);
- if (!conn) {
- skb->priority = 0;
- goto cant_route_call;
- }
+ if (conn) {
+ if (sp->hdr.securityIndex != conn->security_ix)
+ goto wrong_security;
- /* Note the serial number skew here */
- skew = (int)sp->hdr.serial - (int)conn->hi_serial;
- if (skew >= 0) {
- if (skew > 0)
- conn->hi_serial = sp->hdr.serial;
- skb->priority = 0;
- } else {
- skew = -skew;
- skb->priority = min(skew, 65535);
- }
+ if (sp->hdr.callNumber == 0) {
+ /* Connection-level packet */
+ _debug("CONN %p {%d}", conn, conn->debug_id);
+ rxrpc_post_packet_to_conn(conn, skb);
+ goto out_unlock;
+ }
+
+ /* Note the serial number skew here */
+ skew = (int)sp->hdr.serial - (int)conn->hi_serial;
+ if (skew >= 0) {
+ if (skew > 0)
+ conn->hi_serial = sp->hdr.serial;
+ } else {
+ skew = -skew;
+ skew = min(skew, 65535);
+ }
- if (sp->hdr.callNumber == 0) {
- /* Connection-level packet */
- _debug("CONN %p {%d}", conn, conn->debug_id);
- rxrpc_post_packet_to_conn(conn, skb);
- goto out_unlock;
- } else {
/* Call-bound packets are routed by connection channel. */
- unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK;
- struct rxrpc_channel *chan = &conn->channels[channel];
- struct rxrpc_call *call;
+ channel = sp->hdr.cid & RXRPC_CHANNELMASK;
+ chan = &conn->channels[channel];
/* Ignore really old calls */
if (sp->hdr.callNumber < chan->last_call)
goto discard_unlock;
if (sp->hdr.callNumber == chan->last_call) {
- /* For the previous service call, if completed
- * successfully, we discard all further packets.
+ /* For the previous service call, if completed successfully, we
+ * discard all further packets.
*/
if (rxrpc_conn_is_service(conn) &&
(chan->last_type == RXRPC_PACKET_TYPE_ACK ||
sp->hdr.type == RXRPC_PACKET_TYPE_ABORT))
goto discard_unlock;
- /* But otherwise we need to retransmit the final packet
- * from data cached in the connection record.
+ /* But otherwise we need to retransmit the final packet from
+ * data cached in the connection record.
*/
rxrpc_post_packet_to_conn(conn, skb);
goto out_unlock;
}
call = rcu_dereference(chan->call);
- if (!call || atomic_read(&call->usage) == 0)
- goto cant_route_call;
+ } else {
+ skew = 0;
+ call = NULL;
+ }
- rxrpc_see_call(call);
- rxrpc_post_packet_to_call(call, skb);
- goto out_unlock;
+ if (!call || atomic_read(&call->usage) == 0) {
+ if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) ||
+ sp->hdr.callNumber == 0 ||
+ sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
+ goto bad_message_unlock;
+ if (sp->hdr.seq != 1)
+ goto discard_unlock;
+ call = rxrpc_new_incoming_call(local, conn, skb);
+ if (!call) {
+ rcu_read_unlock();
+ goto reject_packet;
+ }
+ rxrpc_send_ping(call, skb, skew);
}
+ rxrpc_input_call_packet(call, skb, skew);
+ goto discard_unlock;
+
discard_unlock:
- rxrpc_free_skb(skb);
-out_unlock:
rcu_read_unlock();
+discard:
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
out:
+ trace_rxrpc_rx_done(0, 0);
return;
-cant_route_call:
+out_unlock:
rcu_read_unlock();
+ goto out;
- _debug("can't route call");
- if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
- sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
- if (sp->hdr.seq == 1) {
- _debug("first packet");
- skb_queue_tail(&local->accept_queue, skb);
- rxrpc_queue_work(&local->processor);
- _leave(" [incoming]");
- return;
- }
- skb->priority = RX_INVALID_OPERATION;
- } else {
- skb->priority = RX_CALL_DEAD;
- }
-
- if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
- _debug("reject type %d",sp->hdr.type);
- rxrpc_reject_packet(local, skb);
- } else {
- rxrpc_free_skb(skb);
- }
- _leave(" [no call]");
- return;
+wrong_security:
+ rcu_read_unlock();
+ trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ RXKADINCONSISTENCY, EBADMSG);
+ skb->priority = RXKADINCONSISTENCY;
+ goto post_abort;
+bad_message_unlock:
+ rcu_read_unlock();
bad_message:
+ trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ RX_PROTOCOL_ERROR, EBADMSG);
skb->priority = RX_PROTOCOL_ERROR;
+post_abort:
+ skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
+reject_packet:
+ trace_rxrpc_rx_done(skb->mark, skb->priority);
rxrpc_reject_packet(local, skb);
_leave(" [badmsg]");
}
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index c21ad213b337..7d4375e557e6 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -23,31 +23,36 @@ static int none_prime_packet_security(struct rxrpc_connection *conn)
}
static int none_secure_packet(struct rxrpc_call *call,
- struct sk_buff *skb,
- size_t data_size,
- void *sechdr)
+ struct sk_buff *skb,
+ size_t data_size,
+ void *sechdr)
{
return 0;
}
-static int none_verify_packet(struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 *_abort_code)
+static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int offset, unsigned int len,
+ rxrpc_seq_t seq, u16 expected_cksum)
{
return 0;
}
+static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len)
+{
+}
+
static int none_respond_to_challenge(struct rxrpc_connection *conn,
- struct sk_buff *skb,
- u32 *_abort_code)
+ struct sk_buff *skb,
+ u32 *_abort_code)
{
*_abort_code = RX_PROTOCOL_ERROR;
return -EPROTO;
}
static int none_verify_response(struct rxrpc_connection *conn,
- struct sk_buff *skb,
- u32 *_abort_code)
+ struct sk_buff *skb,
+ u32 *_abort_code)
{
*_abort_code = RX_PROTOCOL_ERROR;
return -EPROTO;
@@ -78,6 +83,7 @@ const struct rxrpc_security rxrpc_no_security = {
.prime_packet_security = none_prime_packet_security,
.secure_packet = none_secure_packet,
.verify_packet = none_verify_packet,
+ .locate_data = none_locate_data,
.respond_to_challenge = none_respond_to_challenge,
.verify_response = none_verify_response,
.clear = none_clear,
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
index bcc6593b4cdb..190f68bd9e27 100644
--- a/net/rxrpc/local_event.c
+++ b/net/rxrpc/local_event.c
@@ -15,8 +15,6 @@
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
-#include <linux/udp.h>
-#include <linux/ip.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <generated/utsrelease.h>
@@ -33,7 +31,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
{
struct rxrpc_wire_header whdr;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct sockaddr_in sin;
+ struct sockaddr_rxrpc srx;
struct msghdr msg;
struct kvec iov[2];
size_t len;
@@ -41,12 +39,11 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
_enter("");
- sin.sin_family = AF_INET;
- sin.sin_port = udp_hdr(skb)->source;
- sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+ if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
+ return;
- msg.msg_name = &sin;
- msg.msg_namelen = sizeof(sin);
+ msg.msg_name = &srx.transport;
+ msg.msg_namelen = srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -93,12 +90,12 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
if (skb) {
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
_debug("{%d},{%u}", local->debug_id, sp->hdr.type);
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_VERSION:
- if (skb_copy_bits(skb, 0, &v, 1) < 0)
+ if (skb_copy_bits(skb, sp->offset, &v, 1) < 0)
return;
_proto("Rx VERSION { %02x }", v);
if (v == 0)
@@ -110,7 +107,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
break;
}
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
}
_leave("");
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index a753796fbe8f..e3fad80b0795 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -58,6 +58,17 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local,
memcmp(&local->srx.transport.sin.sin_addr,
&srx->transport.sin.sin_addr,
sizeof(struct in_addr));
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ /* If the choice of UDP6 port is left up to the transport, then
+ * the endpoint record doesn't match.
+ */
+ return ((u16 __force)local->srx.transport.sin6.sin6_port -
+ (u16 __force)srx->transport.sin6.sin6_port) ?:
+ memcmp(&local->srx.transport.sin6.sin6_addr,
+ &srx->transport.sin6.sin6_addr,
+ sizeof(struct in6_addr));
+#endif
default:
BUG();
}
@@ -75,9 +86,8 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx)
atomic_set(&local->usage, 1);
INIT_LIST_HEAD(&local->link);
INIT_WORK(&local->processor, rxrpc_local_processor);
- INIT_LIST_HEAD(&local->services);
+ INIT_HLIST_HEAD(&local->services);
init_rwsem(&local->defrag_sem);
- skb_queue_head_init(&local->accept_queue);
skb_queue_head_init(&local->reject_queue);
skb_queue_head_init(&local->event_queue);
local->client_conns = RB_ROOT;
@@ -101,11 +111,12 @@ static int rxrpc_open_socket(struct rxrpc_local *local)
struct sock *sock;
int ret, opt;
- _enter("%p{%d}", local, local->srx.transport_type);
+ _enter("%p{%d,%d}",
+ local, local->srx.transport_type, local->srx.transport.family);
/* create a socket to represent the local endpoint */
- ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type,
- IPPROTO_UDP, &local->socket);
+ ret = sock_create_kern(&init_net, local->srx.transport.family,
+ local->srx.transport_type, 0, &local->socket);
if (ret < 0) {
_leave(" = %d [socket]", ret);
return ret;
@@ -170,18 +181,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
long diff;
int ret;
- if (srx->transport.family == AF_INET) {
- _enter("{%d,%u,%pI4+%hu}",
- srx->transport_type,
- srx->transport.family,
- &srx->transport.sin.sin_addr,
- ntohs(srx->transport.sin.sin_port));
- } else {
- _enter("{%d,%u}",
- srx->transport_type,
- srx->transport.family);
- return ERR_PTR(-EAFNOSUPPORT);
- }
+ _enter("{%d,%d,%pISp}",
+ srx->transport_type, srx->transport.family, &srx->transport);
mutex_lock(&rxrpc_local_mutex);
@@ -234,13 +235,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
found:
mutex_unlock(&rxrpc_local_mutex);
- _net("LOCAL %s %d {%d,%u,%pI4+%hu}",
- age,
- local->debug_id,
- local->srx.transport_type,
- local->srx.transport.family,
- &local->srx.transport.sin.sin_addr,
- ntohs(local->srx.transport.sin.sin_port));
+ _net("LOCAL %s %d {%pISp}",
+ age, local->debug_id, &local->srx.transport);
_leave(" = %p", local);
return local;
@@ -296,7 +292,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
mutex_unlock(&rxrpc_local_mutex);
ASSERT(RB_EMPTY_ROOT(&local->client_conns));
- ASSERT(list_empty(&local->services));
+ ASSERT(hlist_empty(&local->services));
if (socket) {
local->socket = NULL;
@@ -308,7 +304,6 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
/* At this point, there should be no more packets coming in to the
* local endpoint.
*/
- rxrpc_purge_queue(&local->accept_queue);
rxrpc_purge_queue(&local->reject_queue);
rxrpc_purge_queue(&local->event_queue);
@@ -332,11 +327,6 @@ static void rxrpc_local_processor(struct work_struct *work)
if (atomic_read(&local->usage) == 0)
return rxrpc_local_destroyer(local);
- if (!skb_queue_empty(&local->accept_queue)) {
- rxrpc_accept_incoming_calls(local);
- again = true;
- }
-
if (!skb_queue_empty(&local->reject_queue)) {
rxrpc_reject_packets(local);
again = true;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 39e7cc37c392..aedb8978226d 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -50,7 +50,10 @@ unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
* limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
* packets.
*/
-unsigned int rxrpc_rx_window_size = 32;
+unsigned int rxrpc_rx_window_size = RXRPC_INIT_RX_WINDOW_SIZE;
+#if (RXRPC_RXTX_BUFF_SIZE - 1) < RXRPC_INIT_RX_WINDOW_SIZE
+#error Need to reduce RXRPC_INIT_RX_WINDOW_SIZE
+#endif
/*
* Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
@@ -65,9 +68,9 @@ unsigned int rxrpc_rx_mtu = 5692;
unsigned int rxrpc_rx_jumbo_max = 4;
/*
- * Time till packet resend (in jiffies).
+ * Time till packet resend (in milliseconds).
*/
-unsigned int rxrpc_resend_timeout = 4 * HZ;
+unsigned int rxrpc_resend_timeout = 4 * 1000;
const char *const rxrpc_pkts[] = {
"?00",
@@ -80,21 +83,153 @@ const s8 rxrpc_ack_priority[] = {
[RXRPC_ACK_DELAY] = 1,
[RXRPC_ACK_REQUESTED] = 2,
[RXRPC_ACK_IDLE] = 3,
- [RXRPC_ACK_PING_RESPONSE] = 4,
- [RXRPC_ACK_DUPLICATE] = 5,
- [RXRPC_ACK_OUT_OF_SEQUENCE] = 6,
- [RXRPC_ACK_EXCEEDS_WINDOW] = 7,
- [RXRPC_ACK_NOSPACE] = 8,
-};
-
-const char *rxrpc_acks(u8 reason)
-{
- static const char *const str[] = {
- "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
- "IDL", "-?-"
- };
-
- if (reason >= ARRAY_SIZE(str))
- reason = ARRAY_SIZE(str) - 1;
- return str[reason];
-}
+ [RXRPC_ACK_DUPLICATE] = 4,
+ [RXRPC_ACK_OUT_OF_SEQUENCE] = 5,
+ [RXRPC_ACK_EXCEEDS_WINDOW] = 6,
+ [RXRPC_ACK_NOSPACE] = 7,
+ [RXRPC_ACK_PING_RESPONSE] = 8,
+ [RXRPC_ACK_PING] = 9,
+};
+
+const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
+ "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
+ "IDL", "-?-"
+};
+
+const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
+ [rxrpc_skb_rx_cleaned] = "Rx CLN",
+ [rxrpc_skb_rx_freed] = "Rx FRE",
+ [rxrpc_skb_rx_got] = "Rx GOT",
+ [rxrpc_skb_rx_lost] = "Rx *L*",
+ [rxrpc_skb_rx_received] = "Rx RCV",
+ [rxrpc_skb_rx_purged] = "Rx PUR",
+ [rxrpc_skb_rx_rotated] = "Rx ROT",
+ [rxrpc_skb_rx_seen] = "Rx SEE",
+ [rxrpc_skb_tx_cleaned] = "Tx CLN",
+ [rxrpc_skb_tx_freed] = "Tx FRE",
+ [rxrpc_skb_tx_got] = "Tx GOT",
+ [rxrpc_skb_tx_lost] = "Tx *L*",
+ [rxrpc_skb_tx_new] = "Tx NEW",
+ [rxrpc_skb_tx_rotated] = "Tx ROT",
+ [rxrpc_skb_tx_seen] = "Tx SEE",
+};
+
+const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
+ [rxrpc_conn_new_client] = "NWc",
+ [rxrpc_conn_new_service] = "NWs",
+ [rxrpc_conn_queued] = "QUE",
+ [rxrpc_conn_seen] = "SEE",
+ [rxrpc_conn_got] = "GOT",
+ [rxrpc_conn_put_client] = "PTc",
+ [rxrpc_conn_put_service] = "PTs",
+};
+
+const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
+ [rxrpc_client_activate_chans] = "Activa",
+ [rxrpc_client_alloc] = "Alloc ",
+ [rxrpc_client_chan_activate] = "ChActv",
+ [rxrpc_client_chan_disconnect] = "ChDisc",
+ [rxrpc_client_chan_pass] = "ChPass",
+ [rxrpc_client_chan_unstarted] = "ChUnst",
+ [rxrpc_client_cleanup] = "Clean ",
+ [rxrpc_client_count] = "Count ",
+ [rxrpc_client_discard] = "Discar",
+ [rxrpc_client_duplicate] = "Duplic",
+ [rxrpc_client_exposed] = "Expose",
+ [rxrpc_client_replace] = "Replac",
+ [rxrpc_client_to_active] = "->Actv",
+ [rxrpc_client_to_culled] = "->Cull",
+ [rxrpc_client_to_idle] = "->Idle",
+ [rxrpc_client_to_inactive] = "->Inac",
+ [rxrpc_client_to_waiting] = "->Wait",
+ [rxrpc_client_uncount] = "Uncoun",
+};
+
+const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
+ [rxrpc_transmit_wait] = "WAI",
+ [rxrpc_transmit_queue] = "QUE",
+ [rxrpc_transmit_queue_last] = "QLS",
+ [rxrpc_transmit_rotate] = "ROT",
+ [rxrpc_transmit_rotate_last] = "RLS",
+ [rxrpc_transmit_await_reply] = "AWR",
+ [rxrpc_transmit_end] = "END",
+};
+
+const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
+ [rxrpc_receive_incoming] = "INC",
+ [rxrpc_receive_queue] = "QUE",
+ [rxrpc_receive_queue_last] = "QLS",
+ [rxrpc_receive_front] = "FRN",
+ [rxrpc_receive_rotate] = "ROT",
+ [rxrpc_receive_end] = "END",
+};
+
+const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
+ [rxrpc_recvmsg_enter] = "ENTR",
+ [rxrpc_recvmsg_wait] = "WAIT",
+ [rxrpc_recvmsg_dequeue] = "DEQU",
+ [rxrpc_recvmsg_hole] = "HOLE",
+ [rxrpc_recvmsg_next] = "NEXT",
+ [rxrpc_recvmsg_cont] = "CONT",
+ [rxrpc_recvmsg_full] = "FULL",
+ [rxrpc_recvmsg_data_return] = "DATA",
+ [rxrpc_recvmsg_terminal] = "TERM",
+ [rxrpc_recvmsg_to_be_accepted] = "TBAC",
+ [rxrpc_recvmsg_return] = "RETN",
+};
+
+const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = {
+ [rxrpc_rtt_tx_ping] = "PING",
+ [rxrpc_rtt_tx_data] = "DATA",
+};
+
+const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = {
+ [rxrpc_rtt_rx_ping_response] = "PONG",
+ [rxrpc_rtt_rx_requested_ack] = "RACK",
+};
+
+const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
+ [rxrpc_timer_begin] = "Begin ",
+ [rxrpc_timer_expired] = "*EXPR*",
+ [rxrpc_timer_init_for_reply] = "IniRpl",
+ [rxrpc_timer_set_for_ack] = "SetAck",
+ [rxrpc_timer_set_for_send] = "SetTx ",
+ [rxrpc_timer_set_for_resend] = "SetRTx",
+};
+
+const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
+ [rxrpc_propose_ack_client_tx_end] = "ClTxEnd",
+ [rxrpc_propose_ack_input_data] = "DataIn ",
+ [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck",
+ [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl",
+ [rxrpc_propose_ack_ping_for_params] = "Params ",
+ [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack",
+ [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png",
+ [rxrpc_propose_ack_retry_tx] = "RetryTx",
+ [rxrpc_propose_ack_rotate_rx] = "RxAck ",
+ [rxrpc_propose_ack_terminal_ack] = "ClTerm ",
+};
+
+const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = {
+ [rxrpc_propose_ack_use] = "",
+ [rxrpc_propose_ack_update] = " Update",
+ [rxrpc_propose_ack_subsume] = " Subsume",
+};
+
+const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = {
+ [RXRPC_CALL_SLOW_START] = "SlowStart",
+ [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid",
+ [RXRPC_CALL_PACKET_LOSS] = "PktLoss ",
+ [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ",
+};
+
+const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = {
+ [rxrpc_cong_begin_retransmission] = " Retrans",
+ [rxrpc_cong_cleared_nacks] = " Cleared",
+ [rxrpc_cong_new_low_nack] = " NewLowN",
+ [rxrpc_cong_no_change] = "",
+ [rxrpc_cong_progress] = " Progres",
+ [rxrpc_cong_retransmit_again] = " ReTxAgn",
+ [rxrpc_cong_rtt_window_end] = " RttWinE",
+ [rxrpc_cong_saw_nack] = " SawNack",
+};
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 5b5508f6fc2a..cf43a715685e 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -19,46 +19,319 @@
#include <net/af_rxrpc.h>
#include "ar-internal.h"
+struct rxrpc_pkt_buffer {
+ struct rxrpc_wire_header whdr;
+ union {
+ struct {
+ struct rxrpc_ackpacket ack;
+ u8 acks[255];
+ u8 pad[3];
+ };
+ __be32 abort_code;
+ };
+ struct rxrpc_ackinfo ackinfo;
+};
+
+/*
+ * Fill out an ACK packet.
+ */
+static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
+ struct rxrpc_pkt_buffer *pkt,
+ rxrpc_seq_t *_hard_ack,
+ rxrpc_seq_t *_top)
+{
+ rxrpc_serial_t serial;
+ rxrpc_seq_t hard_ack, top, seq;
+ int ix;
+ u32 mtu, jmax;
+ u8 *ackp = pkt->acks;
+
+ /* Barrier against rxrpc_input_data(). */
+ serial = call->ackr_serial;
+ hard_ack = READ_ONCE(call->rx_hard_ack);
+ top = smp_load_acquire(&call->rx_top);
+ *_hard_ack = hard_ack;
+ *_top = top;
+
+ pkt->ack.bufferSpace = htons(8);
+ pkt->ack.maxSkew = htons(call->ackr_skew);
+ pkt->ack.firstPacket = htonl(hard_ack + 1);
+ pkt->ack.previousPacket = htonl(call->ackr_prev_seq);
+ pkt->ack.serial = htonl(serial);
+ pkt->ack.reason = call->ackr_reason;
+ pkt->ack.nAcks = top - hard_ack;
+
+ if (pkt->ack.reason == RXRPC_ACK_PING)
+ pkt->whdr.flags |= RXRPC_REQUEST_ACK;
+
+ if (after(top, hard_ack)) {
+ seq = hard_ack + 1;
+ do {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ if (call->rxtx_buffer[ix])
+ *ackp++ = RXRPC_ACK_TYPE_ACK;
+ else
+ *ackp++ = RXRPC_ACK_TYPE_NACK;
+ seq++;
+ } while (before_eq(seq, top));
+ }
+
+ mtu = call->conn->params.peer->if_mtu;
+ mtu -= call->conn->params.peer->hdrsize;
+ jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max;
+ pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
+ pkt->ackinfo.maxMTU = htonl(mtu);
+ pkt->ackinfo.rwind = htonl(call->rx_winsize);
+ pkt->ackinfo.jumbo_max = htonl(jmax);
+
+ *ackp++ = 0;
+ *ackp++ = 0;
+ *ackp++ = 0;
+ return top - hard_ack + 3;
+}
+
+/*
+ * Send an ACK or ABORT call packet.
+ */
+int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
+{
+ struct rxrpc_connection *conn = NULL;
+ struct rxrpc_pkt_buffer *pkt;
+ struct msghdr msg;
+ struct kvec iov[2];
+ rxrpc_serial_t serial;
+ rxrpc_seq_t hard_ack, top;
+ size_t len, n;
+ bool ping = false;
+ int ioc, ret;
+ u32 abort_code;
+
+ _enter("%u,%s", call->debug_id, rxrpc_pkts[type]);
+
+ spin_lock_bh(&call->lock);
+ if (call->conn)
+ conn = rxrpc_get_connection_maybe(call->conn);
+ spin_unlock_bh(&call->lock);
+ if (!conn)
+ return -ECONNRESET;
+
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt) {
+ rxrpc_put_connection(conn);
+ return -ENOMEM;
+ }
+
+ msg.msg_name = &call->peer->srx.transport;
+ msg.msg_namelen = call->peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ pkt->whdr.epoch = htonl(conn->proto.epoch);
+ pkt->whdr.cid = htonl(call->cid);
+ pkt->whdr.callNumber = htonl(call->call_id);
+ pkt->whdr.seq = 0;
+ pkt->whdr.type = type;
+ pkt->whdr.flags = conn->out_clientflag;
+ pkt->whdr.userStatus = 0;
+ pkt->whdr.securityIndex = call->security_ix;
+ pkt->whdr._rsvd = 0;
+ pkt->whdr.serviceId = htons(call->service_id);
+
+ iov[0].iov_base = pkt;
+ iov[0].iov_len = sizeof(pkt->whdr);
+ len = sizeof(pkt->whdr);
+
+ switch (type) {
+ case RXRPC_PACKET_TYPE_ACK:
+ spin_lock_bh(&call->lock);
+ if (!call->ackr_reason) {
+ spin_unlock_bh(&call->lock);
+ ret = 0;
+ goto out;
+ }
+ ping = (call->ackr_reason == RXRPC_ACK_PING);
+ n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top);
+ call->ackr_reason = 0;
+
+ spin_unlock_bh(&call->lock);
+
+
+ pkt->whdr.flags |= RXRPC_SLOW_START_OK;
+
+ iov[0].iov_len += sizeof(pkt->ack) + n;
+ iov[1].iov_base = &pkt->ackinfo;
+ iov[1].iov_len = sizeof(pkt->ackinfo);
+ len += sizeof(pkt->ack) + n + sizeof(pkt->ackinfo);
+ ioc = 2;
+ break;
+
+ case RXRPC_PACKET_TYPE_ABORT:
+ abort_code = call->abort_code;
+ pkt->abort_code = htonl(abort_code);
+ iov[0].iov_len += sizeof(pkt->abort_code);
+ len += sizeof(pkt->abort_code);
+ ioc = 1;
+ break;
+
+ default:
+ BUG();
+ ret = -ENOANO;
+ goto out;
+ }
+
+ serial = atomic_inc_return(&conn->serial);
+ pkt->whdr.serial = htonl(serial);
+ switch (type) {
+ case RXRPC_PACKET_TYPE_ACK:
+ trace_rxrpc_tx_ack(call, serial,
+ ntohl(pkt->ack.firstPacket),
+ ntohl(pkt->ack.serial),
+ pkt->ack.reason, pkt->ack.nAcks);
+ break;
+ }
+
+ if (ping) {
+ call->ackr_ping = serial;
+ smp_wmb();
+ /* We need to stick a time in before we send the packet in case
+ * the reply gets back before kernel_sendmsg() completes - but
+ * asking UDP to send the packet can take a relatively long
+ * time, so we update the time after, on the assumption that
+ * the packet transmission is more likely to happen towards the
+ * end of the kernel_sendmsg() call.
+ */
+ call->ackr_ping_time = ktime_get_real();
+ set_bit(RXRPC_CALL_PINGING, &call->flags);
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial);
+ }
+ ret = kernel_sendmsg(conn->params.local->socket,
+ &msg, iov, ioc, len);
+ if (ping)
+ call->ackr_ping_time = ktime_get_real();
+
+ if (type == RXRPC_PACKET_TYPE_ACK &&
+ call->state < RXRPC_CALL_COMPLETE) {
+ if (ret < 0) {
+ clear_bit(RXRPC_CALL_PINGING, &call->flags);
+ rxrpc_propose_ACK(call, pkt->ack.reason,
+ ntohs(pkt->ack.maxSkew),
+ ntohl(pkt->ack.serial),
+ true, true,
+ rxrpc_propose_ack_retry_tx);
+ } else {
+ spin_lock_bh(&call->lock);
+ if (after(hard_ack, call->ackr_consumed))
+ call->ackr_consumed = hard_ack;
+ if (after(top, call->ackr_seen))
+ call->ackr_seen = top;
+ spin_unlock_bh(&call->lock);
+ }
+ }
+
+out:
+ rxrpc_put_connection(conn);
+ kfree(pkt);
+ return ret;
+}
+
/*
* send a packet through the transport endpoint
*/
-int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
+int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb)
{
- struct kvec iov[1];
+ struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct msghdr msg;
+ struct kvec iov[2];
+ rxrpc_serial_t serial;
+ size_t len;
int ret, opt;
_enter(",{%d}", skb->len);
- iov[0].iov_base = skb->head;
- iov[0].iov_len = skb->len;
+ /* Each transmission of a Tx packet needs a new serial number */
+ serial = atomic_inc_return(&conn->serial);
- msg.msg_name = &conn->params.peer->srx.transport;
- msg.msg_namelen = conn->params.peer->srx.transport_len;
+ whdr.epoch = htonl(conn->proto.epoch);
+ whdr.cid = htonl(call->cid);
+ whdr.callNumber = htonl(call->call_id);
+ whdr.seq = htonl(sp->hdr.seq);
+ whdr.serial = htonl(serial);
+ whdr.type = RXRPC_PACKET_TYPE_DATA;
+ whdr.flags = sp->hdr.flags;
+ whdr.userStatus = 0;
+ whdr.securityIndex = call->security_ix;
+ whdr._rsvd = htons(sp->hdr._rsvd);
+ whdr.serviceId = htons(call->service_id);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = skb->head;
+ iov[1].iov_len = skb->len;
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ msg.msg_name = &call->peer->srx.transport;
+ msg.msg_namelen = call->peer->srx.transport_len;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
+ /* If our RTT cache needs working on, request an ACK. Also request
+ * ACKs if a DATA packet appears to have been lost.
+ */
+ if (call->cong_mode == RXRPC_CALL_FAST_RETRANSMIT ||
+ (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) ||
+ ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+ ktime_get_real()))
+ whdr.flags |= RXRPC_REQUEST_ACK;
+
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+ if ((lose++ & 7) == 7) {
+ trace_rxrpc_tx_data(call, sp->hdr.seq, serial,
+ whdr.flags, true);
+ rxrpc_lose_skb(skb, rxrpc_skb_tx_lost);
+ _leave(" = 0 [lose]");
+ return 0;
+ }
+ }
+
+ _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq);
+
/* send the packet with the don't fragment bit set if we currently
* think it's small enough */
- if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) {
- down_read(&conn->params.local->defrag_sem);
- /* send the packet by UDP
- * - returns -EMSGSIZE if UDP would have to fragment the packet
- * to go out of the interface
- * - in which case, we'll have processed the ICMP error
- * message and update the peer record
- */
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
- iov[0].iov_len);
+ if (iov[1].iov_len >= call->peer->maxdata)
+ goto send_fragmentable;
+
+ down_read(&conn->params.local->defrag_sem);
+ /* send the packet by UDP
+ * - returns -EMSGSIZE if UDP would have to fragment the packet
+ * to go out of the interface
+ * - in which case, we'll have processed the ICMP error
+ * message and update the peer record
+ */
+ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
- up_read(&conn->params.local->defrag_sem);
- if (ret == -EMSGSIZE)
- goto send_fragmentable;
+ up_read(&conn->params.local->defrag_sem);
+ if (ret == -EMSGSIZE)
+ goto send_fragmentable;
- _leave(" = %d [%u]", ret, conn->params.peer->maxdata);
- return ret;
+done:
+ trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, false);
+ if (ret >= 0) {
+ ktime_t now = ktime_get_real();
+ skb->tstamp = now;
+ smp_wmb();
+ sp->hdr.serial = serial;
+ if (whdr.flags & RXRPC_REQUEST_ACK) {
+ call->peer->rtt_last_req = now;
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial);
+ }
}
+ _leave(" = %d [%u]", ret, call->peer->maxdata);
+ return ret;
send_fragmentable:
/* attempt to send this message with fragmentation enabled */
@@ -73,8 +346,8 @@ send_fragmentable:
SOL_IP, IP_MTU_DISCOVER,
(char *)&opt, sizeof(opt));
if (ret == 0) {
- ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1,
- iov[0].iov_len);
+ ret = kernel_sendmsg(conn->params.local->socket, &msg,
+ iov, 2, len);
opt = IP_PMTUDISC_DO;
kernel_setsockopt(conn->params.local->socket, SOL_IP,
@@ -82,9 +355,82 @@ send_fragmentable:
(char *)&opt, sizeof(opt));
}
break;
+
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ opt = IPV6_PMTUDISC_DONT;
+ ret = kernel_setsockopt(conn->params.local->socket,
+ SOL_IPV6, IPV6_MTU_DISCOVER,
+ (char *)&opt, sizeof(opt));
+ if (ret == 0) {
+ ret = kernel_sendmsg(conn->params.local->socket, &msg,
+ iov, 1, iov[0].iov_len);
+
+ opt = IPV6_PMTUDISC_DO;
+ kernel_setsockopt(conn->params.local->socket,
+ SOL_IPV6, IPV6_MTU_DISCOVER,
+ (char *)&opt, sizeof(opt));
+ }
+ break;
+#endif
}
up_write(&conn->params.local->defrag_sem);
- _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata);
- return ret;
+ goto done;
+}
+
+/*
+ * reject packets through the local endpoint
+ */
+void rxrpc_reject_packets(struct rxrpc_local *local)
+{
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_wire_header whdr;
+ struct sk_buff *skb;
+ struct msghdr msg;
+ struct kvec iov[2];
+ size_t size;
+ __be32 code;
+
+ _enter("%d", local->debug_id);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
+ iov[1].iov_base = &code;
+ iov[1].iov_len = sizeof(code);
+ size = sizeof(whdr) + sizeof(code);
+
+ msg.msg_name = &srx.transport;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ memset(&whdr, 0, sizeof(whdr));
+ whdr.type = RXRPC_PACKET_TYPE_ABORT;
+
+ while ((skb = skb_dequeue(&local->reject_queue))) {
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
+ sp = rxrpc_skb(skb);
+
+ if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) {
+ msg.msg_namelen = srx.transport_len;
+
+ code = htonl(skb->priority);
+
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.serviceId = htons(sp->hdr.serviceId);
+ whdr.flags = sp->hdr.flags;
+ whdr.flags ^= RXRPC_CLIENT_INITIATED;
+ whdr.flags &= RXRPC_CLIENT_INITIATED;
+
+ kernel_sendmsg(local->socket, &msg, iov, 2, size);
+ }
+
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ }
+
+ _leave("");
}
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 27b9ecad007e..bf13b8470c9a 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -66,6 +66,32 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
}
break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ srx.transport.sin6.sin6_port = serr->port;
+ srx.transport_len = sizeof(struct sockaddr_in6);
+ switch (serr->ee.ee_origin) {
+ case SO_EE_ORIGIN_ICMP6:
+ _net("Rx ICMP6");
+ memcpy(&srx.transport.sin6.sin6_addr,
+ skb_network_header(skb) + serr->addr_offset,
+ sizeof(struct in6_addr));
+ break;
+ case SO_EE_ORIGIN_ICMP:
+ _net("Rx ICMP on v6 sock");
+ memcpy(srx.transport.sin6.sin6_addr.s6_addr + 12,
+ skb_network_header(skb) + serr->addr_offset,
+ sizeof(struct in_addr));
+ break;
+ default:
+ memcpy(&srx.transport.sin6.sin6_addr,
+ &ipv6_hdr(skb)->saddr,
+ sizeof(struct in6_addr));
+ break;
+ }
+ break;
+#endif
+
default:
BUG();
}
@@ -129,22 +155,21 @@ void rxrpc_error_report(struct sock *sk)
_leave("UDP socket errqueue empty");
return;
}
+ rxrpc_new_skb(skb, rxrpc_skb_rx_received);
serr = SKB_EXT_ERR(skb);
if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
_leave("UDP empty message");
- kfree_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
return;
}
- rxrpc_new_skb(skb);
-
rcu_read_lock();
peer = rxrpc_lookup_peer_icmp_rcu(local, skb);
if (peer && !rxrpc_get_peer_maybe(peer))
peer = NULL;
if (!peer) {
rcu_read_unlock();
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
_leave(" [no peer]");
return;
}
@@ -154,7 +179,7 @@ void rxrpc_error_report(struct sock *sk)
serr->ee.ee_code == ICMP_FRAG_NEEDED)) {
rxrpc_adjust_mtu(peer, serr);
rcu_read_unlock();
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
rxrpc_put_peer(peer);
_leave(" [MTU update]");
return;
@@ -162,7 +187,7 @@ void rxrpc_error_report(struct sock *sk)
rxrpc_store_error(peer, serr);
rcu_read_unlock();
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
/* The ref we obtained is passed off to the work item */
rxrpc_queue_work(&peer->error_distributor);
@@ -249,7 +274,6 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
container_of(work, struct rxrpc_peer, error_distributor);
struct rxrpc_call *call;
enum rxrpc_call_completion compl;
- bool queue;
int error;
_enter("");
@@ -272,15 +296,8 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
hlist_del_init(&call->error_link);
rxrpc_see_call(call);
- queue = false;
- write_lock(&call->state_lock);
- if (__rxrpc_set_call_completion(call, compl, 0, error)) {
- set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
- queue = true;
- }
- write_unlock(&call->state_lock);
- if (queue)
- rxrpc_queue_call(call);
+ if (rxrpc_set_call_completion(call, compl, 0, error))
+ rxrpc_notify_socket(call);
}
spin_unlock_bh(&peer->lock);
@@ -288,3 +305,44 @@ void rxrpc_peer_error_distributor(struct work_struct *work)
rxrpc_put_peer(peer);
_leave("");
}
+
+/*
+ * Add RTT information to cache. This is called in softirq mode and has
+ * exclusive access to the peer RTT data.
+ */
+void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+ rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
+ ktime_t send_time, ktime_t resp_time)
+{
+ struct rxrpc_peer *peer = call->peer;
+ s64 rtt;
+ u64 sum = peer->rtt_sum, avg;
+ u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage;
+
+ rtt = ktime_to_ns(ktime_sub(resp_time, send_time));
+ if (rtt < 0)
+ return;
+
+ /* Replace the oldest datum in the RTT buffer */
+ sum -= peer->rtt_cache[cursor];
+ sum += rtt;
+ peer->rtt_cache[cursor] = rtt;
+ peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1);
+ peer->rtt_sum = sum;
+ if (usage < RXRPC_RTT_CACHE_SIZE) {
+ usage++;
+ peer->rtt_usage = usage;
+ }
+
+ /* Now recalculate the average */
+ if (usage == RXRPC_RTT_CACHE_SIZE) {
+ avg = sum / RXRPC_RTT_CACHE_SIZE;
+ } else {
+ avg = sum;
+ do_div(avg, usage);
+ }
+
+ peer->rtt = avg;
+ trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt,
+ usage, avg);
+}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index aebc73ac16dc..941b724d523b 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -16,12 +16,14 @@
#include <linux/skbuff.h>
#include <linux/udp.h>
#include <linux/in.h>
+#include <linux/in6.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include <net/ip.h>
#include <net/route.h>
+#include <net/ip6_route.h>
#include "ar-internal.h"
static DEFINE_HASHTABLE(rxrpc_peer_hash, 10);
@@ -50,6 +52,13 @@ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local,
size = sizeof(srx->transport.sin.sin_addr);
p = (u16 *)&srx->transport.sin.sin_addr;
break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ hash_key += (u16 __force)srx->transport.sin.sin_port;
+ size = sizeof(srx->transport.sin6.sin6_addr);
+ p = (u16 *)&srx->transport.sin6.sin6_addr;
+ break;
+#endif
default:
WARN(1, "AF_RXRPC: Unsupported transport address family\n");
return 0;
@@ -93,6 +102,14 @@ static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer,
memcmp(&peer->srx.transport.sin.sin_addr,
&srx->transport.sin.sin_addr,
sizeof(struct in_addr));
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ return ((u16 __force)peer->srx.transport.sin6.sin6_port -
+ (u16 __force)srx->transport.sin6.sin6_port) ?:
+ memcmp(&peer->srx.transport.sin6.sin6_addr,
+ &srx->transport.sin6.sin6_addr,
+ sizeof(struct in6_addr));
+#endif
default:
BUG();
}
@@ -130,17 +147,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
if (peer) {
- switch (srx->transport.family) {
- case AF_INET:
- _net("PEER %d {%d,%u,%pI4+%hu}",
- peer->debug_id,
- peer->srx.transport_type,
- peer->srx.transport.family,
- &peer->srx.transport.sin.sin_addr,
- ntohs(peer->srx.transport.sin.sin_port));
- break;
- }
-
+ _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport);
_leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
}
return peer;
@@ -152,22 +159,53 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
*/
static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
{
+ struct dst_entry *dst;
struct rtable *rt;
- struct flowi4 fl4;
+ struct flowi fl;
+ struct flowi4 *fl4 = &fl.u.ip4;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ struct flowi6 *fl6 = &fl.u.ip6;
+#endif
peer->if_mtu = 1500;
- rt = ip_route_output_ports(&init_net, &fl4, NULL,
- peer->srx.transport.sin.sin_addr.s_addr, 0,
- htons(7000), htons(7001),
- IPPROTO_UDP, 0, 0);
- if (IS_ERR(rt)) {
- _leave(" [route err %ld]", PTR_ERR(rt));
- return;
+ memset(&fl, 0, sizeof(fl));
+ switch (peer->srx.transport.family) {
+ case AF_INET:
+ rt = ip_route_output_ports(
+ &init_net, fl4, NULL,
+ peer->srx.transport.sin.sin_addr.s_addr, 0,
+ htons(7000), htons(7001), IPPROTO_UDP, 0, 0);
+ if (IS_ERR(rt)) {
+ _leave(" [route err %ld]", PTR_ERR(rt));
+ return;
+ }
+ dst = &rt->dst;
+ break;
+
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ fl6->flowi6_iif = LOOPBACK_IFINDEX;
+ fl6->flowi6_scope = RT_SCOPE_UNIVERSE;
+ fl6->flowi6_proto = IPPROTO_UDP;
+ memcpy(&fl6->daddr, &peer->srx.transport.sin6.sin6_addr,
+ sizeof(struct in6_addr));
+ fl6->fl6_dport = htons(7001);
+ fl6->fl6_sport = htons(7000);
+ dst = ip6_route_output(&init_net, NULL, fl6);
+ if (IS_ERR(dst)) {
+ _leave(" [route err %ld]", PTR_ERR(dst));
+ return;
+ }
+ break;
+#endif
+
+ default:
+ BUG();
}
- peer->if_mtu = dst_mtu(&rt->dst);
- dst_release(&rt->dst);
+ peer->if_mtu = dst_mtu(dst);
+ dst_release(dst);
_leave(" [if_mtu %u]", peer->if_mtu);
}
@@ -199,6 +237,41 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
}
/*
+ * Initialise peer record.
+ */
+static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key)
+{
+ peer->hash_key = hash_key;
+ rxrpc_assess_MTU_size(peer);
+ peer->mtu = peer->if_mtu;
+ peer->rtt_last_req = ktime_get_real();
+
+ switch (peer->srx.transport.family) {
+ case AF_INET:
+ peer->hdrsize = sizeof(struct iphdr);
+ break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+ case AF_INET6:
+ peer->hdrsize = sizeof(struct ipv6hdr);
+ break;
+#endif
+ default:
+ BUG();
+ }
+
+ switch (peer->srx.transport_type) {
+ case SOCK_DGRAM:
+ peer->hdrsize += sizeof(struct udphdr);
+ break;
+ default:
+ BUG();
+ }
+
+ peer->hdrsize += sizeof(struct rxrpc_wire_header);
+ peer->maxdata = peer->mtu - peer->hdrsize;
+}
+
+/*
* Set up a new peer.
*/
static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
@@ -212,31 +285,40 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
peer = rxrpc_alloc_peer(local, gfp);
if (peer) {
- peer->hash_key = hash_key;
memcpy(&peer->srx, srx, sizeof(*srx));
+ rxrpc_init_peer(peer, hash_key);
+ }
- rxrpc_assess_MTU_size(peer);
- peer->mtu = peer->if_mtu;
-
- if (srx->transport.family == AF_INET) {
- peer->hdrsize = sizeof(struct iphdr);
- switch (srx->transport_type) {
- case SOCK_DGRAM:
- peer->hdrsize += sizeof(struct udphdr);
- break;
- default:
- BUG();
- break;
- }
- } else {
- BUG();
- }
+ _leave(" = %p", peer);
+ return peer;
+}
+
+/*
+ * Set up a new incoming peer. The address is prestored in the preallocated
+ * peer.
+ */
+struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local,
+ struct rxrpc_peer *prealloc)
+{
+ struct rxrpc_peer *peer;
+ unsigned long hash_key;
+
+ hash_key = rxrpc_peer_hash_key(local, &prealloc->srx);
+ prealloc->local = local;
+ rxrpc_init_peer(prealloc, hash_key);
- peer->hdrsize += sizeof(struct rxrpc_wire_header);
- peer->maxdata = peer->mtu - peer->hdrsize;
+ spin_lock(&rxrpc_peer_hash_lock);
+
+ /* Need to check that we aren't racing with someone else */
+ peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key);
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ if (!peer) {
+ peer = prealloc;
+ hash_add_rcu(rxrpc_peer_hash, &peer->hash_link, hash_key);
}
- _leave(" = %p", peer);
+ spin_unlock(&rxrpc_peer_hash_lock);
return peer;
}
@@ -249,11 +331,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
struct rxrpc_peer *peer, *candidate;
unsigned long hash_key = rxrpc_peer_hash_key(local, srx);
- _enter("{%d,%d,%pI4+%hu}",
- srx->transport_type,
- srx->transport_len,
- &srx->transport.sin.sin_addr,
- ntohs(srx->transport.sin.sin_port));
+ _enter("{%pISp}", &srx->transport);
/* search the peer list first */
rcu_read_lock();
@@ -272,7 +350,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
return NULL;
}
- spin_lock(&rxrpc_peer_hash_lock);
+ spin_lock_bh(&rxrpc_peer_hash_lock);
/* Need to check that we aren't racing with someone else */
peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
@@ -282,7 +360,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
hash_add_rcu(rxrpc_peer_hash,
&candidate->hash_link, hash_key);
- spin_unlock(&rxrpc_peer_hash_lock);
+ spin_unlock_bh(&rxrpc_peer_hash_lock);
if (peer)
kfree(candidate);
@@ -290,11 +368,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
peer = candidate;
}
- _net("PEER %d {%d,%pI4+%hu}",
- peer->debug_id,
- peer->srx.transport_type,
- &peer->srx.transport.sin.sin_addr,
- ntohs(peer->srx.transport.sin.sin_port));
+ _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport);
_leave(" = %p {u=%d}", peer, atomic_read(&peer->usage));
return peer;
@@ -307,9 +381,9 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer)
{
ASSERT(hlist_empty(&peer->error_targets));
- spin_lock(&rxrpc_peer_hash_lock);
+ spin_lock_bh(&rxrpc_peer_hash_lock);
hash_del_rcu(&peer->hash_link);
- spin_unlock(&rxrpc_peer_hash_lock);
+ spin_unlock_bh(&rxrpc_peer_hash_lock);
kfree_rcu(peer, rcu);
}
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 82c64055449d..65cd980767fa 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -17,6 +17,7 @@
static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = {
[RXRPC_CONN_UNUSED] = "Unused ",
[RXRPC_CONN_CLIENT] = "Client ",
+ [RXRPC_CONN_SERVICE_PREALLOC] = "SvPrealc",
[RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ",
[RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ",
[RXRPC_CONN_SERVICE] = "SvSecure",
@@ -29,6 +30,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = {
*/
static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos)
{
+ rcu_read_lock();
read_lock(&rxrpc_call_lock);
return seq_list_start_head(&rxrpc_calls, *_pos);
}
@@ -41,6 +43,7 @@ static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
{
read_unlock(&rxrpc_call_lock);
+ rcu_read_unlock();
}
static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
@@ -49,11 +52,12 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
struct rxrpc_sock *rx;
struct rxrpc_peer *peer;
struct rxrpc_call *call;
- char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+ char lbuff[50], rbuff[50];
if (v == &rxrpc_calls) {
seq_puts(seq,
- "Proto Local Remote "
+ "Proto Local "
+ " Remote "
" SvID ConnID CallID End Use State Abort "
" UserID\n");
return 0;
@@ -61,13 +65,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
call = list_entry(v, struct rxrpc_call, link);
- rx = READ_ONCE(call->socket);
+ rx = rcu_dereference(call->socket);
if (rx) {
local = READ_ONCE(rx->local);
if (local)
- sprintf(lbuff, "%pI4:%u",
- &local->srx.transport.sin.sin_addr,
- ntohs(local->srx.transport.sin.sin_port));
+ sprintf(lbuff, "%pISpc", &local->srx.transport);
else
strcpy(lbuff, "no_local");
} else {
@@ -76,14 +78,12 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
peer = call->peer;
if (peer)
- sprintf(rbuff, "%pI4:%u",
- &peer->srx.transport.sin.sin_addr,
- ntohs(peer->srx.transport.sin.sin_port));
+ sprintf(rbuff, "%pISpc", &peer->srx.transport);
else
strcpy(rbuff, "no_connection");
seq_printf(seq,
- "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u"
+ "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
" %-8.8s %08x %lx\n",
lbuff,
rbuff,
@@ -142,11 +142,12 @@ static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
{
struct rxrpc_connection *conn;
- char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+ char lbuff[50], rbuff[50];
if (v == &rxrpc_connection_proc_list) {
seq_puts(seq,
- "Proto Local Remote "
+ "Proto Local "
+ " Remote "
" SvID ConnID End Use State Key "
" Serial ISerial\n"
);
@@ -154,17 +155,18 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
}
conn = list_entry(v, struct rxrpc_connection, proc_link);
+ if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) {
+ strcpy(lbuff, "no_local");
+ strcpy(rbuff, "no_connection");
+ goto print;
+ }
- sprintf(lbuff, "%pI4:%u",
- &conn->params.local->srx.transport.sin.sin_addr,
- ntohs(conn->params.local->srx.transport.sin.sin_port));
-
- sprintf(rbuff, "%pI4:%u",
- &conn->params.peer->srx.transport.sin.sin_addr,
- ntohs(conn->params.peer->srx.transport.sin.sin_port));
+ sprintf(lbuff, "%pISpc", &conn->params.local->srx.transport);
+ sprintf(rbuff, "%pISpc", &conn->params.peer->srx.transport);
+print:
seq_printf(seq,
- "UDP %-22.22s %-22.22s %4x %08x %s %3u"
+ "UDP %-47.47s %-47.47s %4x %08x %s %3u"
" %s %08x %08x %08x\n",
lbuff,
rbuff,
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 0ab7b334bab1..038ae62ddb4d 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -19,427 +19,545 @@
#include "ar-internal.h"
/*
- * removal a call's user ID from the socket tree to make the user ID available
- * again and so that it won't be seen again in association with that call
+ * Post a call for attention by the socket or kernel service. Further
+ * notifications are suppressed by putting recvmsg_link on a dummy queue.
*/
-void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call)
+void rxrpc_notify_socket(struct rxrpc_call *call)
{
- _debug("RELEASE CALL %d", call->debug_id);
+ struct rxrpc_sock *rx;
+ struct sock *sk;
- if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
- write_lock_bh(&rx->call_lock);
- rb_erase(&call->sock_node, &call->socket->calls);
- clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
- write_unlock_bh(&rx->call_lock);
+ _enter("%d", call->debug_id);
+
+ if (!list_empty(&call->recvmsg_link))
+ return;
+
+ rcu_read_lock();
+
+ rx = rcu_dereference(call->socket);
+ sk = &rx->sk;
+ if (rx && sk->sk_state < RXRPC_CLOSE) {
+ if (call->notify_rx) {
+ call->notify_rx(sk, call, call->user_call_ID);
+ } else {
+ write_lock_bh(&rx->recvmsg_lock);
+ if (list_empty(&call->recvmsg_link)) {
+ rxrpc_get_call(call, rxrpc_call_got);
+ list_add_tail(&call->recvmsg_link, &rx->recvmsg_q);
+ }
+ write_unlock_bh(&rx->recvmsg_lock);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ _debug("call %ps", sk->sk_data_ready);
+ sk->sk_data_ready(sk);
+ }
+ }
}
- read_lock_bh(&call->state_lock);
- if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
- rxrpc_queue_call(call);
- read_unlock_bh(&call->state_lock);
+ rcu_read_unlock();
+ _leave("");
}
/*
- * receive a message from an RxRPC socket
- * - we need to be careful about two or more threads calling recvmsg
- * simultaneously
+ * Pass a call terminating message to userspace.
*/
-int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
- int flags)
+static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg)
{
- struct rxrpc_skb_priv *sp;
- struct rxrpc_call *call = NULL, *continue_call = NULL;
- struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- struct sk_buff *skb;
- long timeo;
- int copy, ret, ullen, offset, copied = 0;
- u32 abort_code;
+ u32 tmp = 0;
+ int ret;
- DEFINE_WAIT(wait);
+ switch (call->completion) {
+ case RXRPC_CALL_SUCCEEDED:
+ ret = 0;
+ if (rxrpc_is_service_call(call))
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp);
+ break;
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ tmp = call->abort_code;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp);
+ break;
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ tmp = call->abort_code;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp);
+ break;
+ case RXRPC_CALL_NETWORK_ERROR:
+ tmp = call->error;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp);
+ break;
+ case RXRPC_CALL_LOCAL_ERROR:
+ tmp = call->error;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp);
+ break;
+ default:
+ pr_err("Invalid terminal call state %u\n", call->state);
+ BUG();
+ break;
+ }
- _enter(",,,%zu,%d", len, flags);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack,
+ call->rx_pkt_offset, call->rx_pkt_len, ret);
+ return ret;
+}
- if (flags & (MSG_OOB | MSG_TRUNC))
- return -EOPNOTSUPP;
+/*
+ * Pass back notification of a new call. The call is added to the
+ * to-be-accepted list. This means that the next call to be accepted might not
+ * be the last call seen awaiting acceptance, but unless we leave this on the
+ * front of the queue and block all other messages until someone gives us a
+ * user_ID for it, there's not a lot we can do.
+ */
+static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct msghdr *msg, int flags)
+{
+ int tmp = 0, ret;
- ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long);
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &tmp);
- timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT);
- msg->msg_flags |= MSG_MORE;
+ if (ret == 0 && !(flags & MSG_PEEK)) {
+ _debug("to be accepted");
+ write_lock_bh(&rx->recvmsg_lock);
+ list_del_init(&call->recvmsg_link);
+ write_unlock_bh(&rx->recvmsg_lock);
- lock_sock(&rx->sk);
+ rxrpc_get_call(call, rxrpc_call_got);
+ write_lock(&rx->call_lock);
+ list_add_tail(&call->accept_link, &rx->to_be_accepted);
+ write_unlock(&rx->call_lock);
+ }
- for (;;) {
- /* return immediately if a client socket has no outstanding
- * calls */
- if (RB_EMPTY_ROOT(&rx->calls)) {
- if (copied)
- goto out;
- if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
- release_sock(&rx->sk);
- if (continue_call)
- rxrpc_put_call(continue_call);
- return -ENODATA;
- }
- }
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret);
+ return ret;
+}
- /* get the next message on the Rx queue */
- skb = skb_peek(&rx->sk.sk_receive_queue);
- if (!skb) {
- /* nothing remains on the queue */
- if (copied &&
- (flags & MSG_PEEK || timeo == 0))
- goto out;
+/*
+ * End the packet reception phase.
+ */
+static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
+{
+ _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]);
- /* wait for a message to turn up */
- release_sock(&rx->sk);
- prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
- TASK_INTERRUPTIBLE);
- ret = sock_error(&rx->sk);
- if (ret)
- goto wait_error;
-
- if (skb_queue_empty(&rx->sk.sk_receive_queue)) {
- if (signal_pending(current))
- goto wait_interrupted;
- timeo = schedule_timeout(timeo);
- }
- finish_wait(sk_sleep(&rx->sk), &wait);
- lock_sock(&rx->sk);
- continue;
- }
+ trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top);
+ ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
- peek_next_packet:
- rxrpc_see_skb(skb);
- sp = rxrpc_skb(skb);
- call = sp->call;
- ASSERT(call != NULL);
- rxrpc_see_call(call);
-
- _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]);
-
- /* make sure we wait for the state to be updated in this call */
- spin_lock_bh(&call->lock);
- spin_unlock_bh(&call->lock);
-
- if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
- _debug("packet from released call");
- if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
- BUG();
- rxrpc_free_skb(skb);
- continue;
- }
+ if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
+ rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
+ rxrpc_propose_ack_terminal_ack);
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+ }
- /* determine whether to continue last data receive */
- if (continue_call) {
- _debug("maybe cont");
- if (call != continue_call ||
- skb->mark != RXRPC_SKB_MARK_DATA) {
- release_sock(&rx->sk);
- rxrpc_put_call(continue_call);
- _leave(" = %d [noncont]", copied);
- return copied;
- }
- }
+ write_lock_bh(&call->state_lock);
- rxrpc_get_call(call);
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_RECV_REPLY:
+ __rxrpc_call_completed(call);
+ break;
- /* copy the peer address and timestamp */
- if (!continue_call) {
- if (msg->msg_name) {
- size_t len =
- sizeof(call->conn->params.peer->srx);
- memcpy(msg->msg_name,
- &call->conn->params.peer->srx, len);
- msg->msg_namelen = len;
- }
- sock_recv_timestamp(msg, &rx->sk, skb);
- }
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ call->tx_phase = true;
+ call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
+ break;
+ default:
+ break;
+ }
- /* receive the message */
- if (skb->mark != RXRPC_SKB_MARK_DATA)
- goto receive_non_data_message;
+ write_unlock_bh(&call->state_lock);
+}
- _debug("recvmsg DATA #%u { %d, %d }",
- sp->hdr.seq, skb->len, sp->offset);
+/*
+ * Discard a packet we've used up and advance the Rx window by one.
+ */
+static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ rxrpc_serial_t serial;
+ rxrpc_seq_t hard_ack, top;
+ u8 flags;
+ int ix;
- if (!continue_call) {
- /* only set the control data once per recvmsg() */
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
- ullen, &call->user_call_ID);
- if (ret < 0)
- goto copy_error;
- ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
- }
+ _enter("%d", call->debug_id);
- ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
- ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
- call->rx_data_recv = sp->hdr.seq;
+ hard_ack = call->rx_hard_ack;
+ top = smp_load_acquire(&call->rx_top);
+ ASSERT(before(hard_ack, top));
- ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
+ hard_ack++;
+ ix = hard_ack & RXRPC_RXTX_BUFF_MASK;
+ skb = call->rxtx_buffer[ix];
+ rxrpc_see_skb(skb, rxrpc_skb_rx_rotated);
+ sp = rxrpc_skb(skb);
+ flags = sp->hdr.flags;
+ serial = sp->hdr.serial;
+ if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO)
+ serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1;
+
+ call->rxtx_buffer[ix] = NULL;
+ call->rxtx_annotations[ix] = 0;
+ /* Barrier against rxrpc_input_data(). */
+ smp_store_release(&call->rx_hard_ack, hard_ack);
+
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+
+ _debug("%u,%u,%02x", hard_ack, top, flags);
+ trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack);
+ if (flags & RXRPC_LAST_PACKET) {
+ rxrpc_end_rx_phase(call, serial);
+ } else {
+ /* Check to see if there's an ACK that needs sending. */
+ if (after_eq(hard_ack, call->ackr_consumed + 2) ||
+ after_eq(top, call->ackr_seen + 2) ||
+ (hard_ack == top && after(hard_ack, call->ackr_consumed)))
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial,
+ true, false,
+ rxrpc_propose_ack_rotate_rx);
+ if (call->ackr_reason)
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+ }
+}
- offset = sp->offset;
- copy = skb->len - offset;
- if (copy > len - copied)
- copy = len - copied;
+/*
+ * Decrypt and verify a (sub)packet. The packet's length may be changed due to
+ * padding, but if this is the case, the packet length will be resident in the
+ * socket buffer. Note that we can't modify the master skb info as the skb may
+ * be the home to multiple subpackets.
+ */
+static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ u8 annotation,
+ unsigned int offset, unsigned int len)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ rxrpc_seq_t seq = sp->hdr.seq;
+ u16 cksum = sp->hdr.cksum;
+
+ _enter("");
+
+ /* For all but the head jumbo subpacket, the security checksum is in a
+ * jumbo header immediately prior to the data.
+ */
+ if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) {
+ __be16 tmp;
+ if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0)
+ BUG();
+ cksum = ntohs(tmp);
+ seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1;
+ }
- ret = skb_copy_datagram_msg(skb, offset, msg, copy);
+ return call->conn->security->verify_packet(call, skb, offset, len,
+ seq, cksum);
+}
+/*
+ * Locate the data within a packet. This is complicated by:
+ *
+ * (1) An skb may contain a jumbo packet - so we have to find the appropriate
+ * subpacket.
+ *
+ * (2) The (sub)packets may be encrypted and, if so, the encrypted portion
+ * contains an extra header which includes the true length of the data,
+ * excluding any encrypted padding.
+ */
+static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
+ u8 *_annotation,
+ unsigned int *_offset, unsigned int *_len)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ unsigned int offset = *_offset;
+ unsigned int len = *_len;
+ int ret;
+ u8 annotation = *_annotation;
+
+ /* Locate the subpacket */
+ offset = sp->offset;
+ len = skb->len - sp->offset;
+ if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) {
+ offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) *
+ RXRPC_JUMBO_SUBPKTLEN);
+ len = (annotation & RXRPC_RX_ANNO_JLAST) ?
+ skb->len - offset : RXRPC_JUMBO_SUBPKTLEN;
+ }
+
+ if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) {
+ ret = rxrpc_verify_packet(call, skb, annotation, offset, len);
if (ret < 0)
- goto copy_error;
+ return ret;
+ *_annotation |= RXRPC_RX_ANNO_VERIFIED;
+ }
+
+ *_offset = offset;
+ *_len = len;
+ call->conn->security->locate_data(call, skb, _offset, _len);
+ return 0;
+}
- /* handle piecemeal consumption of data packets */
- _debug("copied %d+%d", copy, copied);
+/*
+ * Deliver messages to a call. This keeps processing packets until the buffer
+ * is filled and we find either more DATA (returns 0) or the end of the DATA
+ * (returns 1). If more packets are required, it returns -EAGAIN.
+ */
+static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
+ struct msghdr *msg, struct iov_iter *iter,
+ size_t len, int flags, size_t *_offset)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ rxrpc_seq_t hard_ack, top, seq;
+ size_t remain;
+ bool last;
+ unsigned int rx_pkt_offset, rx_pkt_len;
+ int ix, copy, ret = -EAGAIN, ret2;
- offset += copy;
- copied += copy;
+ rx_pkt_offset = call->rx_pkt_offset;
+ rx_pkt_len = call->rx_pkt_len;
- if (!(flags & MSG_PEEK))
- sp->offset = offset;
+ if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) {
+ seq = call->rx_hard_ack;
+ ret = 1;
+ goto done;
+ }
- if (sp->offset < skb->len) {
- _debug("buffer full");
- ASSERTCMP(copied, ==, len);
+ /* Barriers against rxrpc_input_data(). */
+ hard_ack = call->rx_hard_ack;
+ top = smp_load_acquire(&call->rx_top);
+ for (seq = hard_ack + 1; before_eq(seq, top); seq++) {
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ skb = call->rxtx_buffer[ix];
+ if (!skb) {
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
break;
}
+ smp_rmb();
+ rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
+ sp = rxrpc_skb(skb);
- /* we transferred the whole data packet */
if (!(flags & MSG_PEEK))
- rxrpc_kernel_data_consumed(call, skb);
-
- if (sp->hdr.flags & RXRPC_LAST_PACKET) {
- _debug("last");
- if (rxrpc_conn_is_client(call->conn)) {
- /* last byte of reply received */
- ret = copied;
- goto terminal_message;
+ trace_rxrpc_receive(call, rxrpc_receive_front,
+ sp->hdr.serial, seq);
+
+ if (msg)
+ sock_recv_timestamp(msg, sock->sk, skb);
+
+ if (rx_pkt_offset == 0) {
+ ret2 = rxrpc_locate_data(call, skb,
+ &call->rxtx_annotations[ix],
+ &rx_pkt_offset, &rx_pkt_len);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq,
+ rx_pkt_offset, rx_pkt_len, ret2);
+ if (ret2 < 0) {
+ ret = ret2;
+ goto out;
}
+ } else {
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
+ }
- /* last bit of request received */
- if (!(flags & MSG_PEEK)) {
- _debug("eat packet");
- if (skb_dequeue(&rx->sk.sk_receive_queue) !=
- skb)
- BUG();
- rxrpc_free_skb(skb);
+ /* We have to handle short, empty and used-up DATA packets. */
+ remain = len - *_offset;
+ copy = rx_pkt_len;
+ if (copy > remain)
+ copy = remain;
+ if (copy > 0) {
+ ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter,
+ copy);
+ if (ret2 < 0) {
+ ret = ret2;
+ goto out;
}
- msg->msg_flags &= ~MSG_MORE;
- break;
- }
- /* move on to the next data message */
- _debug("next");
- if (!continue_call)
- continue_call = sp->call;
- else
- rxrpc_put_call(call);
- call = NULL;
-
- if (flags & MSG_PEEK) {
- _debug("peek next");
- skb = skb->next;
- if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue)
- break;
- goto peek_next_packet;
+ /* handle piecemeal consumption of data packets */
+ rx_pkt_offset += copy;
+ rx_pkt_len -= copy;
+ *_offset += copy;
}
- _debug("eat packet");
- if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
- BUG();
- rxrpc_free_skb(skb);
- }
-
- /* end of non-terminal data packet reception for the moment */
- _debug("end rcv data");
-out:
- release_sock(&rx->sk);
- if (call)
- rxrpc_put_call(call);
- if (continue_call)
- rxrpc_put_call(continue_call);
- _leave(" = %d [data]", copied);
- return copied;
-
- /* handle non-DATA messages such as aborts, incoming connections and
- * final ACKs */
-receive_non_data_message:
- _debug("non-data");
-
- if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) {
- _debug("RECV NEW CALL");
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code);
- if (ret < 0)
- goto copy_error;
- if (!(flags & MSG_PEEK)) {
- if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
- BUG();
- rxrpc_free_skb(skb);
+ if (rx_pkt_len > 0) {
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq,
+ rx_pkt_offset, rx_pkt_len, 0);
+ ASSERTCMP(*_offset, ==, len);
+ ret = 0;
+ break;
}
- goto out;
- }
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
- ullen, &call->user_call_ID);
- if (ret < 0)
- goto copy_error;
- ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+ /* The whole packet has been transferred. */
+ last = sp->hdr.flags & RXRPC_LAST_PACKET;
+ if (!(flags & MSG_PEEK))
+ rxrpc_rotate_rx_window(call);
+ rx_pkt_offset = 0;
+ rx_pkt_len = 0;
- switch (skb->mark) {
- case RXRPC_SKB_MARK_DATA:
- BUG();
- case RXRPC_SKB_MARK_FINAL_ACK:
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code);
- break;
- case RXRPC_SKB_MARK_BUSY:
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code);
- break;
- case RXRPC_SKB_MARK_REMOTE_ABORT:
- abort_code = call->abort_code;
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
- break;
- case RXRPC_SKB_MARK_LOCAL_ABORT:
- abort_code = call->abort_code;
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
- if (call->error) {
- abort_code = call->error;
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4,
- &abort_code);
+ if (last) {
+ ASSERTCMP(seq, ==, READ_ONCE(call->rx_top));
+ ret = 1;
+ goto out;
}
- break;
- case RXRPC_SKB_MARK_NET_ERROR:
- _debug("RECV NET ERROR %d", sp->error);
- abort_code = sp->error;
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code);
- break;
- case RXRPC_SKB_MARK_LOCAL_ERROR:
- _debug("RECV LOCAL ERROR %d", sp->error);
- abort_code = sp->error;
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4,
- &abort_code);
- break;
- default:
- pr_err("Unknown packet mark %u\n", skb->mark);
- BUG();
- break;
}
- if (ret < 0)
- goto copy_error;
-
-terminal_message:
- _debug("terminal");
- msg->msg_flags &= ~MSG_MORE;
- msg->msg_flags |= MSG_EOR;
-
+out:
if (!(flags & MSG_PEEK)) {
- _net("free terminal skb %p", skb);
- if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
- BUG();
- rxrpc_free_skb(skb);
- rxrpc_remove_user_ID(rx, call);
+ call->rx_pkt_offset = rx_pkt_offset;
+ call->rx_pkt_len = rx_pkt_len;
}
-
- release_sock(&rx->sk);
- rxrpc_put_call(call);
- if (continue_call)
- rxrpc_put_call(continue_call);
- _leave(" = %d", ret);
+done:
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
+ rx_pkt_offset, rx_pkt_len, ret);
return ret;
-
-copy_error:
- _debug("copy error");
- release_sock(&rx->sk);
- rxrpc_put_call(call);
- if (continue_call)
- rxrpc_put_call(continue_call);
- _leave(" = %d", ret);
- return ret;
-
-wait_interrupted:
- ret = sock_intr_errno(timeo);
-wait_error:
- finish_wait(sk_sleep(&rx->sk), &wait);
- if (continue_call)
- rxrpc_put_call(continue_call);
- if (copied)
- copied = ret;
- _leave(" = %d [waitfail %d]", copied, ret);
- return copied;
-
}
/*
- * Deliver messages to a call. This keeps processing packets until the buffer
- * is filled and we find either more DATA (returns 0) or the end of the DATA
- * (returns 1). If more packets are required, it returns -EAGAIN.
- *
- * TODO: Note that this is hacked in at the moment and will be replaced.
+ * Receive a message from an RxRPC socket
+ * - we need to be careful about two or more threads calling recvmsg
+ * simultaneously
*/
-static int temp_deliver_data(struct socket *sock, struct rxrpc_call *call,
- struct iov_iter *iter, size_t size,
- size_t *_offset)
+int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
{
- struct rxrpc_skb_priv *sp;
- struct sk_buff *skb;
- size_t remain;
- int ret, copy;
+ struct rxrpc_call *call;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ struct list_head *l;
+ size_t copied = 0;
+ long timeo;
+ int ret;
- _enter("%d", call->debug_id);
+ DEFINE_WAIT(wait);
-next:
- local_bh_disable();
- skb = skb_dequeue(&call->knlrecv_queue);
- local_bh_enable();
- if (!skb) {
- if (test_bit(RXRPC_CALL_RX_NO_MORE, &call->flags))
- return 1;
- _leave(" = -EAGAIN [empty]");
- return -EAGAIN;
+ trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0);
+
+ if (flags & (MSG_OOB | MSG_TRUNC))
+ return -EOPNOTSUPP;
+
+ timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT);
+
+try_again:
+ lock_sock(&rx->sk);
+
+ /* Return immediately if a client socket has no outstanding calls */
+ if (RB_EMPTY_ROOT(&rx->calls) &&
+ list_empty(&rx->recvmsg_q) &&
+ rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
+ release_sock(&rx->sk);
+ return -ENODATA;
}
- sp = rxrpc_skb(skb);
- _debug("dequeued %p %u/%zu", skb, sp->offset, size);
-
- switch (skb->mark) {
- case RXRPC_SKB_MARK_DATA:
- remain = size - *_offset;
- if (remain > 0) {
- copy = skb->len - sp->offset;
- if (copy > remain)
- copy = remain;
- ret = skb_copy_datagram_iter(skb, sp->offset, iter,
- copy);
- if (ret < 0)
- goto requeue_and_leave;
+ if (list_empty(&rx->recvmsg_q)) {
+ ret = -EWOULDBLOCK;
+ if (timeo == 0) {
+ call = NULL;
+ goto error_no_call;
+ }
- /* handle piecemeal consumption of data packets */
- sp->offset += copy;
- *_offset += copy;
+ release_sock(&rx->sk);
+
+ /* Wait for something to happen */
+ prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
+ TASK_INTERRUPTIBLE);
+ ret = sock_error(&rx->sk);
+ if (ret)
+ goto wait_error;
+
+ if (list_empty(&rx->recvmsg_q)) {
+ if (signal_pending(current))
+ goto wait_interrupted;
+ trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait,
+ 0, 0, 0, 0);
+ timeo = schedule_timeout(timeo);
}
+ finish_wait(sk_sleep(&rx->sk), &wait);
+ goto try_again;
+ }
- if (sp->offset < skb->len)
- goto partially_used_skb;
+ /* Find the next call and dequeue it if we're not just peeking. If we
+ * do dequeue it, that comes with a ref that we will need to release.
+ */
+ write_lock_bh(&rx->recvmsg_lock);
+ l = rx->recvmsg_q.next;
+ call = list_entry(l, struct rxrpc_call, recvmsg_link);
+ if (!(flags & MSG_PEEK))
+ list_del_init(&call->recvmsg_link);
+ else
+ rxrpc_get_call(call, rxrpc_call_got);
+ write_unlock_bh(&rx->recvmsg_lock);
+
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0);
+
+ if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
+ BUG();
- /* We consumed the whole packet */
- ASSERTCMP(sp->offset, ==, skb->len);
- if (sp->hdr.flags & RXRPC_LAST_PACKET)
- set_bit(RXRPC_CALL_RX_NO_MORE, &call->flags);
- rxrpc_kernel_data_consumed(call, skb);
- rxrpc_free_skb(skb);
- goto next;
+ if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ if (flags & MSG_CMSG_COMPAT) {
+ unsigned int id32 = call->user_call_ID;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ sizeof(unsigned int), &id32);
+ } else {
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ sizeof(unsigned long),
+ &call->user_call_ID);
+ }
+ if (ret < 0)
+ goto error;
+ }
+
+ if (msg->msg_name) {
+ size_t len = sizeof(call->conn->params.peer->srx);
+ memcpy(msg->msg_name, &call->conn->params.peer->srx, len);
+ msg->msg_namelen = len;
+ }
+
+ switch (call->state) {
+ case RXRPC_CALL_SERVER_ACCEPTING:
+ ret = rxrpc_recvmsg_new_call(rx, call, msg, flags);
+ break;
+ case RXRPC_CALL_CLIENT_RECV_REPLY:
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ case RXRPC_CALL_SERVER_ACK_REQUEST:
+ ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len,
+ flags, &copied);
+ if (ret == -EAGAIN)
+ ret = 0;
+
+ if (after(call->rx_top, call->rx_hard_ack) &&
+ call->rxtx_buffer[(call->rx_hard_ack + 1) & RXRPC_RXTX_BUFF_MASK])
+ rxrpc_notify_socket(call);
+ break;
default:
- rxrpc_free_skb(skb);
- goto next;
+ ret = 0;
+ break;
+ }
+
+ if (ret < 0)
+ goto error;
+
+ if (call->state == RXRPC_CALL_COMPLETE) {
+ ret = rxrpc_recvmsg_term(call, msg);
+ if (ret < 0)
+ goto error;
+ if (!(flags & MSG_PEEK))
+ rxrpc_release_call(rx, call);
+ msg->msg_flags |= MSG_EOR;
+ ret = 1;
}
-partially_used_skb:
- ASSERTCMP(*_offset, ==, size);
- ret = 0;
-requeue_and_leave:
- skb_queue_head(&call->knlrecv_queue, skb);
+ if (ret == 0)
+ msg->msg_flags |= MSG_MORE;
+ else
+ msg->msg_flags &= ~MSG_MORE;
+ ret = copied;
+
+error:
+ rxrpc_put_call(call, rxrpc_call_put);
+error_no_call:
+ release_sock(&rx->sk);
+ trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
return ret;
+
+wait_interrupted:
+ ret = sock_intr_errno(timeo);
+wait_error:
+ finish_wait(sk_sleep(&rx->sk), &wait);
+ call = NULL;
+ goto error_no_call;
}
/**
@@ -474,8 +592,9 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
struct kvec iov;
int ret;
- _enter("{%d,%s},%zu,%d",
- call->debug_id, rxrpc_call_states[call->state], size, want_more);
+ _enter("{%d,%s},%zu/%zu,%d",
+ call->debug_id, rxrpc_call_states[call->state],
+ *_offset, size, want_more);
ASSERTCMP(*_offset, <=, size);
ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING);
@@ -490,7 +609,8 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
case RXRPC_CALL_CLIENT_RECV_REPLY:
case RXRPC_CALL_SERVER_RECV_REQUEST:
case RXRPC_CALL_SERVER_ACK_REQUEST:
- ret = temp_deliver_data(sock, call, &iter, size, _offset);
+ ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0,
+ _offset);
if (ret < 0)
goto out;
@@ -515,7 +635,6 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
goto call_complete;
default:
- *_offset = 0;
ret = -EINPROGRESS;
goto out;
}
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 89f475febfd7..88d080a1a3de 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -80,12 +80,10 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
case RXRPC_SECURITY_AUTH:
conn->size_align = 8;
conn->security_size = sizeof(struct rxkad_level1_hdr);
- conn->header_size += sizeof(struct rxkad_level1_hdr);
break;
case RXRPC_SECURITY_ENCRYPT:
conn->size_align = 8;
conn->security_size = sizeof(struct rxkad_level2_hdr);
- conn->header_size += sizeof(struct rxkad_level2_hdr);
break;
default:
ret = -EKEYREJECTED;
@@ -161,7 +159,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
_enter("");
- check = sp->hdr.seq ^ sp->hdr.callNumber;
+ check = sp->hdr.seq ^ call->call_id;
data_size |= (u32)check << 16;
hdr.data_size = htonl(data_size);
@@ -205,7 +203,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
_enter("");
- check = sp->hdr.seq ^ sp->hdr.callNumber;
+ check = sp->hdr.seq ^ call->call_id;
rxkhdr.data_size = htonl(data_size | (u32)check << 16);
rxkhdr.checksum = 0;
@@ -277,7 +275,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
/* calculate the security checksum */
x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
x |= sp->hdr.seq & 0x3fffffff;
- call->crypto_buf[0] = htonl(sp->hdr.callNumber);
+ call->crypto_buf[0] = htonl(call->call_id);
call->crypto_buf[1] = htonl(x);
sg_init_one(&sg, call->crypto_buf, 8);
@@ -316,12 +314,11 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
/*
* decrypt partial encryption on a packet (level 1 security)
*/
-static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 *_abort_code)
+static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int offset, unsigned int len,
+ rxrpc_seq_t seq)
{
struct rxkad_level1_hdr sechdr;
- struct rxrpc_skb_priv *sp;
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[16];
@@ -332,15 +329,20 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
_enter("");
- sp = rxrpc_skb(skb);
+ if (len < 8) {
+ rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO);
+ goto protocol_error;
+ }
- /* we want to decrypt the skbuff in-place */
+ /* Decrypt the skbuff in-place. TODO: We really want to decrypt
+ * directly into the target buffer.
+ */
nsg = skb_cow_data(skb, 0, &trailer);
if (nsg < 0 || nsg > 16)
goto nomem;
sg_init_table(sg, nsg);
- skb_to_sgvec(skb, sg, 0, 8);
+ skb_to_sgvec(skb, sg, offset, 8);
/* start the decryption afresh */
memset(&iv, 0, sizeof(iv));
@@ -351,35 +353,35 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
- /* remove the decrypted packet length */
- if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
- goto datalen_error;
- if (!skb_pull(skb, sizeof(sechdr)))
- BUG();
+ /* Extract the decrypted packet length */
+ if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) {
+ rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO);
+ goto protocol_error;
+ }
+ offset += sizeof(sechdr);
+ len -= sizeof(sechdr);
buf = ntohl(sechdr.data_size);
data_size = buf & 0xffff;
check = buf >> 16;
- check ^= sp->hdr.seq ^ sp->hdr.callNumber;
+ check ^= seq ^ call->call_id;
check &= 0xffff;
if (check != 0) {
- *_abort_code = RXKADSEALEDINCON;
+ rxrpc_abort_call("V1C", call, seq, RXKADSEALEDINCON, EPROTO);
goto protocol_error;
}
- /* shorten the packet to remove the padding */
- if (data_size > skb->len)
- goto datalen_error;
- else if (data_size < skb->len)
- skb->len = data_size;
+ if (data_size > len) {
+ rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO);
+ goto protocol_error;
+ }
_leave(" = 0 [dlen=%x]", data_size);
return 0;
-datalen_error:
- *_abort_code = RXKADDATALEN;
protocol_error:
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
_leave(" = -EPROTO");
return -EPROTO;
@@ -391,13 +393,12 @@ nomem:
/*
* wholly decrypt a packet (level 2 security)
*/
-static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 *_abort_code)
+static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int offset, unsigned int len,
+ rxrpc_seq_t seq)
{
const struct rxrpc_key_token *token;
struct rxkad_level2_hdr sechdr;
- struct rxrpc_skb_priv *sp;
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist _sg[4], *sg;
@@ -408,9 +409,14 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
_enter(",{%d}", skb->len);
- sp = rxrpc_skb(skb);
+ if (len < 8) {
+ rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO);
+ goto protocol_error;
+ }
- /* we want to decrypt the skbuff in-place */
+ /* Decrypt the skbuff in-place. TODO: We really want to decrypt
+ * directly into the target buffer.
+ */
nsg = skb_cow_data(skb, 0, &trailer);
if (nsg < 0)
goto nomem;
@@ -423,7 +429,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
}
sg_init_table(sg, nsg);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ skb_to_sgvec(skb, sg, offset, len);
/* decrypt from the session key */
token = call->conn->params.key->payload.data[0];
@@ -431,41 +437,41 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
skcipher_request_set_tfm(req, call->conn->cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x);
+ skcipher_request_set_crypt(req, sg, sg, len, iv.x);
crypto_skcipher_decrypt(req);
skcipher_request_zero(req);
if (sg != _sg)
kfree(sg);
- /* remove the decrypted packet length */
- if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
- goto datalen_error;
- if (!skb_pull(skb, sizeof(sechdr)))
- BUG();
+ /* Extract the decrypted packet length */
+ if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) {
+ rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO);
+ goto protocol_error;
+ }
+ offset += sizeof(sechdr);
+ len -= sizeof(sechdr);
buf = ntohl(sechdr.data_size);
data_size = buf & 0xffff;
check = buf >> 16;
- check ^= sp->hdr.seq ^ sp->hdr.callNumber;
+ check ^= seq ^ call->call_id;
check &= 0xffff;
if (check != 0) {
- *_abort_code = RXKADSEALEDINCON;
+ rxrpc_abort_call("V2C", call, seq, RXKADSEALEDINCON, EPROTO);
goto protocol_error;
}
- /* shorten the packet to remove the padding */
- if (data_size > skb->len)
- goto datalen_error;
- else if (data_size < skb->len)
- skb->len = data_size;
+ if (data_size > len) {
+ rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO);
+ goto protocol_error;
+ }
_leave(" = 0 [dlen=%x]", data_size);
return 0;
-datalen_error:
- *_abort_code = RXKADDATALEN;
protocol_error:
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
_leave(" = -EPROTO");
return -EPROTO;
@@ -475,40 +481,31 @@ nomem:
}
/*
- * verify the security on a received packet
+ * Verify the security on a received packet or subpacket (if part of a
+ * jumbo packet).
*/
-static int rxkad_verify_packet(struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 *_abort_code)
+static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int offset, unsigned int len,
+ rxrpc_seq_t seq, u16 expected_cksum)
{
SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
- struct rxrpc_skb_priv *sp;
struct rxrpc_crypt iv;
struct scatterlist sg;
u16 cksum;
u32 x, y;
- int ret;
-
- sp = rxrpc_skb(skb);
_enter("{%d{%x}},{#%u}",
- call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq);
+ call->debug_id, key_serial(call->conn->params.key), seq);
if (!call->conn->cipher)
return 0;
- if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) {
- *_abort_code = RXKADINCONSISTENCY;
- _leave(" = -EPROTO [not rxkad]");
- return -EPROTO;
- }
-
/* continue encrypting from where we left off */
memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
/* validate the security checksum */
x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT);
- x |= sp->hdr.seq & 0x3fffffff;
+ x |= seq & 0x3fffffff;
call->crypto_buf[0] = htonl(call->call_id);
call->crypto_buf[1] = htonl(x);
@@ -524,29 +521,69 @@ static int rxkad_verify_packet(struct rxrpc_call *call,
if (cksum == 0)
cksum = 1; /* zero checksums are not permitted */
- if (sp->hdr.cksum != cksum) {
- *_abort_code = RXKADSEALEDINCON;
+ if (cksum != expected_cksum) {
+ rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO);
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
_leave(" = -EPROTO [csum failed]");
return -EPROTO;
}
switch (call->conn->params.security_level) {
case RXRPC_SECURITY_PLAIN:
- ret = 0;
- break;
+ return 0;
case RXRPC_SECURITY_AUTH:
- ret = rxkad_verify_packet_auth(call, skb, _abort_code);
- break;
+ return rxkad_verify_packet_1(call, skb, offset, len, seq);
case RXRPC_SECURITY_ENCRYPT:
- ret = rxkad_verify_packet_encrypt(call, skb, _abort_code);
- break;
+ return rxkad_verify_packet_2(call, skb, offset, len, seq);
default:
- ret = -ENOANO;
- break;
+ return -ENOANO;
}
+}
- _leave(" = %d", ret);
- return ret;
+/*
+ * Locate the data contained in a packet that was partially encrypted.
+ */
+static void rxkad_locate_data_1(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len)
+{
+ struct rxkad_level1_hdr sechdr;
+
+ if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0)
+ BUG();
+ *_offset += sizeof(sechdr);
+ *_len = ntohl(sechdr.data_size) & 0xffff;
+}
+
+/*
+ * Locate the data contained in a packet that was completely encrypted.
+ */
+static void rxkad_locate_data_2(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len)
+{
+ struct rxkad_level2_hdr sechdr;
+
+ if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0)
+ BUG();
+ *_offset += sizeof(sechdr);
+ *_len = ntohl(sechdr.data_size) & 0xffff;
+}
+
+/*
+ * Locate the data contained in an already decrypted packet.
+ */
+static void rxkad_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int *_offset, unsigned int *_len)
+{
+ switch (call->conn->params.security_level) {
+ case RXRPC_SECURITY_AUTH:
+ rxkad_locate_data_1(call, skb, _offset, _len);
+ return;
+ case RXRPC_SECURITY_ENCRYPT:
+ rxkad_locate_data_2(call, skb, _offset, _len);
+ return;
+ default:
+ return;
+ }
}
/*
@@ -716,7 +753,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
struct rxkad_challenge challenge;
struct rxkad_response resp
__attribute__((aligned(8))); /* must be aligned for crypto */
- struct rxrpc_skb_priv *sp;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
u32 version, nonce, min_level, abort_code;
int ret;
@@ -734,8 +771,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
}
abort_code = RXKADPACKETSHORT;
- sp = rxrpc_skb(skb);
- if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0)
+ if (skb_copy_bits(skb, sp->offset, &challenge, sizeof(challenge)) < 0)
goto protocol_error;
version = ntohl(challenge.version);
@@ -981,7 +1017,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
{
struct rxkad_response response
__attribute__((aligned(8))); /* must be aligned for crypto */
- struct rxrpc_skb_priv *sp;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_crypt session_key;
time_t expiry;
void *ticket;
@@ -992,7 +1028,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
_enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
abort_code = RXKADPACKETSHORT;
- if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0)
+ if (skb_copy_bits(skb, sp->offset, &response, sizeof(response)) < 0)
goto protocol_error;
if (!pskb_pull(skb, sizeof(response)))
BUG();
@@ -1000,7 +1036,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
version = ntohl(response.version);
ticket_len = ntohl(response.ticket_len);
kvno = ntohl(response.kvno);
- sp = rxrpc_skb(skb);
_proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
sp->hdr.serial, version, kvno, ticket_len);
@@ -1022,7 +1057,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
return -ENOMEM;
abort_code = RXKADPACKETSHORT;
- if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0)
+ if (skb_copy_bits(skb, sp->offset, ticket, ticket_len) < 0)
goto protocol_error_free;
ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key,
@@ -1147,6 +1182,7 @@ const struct rxrpc_security rxkad = {
.prime_packet_security = rxkad_prime_packet_security,
.secure_packet = rxkad_secure_packet,
.verify_packet = rxkad_verify_packet,
+ .locate_data = rxkad_locate_data,
.issue_challenge = rxkad_issue_challenge,
.respond_to_challenge = rxkad_respond_to_challenge,
.verify_response = rxkad_verify_response,
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index 814d285ff802..82d8134e9287 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -130,20 +130,20 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
}
/* find the service */
- read_lock_bh(&local->services_lock);
- list_for_each_entry(rx, &local->services, listen_link) {
+ read_lock(&local->services_lock);
+ hlist_for_each_entry(rx, &local->services, listen_link) {
if (rx->srx.srx_service == conn->params.service_id)
goto found_service;
}
/* the service appears to have died */
- read_unlock_bh(&local->services_lock);
+ read_unlock(&local->services_lock);
_leave(" = -ENOENT");
return -ENOENT;
found_service:
if (!rx->securities) {
- read_unlock_bh(&local->services_lock);
+ read_unlock(&local->services_lock);
_leave(" = -ENOKEY");
return -ENOKEY;
}
@@ -152,13 +152,13 @@ found_service:
kref = keyring_search(make_key_ref(rx->securities, 1UL),
&key_type_rxrpc_s, kdesc);
if (IS_ERR(kref)) {
- read_unlock_bh(&local->services_lock);
+ read_unlock(&local->services_lock);
_leave(" = %ld [search]", PTR_ERR(kref));
return PTR_ERR(kref);
}
key = key_ref_to_ptr(kref);
- read_unlock_bh(&local->services_lock);
+ read_unlock(&local->services_lock);
conn->server_key = key;
conn->security = sec;
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 7376794a0308..1f8040d82395 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -15,7 +15,6 @@
#include <linux/gfp.h>
#include <linux/skbuff.h>
#include <linux/export.h>
-#include <linux/circ_buf.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include "ar-internal.h"
@@ -38,24 +37,28 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
DECLARE_WAITQUEUE(myself, current);
int ret;
- _enter(",{%d},%ld",
- CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
- call->acks_winsz),
- *timeo);
+ _enter(",{%u,%u,%u}",
+ call->tx_hard_ack, call->tx_top, call->tx_winsize);
add_wait_queue(&call->waitq, &myself);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
ret = 0;
- if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
- call->acks_winsz) > 0)
+ if (call->tx_top - call->tx_hard_ack <
+ min_t(unsigned int, call->tx_winsize,
+ call->cong_cwnd + call->cong_extra))
break;
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ ret = -call->error;
+ break;
+ }
if (signal_pending(current)) {
ret = sock_intr_errno(*timeo);
break;
}
+ trace_rxrpc_transmit(call, rxrpc_transmit_wait);
release_sock(&rx->sk);
*timeo = schedule_timeout(*timeo);
lock_sock(&rx->sk);
@@ -68,36 +71,55 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
}
/*
- * attempt to schedule an instant Tx resend
+ * Schedule an instant Tx resend.
*/
-static inline void rxrpc_instant_resend(struct rxrpc_call *call)
+static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix)
{
- read_lock_bh(&call->state_lock);
- if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
+ spin_lock_bh(&call->lock);
+
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS;
+ if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
rxrpc_queue_call(call);
}
- read_unlock_bh(&call->state_lock);
+
+ spin_unlock_bh(&call->lock);
}
/*
- * queue a packet for transmission, set the resend timer and attempt
- * to send the packet immediately
+ * Queue a DATA packet for transmission, set the resend timeout and send the
+ * packet immediately
*/
static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
bool last)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- int ret;
+ rxrpc_seq_t seq = sp->hdr.seq;
+ int ret, ix;
+ u8 annotation = RXRPC_TX_ANNO_UNACK;
- _net("queue skb %p [%d]", skb, call->acks_head);
+ _net("queue skb %p [%d]", skb, seq);
- ASSERT(call->acks_window != NULL);
- call->acks_window[call->acks_head] = (unsigned long) skb;
+ ASSERTCMP(seq, ==, call->tx_top + 1);
+
+ if (last)
+ annotation |= RXRPC_TX_ANNO_LAST;
+
+ /* We have to set the timestamp before queueing as the retransmit
+ * algorithm can see the packet as soon as we queue it.
+ */
+ skb->tstamp = ktime_get_real();
+
+ ix = seq & RXRPC_RXTX_BUFF_MASK;
+ rxrpc_get_skb(skb, rxrpc_skb_tx_got);
+ call->rxtx_annotations[ix] = annotation;
smp_wmb();
- call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1);
+ call->rxtx_buffer[ix] = skb;
+ call->tx_top = seq;
+ if (last)
+ trace_rxrpc_transmit(call, rxrpc_transmit_queue_last);
+ else
+ trace_rxrpc_transmit(call, rxrpc_transmit_queue);
if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
_debug("________awaiting reply/ACK__________");
@@ -119,60 +141,26 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
write_unlock_bh(&call->state_lock);
}
- _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
-
- sp->need_resend = false;
- sp->resend_at = jiffies + rxrpc_resend_timeout;
- if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) {
- _debug("run timer");
- call->resend_timer.expires = sp->resend_at;
- add_timer(&call->resend_timer);
- }
-
- /* attempt to cancel the rx-ACK timer, deferring reply transmission if
- * we're ACK'ing the request phase of an incoming call */
- ret = -EAGAIN;
- if (try_to_del_timer_sync(&call->ack_timer) >= 0) {
- /* the packet may be freed by rxrpc_process_call() before this
- * returns */
- if (rxrpc_is_client_call(call))
- rxrpc_expose_client_call(call);
- ret = rxrpc_send_data_packet(call->conn, skb);
- _net("sent skb %p", skb);
- } else {
- _debug("failed to delete ACK timer");
- }
+ if (seq == 1 && rxrpc_is_client_call(call))
+ rxrpc_expose_client_call(call);
+ ret = rxrpc_send_data_packet(call, skb);
if (ret < 0) {
_debug("need instant resend %d", ret);
- sp->need_resend = true;
- rxrpc_instant_resend(call);
- }
+ rxrpc_instant_resend(call, ix);
+ } else {
+ unsigned long resend_at;
- _leave("");
-}
+ resend_at = jiffies + msecs_to_jiffies(rxrpc_resend_timeout);
-/*
- * Convert a host-endian header into a network-endian header.
- */
-static void rxrpc_insert_header(struct sk_buff *skb)
-{
- struct rxrpc_wire_header whdr;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ if (time_before(resend_at, call->resend_at)) {
+ call->resend_at = resend_at;
+ rxrpc_set_timer(call, rxrpc_timer_set_for_send);
+ }
+ }
- whdr.epoch = htonl(sp->hdr.epoch);
- whdr.cid = htonl(sp->hdr.cid);
- whdr.callNumber = htonl(sp->hdr.callNumber);
- whdr.seq = htonl(sp->hdr.seq);
- whdr.serial = htonl(sp->hdr.serial);
- whdr.type = sp->hdr.type;
- whdr.flags = sp->hdr.flags;
- whdr.userStatus = sp->hdr.userStatus;
- whdr.securityIndex = sp->hdr.securityIndex;
- whdr._rsvd = htons(sp->hdr._rsvd);
- whdr.serviceId = htons(sp->hdr.serviceId);
-
- memcpy(skb->head, &whdr, sizeof(whdr));
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ _leave("");
}
/*
@@ -203,18 +191,22 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
skb = call->tx_pending;
call->tx_pending = NULL;
- rxrpc_see_skb(skb);
+ rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
copied = 0;
do {
+ /* Check to see if there's a ping ACK to reply to. */
+ if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE)
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+
if (!skb) {
size_t size, chunk, max, space;
_debug("alloc");
- if (CIRC_SPACE(call->acks_head,
- ACCESS_ONCE(call->acks_tail),
- call->acks_winsz) <= 0) {
+ if (call->tx_top - call->tx_hard_ack >=
+ min_t(unsigned int, call->tx_winsize,
+ call->cong_cwnd + call->cong_extra)) {
ret = -EAGAIN;
if (msg->msg_flags & MSG_DONTWAIT)
goto maybe_error;
@@ -224,7 +216,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
goto maybe_error;
}
- max = call->conn->params.peer->maxdata;
+ max = RXRPC_JUMBO_DATALEN;
max -= call->conn->security_size;
max &= ~(call->conn->size_align - 1UL);
@@ -235,7 +227,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
space = chunk + call->conn->size_align;
space &= ~(call->conn->size_align - 1UL);
- size = space + call->conn->header_size;
+ size = space + call->conn->security_size;
_debug("SIZE: %zu/%zu/%zu", chunk, space, size);
@@ -245,15 +237,15 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
if (!skb)
goto maybe_error;
- rxrpc_new_skb(skb);
+ rxrpc_new_skb(skb, rxrpc_skb_tx_new);
_debug("ALLOC SEND %p", skb);
ASSERTCMP(skb->mark, ==, 0);
- _debug("HS: %u", call->conn->header_size);
- skb_reserve(skb, call->conn->header_size);
- skb->len += call->conn->header_size;
+ _debug("HS: %u", call->conn->security_size);
+ skb_reserve(skb, call->conn->security_size);
+ skb->len += call->conn->security_size;
sp = rxrpc_skb(skb);
sp->remain = chunk;
@@ -313,36 +305,23 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
memset(skb_put(skb, pad), 0, pad);
}
- seq = atomic_inc_return(&call->sequence);
+ seq = call->tx_top + 1;
- sp->hdr.epoch = conn->proto.epoch;
- sp->hdr.cid = call->cid;
- sp->hdr.callNumber = call->call_id;
sp->hdr.seq = seq;
- sp->hdr.serial = atomic_inc_return(&conn->serial);
- sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
- sp->hdr.userStatus = 0;
- sp->hdr.securityIndex = conn->security_ix;
sp->hdr._rsvd = 0;
- sp->hdr.serviceId = call->service_id;
+ sp->hdr.flags = conn->out_clientflag;
- sp->hdr.flags = conn->out_clientflag;
if (msg_data_left(msg) == 0 && !more)
sp->hdr.flags |= RXRPC_LAST_PACKET;
- else if (CIRC_SPACE(call->acks_head,
- ACCESS_ONCE(call->acks_tail),
- call->acks_winsz) > 1)
+ else if (call->tx_top - call->tx_hard_ack <
+ call->tx_winsize)
sp->hdr.flags |= RXRPC_MORE_PACKETS;
- if (more && seq & 1)
- sp->hdr.flags |= RXRPC_REQUEST_ACK;
ret = conn->security->secure_packet(
- call, skb, skb->mark,
- skb->head + sizeof(struct rxrpc_wire_header));
+ call, skb, skb->mark, skb->head);
if (ret < 0)
goto out;
- rxrpc_insert_header(skb);
rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
skb = NULL;
}
@@ -356,9 +335,9 @@ out:
return ret;
call_terminated:
- rxrpc_free_skb(skb);
+ rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
_leave(" = %d", -call->error);
- return ret;
+ return -call->error;
maybe_error:
if (copied)
@@ -452,28 +431,6 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg,
}
/*
- * abort a call, sending an ABORT packet to the peer
- */
-static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code)
-{
- if (call->state >= RXRPC_CALL_COMPLETE)
- return;
-
- write_lock_bh(&call->state_lock);
-
- if (__rxrpc_abort_call(call, abort_code, ECONNABORTED)) {
- del_timer_sync(&call->resend_timer);
- del_timer_sync(&call->ack_timer);
- clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
- clear_bit(RXRPC_CALL_EV_ACK, &call->events);
- clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- rxrpc_queue_call(call);
- }
-
- write_unlock_bh(&call->state_lock);
-}
-
-/*
* Create a new client call for sendmsg().
*/
static struct rxrpc_call *
@@ -534,7 +491,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
call = rxrpc_accept_call(rx, user_call_ID, NULL);
if (IS_ERR(call))
return PTR_ERR(call);
- rxrpc_put_call(call);
+ rxrpc_put_call(call, rxrpc_call_put);
return 0;
}
@@ -548,7 +505,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
return PTR_ERR(call);
}
- rxrpc_see_call(call);
_debug("CALL %d USR %lx ST %d on CONN %p",
call->debug_id, call->user_call_ID, call->state, call->conn);
@@ -556,8 +512,10 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
/* it's too late for this call */
ret = -ESHUTDOWN;
} else if (cmd == RXRPC_CMD_SEND_ABORT) {
- rxrpc_send_abort(call, abort_code);
ret = 0;
+ if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED))
+ ret = rxrpc_send_call_packet(call,
+ RXRPC_PACKET_TYPE_ABORT);
} else if (cmd != RXRPC_CMD_SEND_DATA) {
ret = -EINVAL;
} else if (rxrpc_is_client_call(call) &&
@@ -573,7 +531,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
ret = rxrpc_send_data(rx, call, msg, len);
}
- rxrpc_put_call(call);
+ rxrpc_put_call(call, rxrpc_call_put);
_leave(" = %d", ret);
return ret;
}
@@ -626,20 +584,20 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data);
* @sock: The socket the call is on
* @call: The call to be aborted
* @abort_code: The abort code to stick into the ABORT packet
+ * @error: Local error value
+ * @why: 3-char string indicating why.
*
* Allow a kernel service to abort a call, if it's still in an abortable state.
*/
void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call,
- u32 abort_code)
+ u32 abort_code, int error, const char *why)
{
- _enter("{%d},%d", call->debug_id, abort_code);
+ _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why);
lock_sock(sock->sk);
- _debug("CALL %d USR %lx ST %d on CONN %p",
- call->debug_id, call->user_call_ID, call->state, call->conn);
-
- rxrpc_send_abort(call, abort_code);
+ if (rxrpc_abort_call(why, call, 0, abort_code, error))
+ rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
release_sock(sock->sk);
_leave("");
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index 9752f8b1fdd0..5154cbf7e540 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -18,198 +18,77 @@
#include <net/af_rxrpc.h>
#include "ar-internal.h"
-/*
- * set up for the ACK at the end of the receive phase when we discard the final
- * receive phase data packet
- * - called with softirqs disabled
- */
-static void rxrpc_request_final_ACK(struct rxrpc_call *call)
-{
- /* the call may be aborted before we have a chance to ACK it */
- write_lock(&call->state_lock);
-
- switch (call->state) {
- case RXRPC_CALL_CLIENT_RECV_REPLY:
- call->state = RXRPC_CALL_CLIENT_FINAL_ACK;
- _debug("request final ACK");
-
- /* get an extra ref on the call for the final-ACK generator to
- * release */
- rxrpc_get_call(call);
- set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
- if (try_to_del_timer_sync(&call->ack_timer) >= 0)
- rxrpc_queue_call(call);
- break;
-
- case RXRPC_CALL_SERVER_RECV_REQUEST:
- call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
- default:
- break;
- }
-
- write_unlock(&call->state_lock);
-}
-
-/*
- * drop the bottom ACK off of the call ACK window and advance the window
- */
-static void rxrpc_hard_ACK_data(struct rxrpc_call *call, struct sk_buff *skb)
-{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- int loop;
- u32 seq;
-
- spin_lock_bh(&call->lock);
-
- _debug("hard ACK #%u", sp->hdr.seq);
-
- for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
- call->ackr_window[loop] >>= 1;
- call->ackr_window[loop] |=
- call->ackr_window[loop + 1] << (BITS_PER_LONG - 1);
- }
-
- seq = sp->hdr.seq;
- ASSERTCMP(seq, ==, call->rx_data_eaten + 1);
- call->rx_data_eaten = seq;
-
- if (call->ackr_win_top < UINT_MAX)
- call->ackr_win_top++;
-
- ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
- call->rx_data_post, >=, call->rx_data_recv);
- ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
- call->rx_data_recv, >=, call->rx_data_eaten);
-
- if (sp->hdr.flags & RXRPC_LAST_PACKET) {
- rxrpc_request_final_ACK(call);
- } else if (atomic_dec_and_test(&call->ackr_not_idle) &&
- test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) {
- /* We previously soft-ACK'd some received packets that have now
- * been consumed, so send a hard-ACK if no more packets are
- * immediately forthcoming to allow the transmitter to free up
- * its Tx bufferage.
- */
- _debug("send Rx idle ACK");
- __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE,
- skb->priority, sp->hdr.serial, false);
- }
-
- spin_unlock_bh(&call->lock);
-}
-
-/**
- * rxrpc_kernel_data_consumed - Record consumption of data message
- * @call: The call to which the message pertains.
- * @skb: Message holding data
- *
- * Record the consumption of a data message and generate an ACK if appropriate.
- * The call state is shifted if this was the final packet. The caller must be
- * in process context with no spinlocks held.
- *
- * TODO: Actually generate the ACK here rather than punting this to the
- * workqueue.
- */
-void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb)
-{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
- _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq);
-
- ASSERTCMP(sp->call, ==, call);
- ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA);
-
- /* TODO: Fix the sequence number tracking */
- ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
- ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
- ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
-
- call->rx_data_recv = sp->hdr.seq;
- rxrpc_hard_ACK_data(call, skb);
-}
+#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs)
/*
- * Destroy a packet that has an RxRPC control buffer
+ * Note the allocation or reception of a socket buffer.
*/
-void rxrpc_packet_destructor(struct sk_buff *skb)
-{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_call *call = sp->call;
-
- _enter("%p{%p}", skb, call);
-
- if (call) {
- rxrpc_put_call_for_skb(call, skb);
- sp->call = NULL;
- }
-
- if (skb->sk)
- sock_rfree(skb);
- _leave("");
-}
-
-/**
- * rxrpc_kernel_free_skb - Free an RxRPC socket buffer
- * @skb: The socket buffer to be freed
- *
- * Let RxRPC free its own socket buffer, permitting it to maintain debug
- * accounting.
- */
-void rxrpc_kernel_free_skb(struct sk_buff *skb)
-{
- rxrpc_free_skb(skb);
-}
-EXPORT_SYMBOL(rxrpc_kernel_free_skb);
-
-/*
- * Note the existence of a new-to-us socket buffer (allocated or dequeued).
- */
-void rxrpc_new_skb(struct sk_buff *skb)
+void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
- int n = atomic_inc_return(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 0, atomic_read(&skb->users), n, here);
+ int n = atomic_inc_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
}
/*
* Note the re-emergence of a socket buffer from a queue or buffer.
*/
-void rxrpc_see_skb(struct sk_buff *skb)
+void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
if (skb) {
- int n = atomic_read(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 1, atomic_read(&skb->users), n, here);
+ int n = atomic_read(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
}
}
/*
* Note the addition of a ref on a socket buffer.
*/
-void rxrpc_get_skb(struct sk_buff *skb)
+void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
- int n = atomic_inc_return(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 2, atomic_read(&skb->users), n, here);
+ int n = atomic_inc_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
skb_get(skb);
}
/*
* Note the destruction of a socket buffer.
*/
-void rxrpc_free_skb(struct sk_buff *skb)
+void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
if (skb) {
int n;
CHECK_SLAB_OKAY(&skb->users);
- n = atomic_dec_return(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 3, atomic_read(&skb->users), n, here);
+ n = atomic_dec_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
kfree_skb(skb);
}
}
/*
+ * Note the injected loss of a socket buffer.
+ */
+void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
+{
+ const void *here = __builtin_return_address(0);
+ if (skb) {
+ int n;
+ CHECK_SLAB_OKAY(&skb->users);
+ if (op == rxrpc_skb_tx_lost) {
+ n = atomic_read(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
+ } else {
+ n = atomic_dec_return(select_skb_count(op));
+ trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here);
+ kfree_skb(skb);
+ }
+ }
+}
+
+/*
* Clear a queue of socket buffers.
*/
void rxrpc_purge_queue(struct sk_buff_head *list)
@@ -217,8 +96,9 @@ void rxrpc_purge_queue(struct sk_buff_head *list)
const void *here = __builtin_return_address(0);
struct sk_buff *skb;
while ((skb = skb_dequeue((list))) != NULL) {
- int n = atomic_dec_return(&rxrpc_n_skbs);
- trace_rxrpc_skb(skb, 4, atomic_read(&skb->users), n, here);
+ int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged));
+ trace_rxrpc_skb(skb, rxrpc_skb_rx_purged,
+ atomic_read(&skb->users), n, here);
kfree_skb(skb);
}
}
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index dc380af8a81e..13d1df03ebac 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -20,7 +20,7 @@ static const unsigned int one = 1;
static const unsigned int four = 4;
static const unsigned int thirtytwo = 32;
static const unsigned int n_65535 = 65535;
-static const unsigned int n_max_acks = RXRPC_MAXACKS;
+static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
/*
* RxRPC operating parameters.
@@ -59,7 +59,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.data = &rxrpc_resend_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
+ .proc_handler = proc_dointvec,
.extra1 = (void *)&one,
},
{
@@ -88,14 +88,6 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.proc_handler = proc_dointvec_jiffies,
.extra1 = (void *)&one,
},
- {
- .procname = "dead_call_expiry",
- .data = &rxrpc_dead_call_expiry,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- .extra1 = (void *)&one,
- },
/* Non-time values */
{
diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c
index b88914d53ca5..ff7af71c4b49 100644
--- a/net/rxrpc/utils.c
+++ b/net/rxrpc/utils.c
@@ -30,6 +30,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb)
srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
return 0;
+#ifdef CONFIG_AF_RXRPC_IPV6
case ETH_P_IPV6:
srx->transport_type = SOCK_DGRAM;
srx->transport_len = sizeof(srx->transport.sin6);
@@ -37,6 +38,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb)
srx->transport.sin6.sin6_port = udp_hdr(skb)->source;
srx->transport.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
return 0;
+#endif
default:
pr_warn_ratelimited("AF_RXRPC: Unknown eth protocol %u\n",
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ccf931b3b94c..87956a768d1b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -749,6 +749,17 @@ config NET_ACT_CONNMARK
To compile this code as a module, choose M here: the
module will be called act_connmark.
+config NET_ACT_SKBMOD
+ tristate "skb data modification action"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to allow modification of skb data
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_skbmod.
+
config NET_ACT_IFE
tristate "Inter-FE action based on IETF ForCES InterFE LFB"
depends on NET_CLS_ACT
@@ -761,6 +772,17 @@ config NET_ACT_IFE
To compile this code as a module, choose M here: the
module will be called act_ife.
+config NET_ACT_TUNNEL_KEY
+ tristate "IP tunnel metadata manipulation"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to set/release ip tunnel metadata.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_tunnel_key.
+
config NET_IFE_SKBMARK
tristate "Support to encoding decoding skb mark on IFE action"
depends on NET_ACT_IFE
@@ -771,6 +793,11 @@ config NET_IFE_SKBPRIO
depends on NET_ACT_IFE
---help---
+config NET_IFE_SKBTCINDEX
+ tristate "Support to encoding decoding skb tcindex on IFE action"
+ depends on NET_ACT_IFE
+ ---help---
+
config NET_CLS_IND
bool "Incoming device classification"
depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ae088a5a9d95..4bdda3634e0b 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,9 +19,12 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
+obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
+obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d09d0687594b..c9102172ce3b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -592,9 +592,19 @@ err_out:
return ERR_PTR(err);
}
-int tcf_action_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, char *name, int ovr,
- int bind, struct list_head *actions)
+static void cleanup_a(struct list_head *actions, int ovr)
+{
+ struct tc_action *a;
+
+ if (!ovr)
+ return;
+
+ list_for_each_entry(a, actions, list)
+ a->tcfa_refcnt--;
+}
+
+int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est,
+ char *name, int ovr, int bind, struct list_head *actions)
{
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
@@ -612,8 +622,15 @@ int tcf_action_init(struct net *net, struct nlattr *nla,
goto err;
}
act->order = i;
+ if (ovr)
+ act->tcfa_refcnt++;
list_add_tail(&act->list, actions);
}
+
+ /* Remove the temp refcnt which was necessary to protect against
+ * destroying an existing action which was being replaced
+ */
+ cleanup_a(actions, ovr);
return 0;
err:
@@ -883,6 +900,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
goto err;
}
act->order = i;
+ if (event == RTM_GETACTION)
+ act->tcfa_refcnt++;
list_add_tail(&act->list, &actions);
}
@@ -923,9 +942,8 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
return err;
}
-static int
-tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
- u32 portid, int ovr)
+static int tcf_action_add(struct net *net, struct nlattr *nla,
+ struct nlmsghdr *n, u32 portid, int ovr)
{
int ret = 0;
LIST_HEAD(actions);
@@ -988,8 +1006,7 @@ replay:
return ret;
}
-static struct nlattr *
-find_dump_kind(const struct nlmsghdr *n)
+static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
{
struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
@@ -1016,8 +1033,7 @@ find_dump_kind(const struct nlmsghdr *n)
return kind;
}
-static int
-tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
+static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct nlmsghdr *nlh;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index bfa870731e74..1d3960033f61 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -39,13 +39,10 @@ static struct tc_action_ops act_bpf_ops;
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
struct tcf_result *res)
{
+ bool at_ingress = skb_at_tc_ingress(skb);
struct tcf_bpf *prog = to_bpf(act);
struct bpf_prog *filter;
int action, filter_res;
- bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
-
- if (unlikely(!skb_mac_header_was_set(skb)))
- return TC_ACT_UNSPEC;
tcf_lastuse_update(&prog->tcf_tm);
bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b5dbf633a863..e0defcef376d 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -116,8 +116,8 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
return (void *)(skb_network_header(skb) + ihl);
}
-static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv4_icmp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct icmphdr *icmph;
@@ -152,8 +152,8 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct icmp6hdr *icmp6h;
const struct ipv6hdr *ip6h;
@@ -174,8 +174,8 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct tcphdr *tcph;
const struct iphdr *iph;
@@ -195,8 +195,8 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl)
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl)
{
struct tcphdr *tcph;
const struct ipv6hdr *ip6h;
@@ -217,8 +217,8 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv4_udp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl, int udplite)
+static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl, int udplite)
{
struct udphdr *udph;
const struct iphdr *iph;
@@ -270,8 +270,8 @@ ignore_obscure_skb:
return 1;
}
-static int tcf_csum_ipv6_udp(struct sk_buff *skb,
- unsigned int ihl, unsigned int ipl, int udplite)
+static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl,
+ unsigned int ipl, int udplite)
{
struct udphdr *udph;
const struct ipv6hdr *ip6h;
@@ -380,8 +380,8 @@ fail:
return 0;
}
-static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
- unsigned int ixhl, unsigned int *pl)
+static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, unsigned int ixhl,
+ unsigned int *pl)
{
int off, len, optlen;
unsigned char *xh = (void *)ip6xh;
@@ -494,8 +494,8 @@ fail:
return 0;
}
-static int tcf_csum(struct sk_buff *skb,
- const struct tc_action *a, struct tcf_result *res)
+static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_csum *p = to_tcf_csum(a);
int action;
@@ -531,8 +531,8 @@ drop:
return TC_ACT_SHOT;
}
-static int tcf_csum_dump(struct sk_buff *skb,
- struct tc_action *a, int bind, int ref)
+static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_csum *p = to_tcf_csum(a);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e24a4093d6f6..e0aa30f83c6c 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -156,7 +156,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
int action = READ_ONCE(gact->tcf_action);
struct tcf_t *tm = &gact->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets);
+ _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
+ packets);
if (action == TC_ACT_SHOT)
this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index e87cd81315e1..ccf7b4b655fe 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -63,6 +63,23 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
}
EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+ u16 edata = 0;
+
+ if (mi->metaval)
+ edata = *(u16 *)mi->metaval;
+ else if (metaval)
+ edata = metaval;
+
+ if (!edata) /* will not encode */
+ return 0;
+
+ edata = htons(edata);
+ return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u16);
+
int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
{
if (mi->metaval)
@@ -81,6 +98,15 @@ int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
}
EXPORT_SYMBOL_GPL(ife_check_meta_u32);
+int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi)
+{
+ if (metaval || mi->metaval)
+ return 8; /* T+L+(V) == 2+2+(2+2bytepad) */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u16);
+
int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
{
u32 edata = metaval;
diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c
new file mode 100644
index 000000000000..3b35774ce890
--- /dev/null
+++ b/net/sched/act_meta_skbtcindex.c
@@ -0,0 +1,79 @@
+/*
+ * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2016)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/rtnetlink.h>
+
+static int skbtcindex_encode(struct sk_buff *skb, void *skbdata,
+ struct tcf_meta_info *e)
+{
+ u32 ifetc_index = skb->tc_index;
+
+ return ife_encode_meta_u16(ifetc_index, skbdata, e);
+}
+
+static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len)
+{
+ u16 ifetc_index = *(u16 *)data;
+
+ skb->tc_index = ntohs(ifetc_index);
+ return 0;
+}
+
+static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+ return ife_check_meta_u16(skb->tc_index, e);
+}
+
+static struct tcf_meta_ops ife_skbtcindex_ops = {
+ .metaid = IFE_META_TCINDEX,
+ .metatype = NLA_U16,
+ .name = "tc_index",
+ .synopsis = "skb tc_index 16 bit metadata",
+ .check_presence = skbtcindex_check,
+ .encode = skbtcindex_encode,
+ .decode = skbtcindex_decode,
+ .get = ife_get_meta_u16,
+ .alloc = ife_alloc_meta_u16,
+ .release = ife_release_meta_gen,
+ .validate = ife_validate_meta_u16,
+ .owner = THIS_MODULE,
+};
+
+static int __init ifetc_index_init_module(void)
+{
+ return register_ife_op(&ife_skbtcindex_ops);
+}
+
+static void __exit ifetc_index_cleanup_module(void)
+{
+ unregister_ife_op(&ife_skbtcindex_ops);
+}
+
+module_init(ifetc_index_init_module);
+module_exit(ifetc_index_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2016)");
+MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6038c85d92f5..667dc382df82 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -204,7 +204,15 @@ out:
return retval;
}
-static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse)
+{
+ tcf_lastuse_update(&a->tcfa_tm);
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+}
+
+static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_mirred *m = to_mirred(a);
@@ -280,6 +288,7 @@ static struct tc_action_ops act_mirred_ops = {
.type = TCA_ACT_MIRRED,
.owner = THIS_MODULE,
.act = tcf_mirred,
+ .stats_update = tcf_stats_update,
.dump = tcf_mirred_dump,
.cleanup = tcf_mirred_release,
.init = tcf_mirred_init,
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 8a3be1d99775..d1bd248fe146 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -249,6 +249,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
police->tcfp_t_c = now;
police->tcfp_toks = toks;
police->tcfp_ptoks = ptoks;
+ if (police->tcfp_result == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
spin_unlock(&police->tcf_lock);
return police->tcfp_result;
}
@@ -261,8 +263,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
return police->tcf_action;
}
-static int
-tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_police *police = to_police(a);
@@ -347,14 +349,12 @@ static struct pernet_operations police_net_ops = {
.size = sizeof(struct tc_action_net),
};
-static int __init
-police_init_module(void)
+static int __init police_init_module(void)
{
return tcf_register_action(&act_police_ops, &police_net_ops);
}
-static void __exit
-police_cleanup_module(void)
+static void __exit police_cleanup_module(void)
{
tcf_unregister_action(&act_police_ops, &police_net_ops);
}
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
new file mode 100644
index 000000000000..e7d96381c908
--- /dev/null
+++ b/net/sched/act_skbmod.c
@@ -0,0 +1,301 @@
+/*
+ * net/sched/act_skbmod.c skb data modifier
+ *
+ * Copyright (c) 2016 Jamal Hadi Salim <jhs@mojatatu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_skbmod.h>
+#include <net/tc_act/tc_skbmod.h>
+
+#define SKBMOD_TAB_MASK 15
+
+static int skbmod_net_id;
+static struct tc_action_ops act_skbmod_ops;
+
+#define MAX_EDIT_LEN ETH_HLEN
+static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_skbmod *d = to_skbmod(a);
+ int action;
+ struct tcf_skbmod_params *p;
+ u64 flags;
+ int err;
+
+ tcf_lastuse_update(&d->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
+
+ /* XXX: if you are going to edit more fields beyond ethernet header
+ * (example when you add IP header replacement or vlan swap)
+ * then MAX_EDIT_LEN needs to change appropriately
+ */
+ err = skb_ensure_writable(skb, MAX_EDIT_LEN);
+ if (unlikely(err)) { /* best policy is to drop on the floor */
+ qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
+
+ rcu_read_lock();
+ action = READ_ONCE(d->tcf_action);
+ if (unlikely(action == TC_ACT_SHOT)) {
+ qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
+ rcu_read_unlock();
+ return action;
+ }
+
+ p = rcu_dereference(d->skbmod_p);
+ flags = p->flags;
+ if (flags & SKBMOD_F_DMAC)
+ ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
+ if (flags & SKBMOD_F_SMAC)
+ ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src);
+ if (flags & SKBMOD_F_ETYPE)
+ eth_hdr(skb)->h_proto = p->eth_type;
+ rcu_read_unlock();
+
+ if (flags & SKBMOD_F_SWAPMAC) {
+ u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
+ /*XXX: I am sure we can come up with more efficient swapping*/
+ ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest);
+ ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
+ ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr);
+ }
+
+ return action;
+}
+
+static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
+ [TCA_SKBMOD_PARMS] = { .len = sizeof(struct tc_skbmod) },
+ [TCA_SKBMOD_DMAC] = { .len = ETH_ALEN },
+ [TCA_SKBMOD_SMAC] = { .len = ETH_ALEN },
+ [TCA_SKBMOD_ETYPE] = { .type = NLA_U16 },
+};
+
+static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+ struct nlattr *tb[TCA_SKBMOD_MAX + 1];
+ struct tcf_skbmod_params *p, *p_old;
+ struct tc_skbmod *parm;
+ struct tcf_skbmod *d;
+ bool exists = false;
+ u8 *daddr = NULL;
+ u8 *saddr = NULL;
+ u16 eth_type = 0;
+ u32 lflags = 0;
+ int ret = 0, err;
+
+ if (!nla)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_SKBMOD_PARMS])
+ return -EINVAL;
+
+ if (tb[TCA_SKBMOD_DMAC]) {
+ daddr = nla_data(tb[TCA_SKBMOD_DMAC]);
+ lflags |= SKBMOD_F_DMAC;
+ }
+
+ if (tb[TCA_SKBMOD_SMAC]) {
+ saddr = nla_data(tb[TCA_SKBMOD_SMAC]);
+ lflags |= SKBMOD_F_SMAC;
+ }
+
+ if (tb[TCA_SKBMOD_ETYPE]) {
+ eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]);
+ lflags |= SKBMOD_F_ETYPE;
+ }
+
+ parm = nla_data(tb[TCA_SKBMOD_PARMS]);
+ if (parm->flags & SKBMOD_F_SWAPMAC)
+ lflags = SKBMOD_F_SWAPMAC;
+
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
+ if (!lflags)
+ return -EINVAL;
+
+ if (!exists) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ &act_skbmod_ops, bind, true);
+ if (ret)
+ return ret;
+
+ ret = ACT_P_CREATED;
+ } else {
+ tcf_hash_release(*a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+
+ d = to_skbmod(*a);
+
+ ASSERT_RTNL();
+ p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
+ if (unlikely(!p)) {
+ if (ovr)
+ tcf_hash_release(*a, bind);
+ return -ENOMEM;
+ }
+
+ p->flags = lflags;
+ d->tcf_action = parm->action;
+
+ p_old = rtnl_dereference(d->skbmod_p);
+
+ if (ovr)
+ spin_lock_bh(&d->tcf_lock);
+
+ if (lflags & SKBMOD_F_DMAC)
+ ether_addr_copy(p->eth_dst, daddr);
+ if (lflags & SKBMOD_F_SMAC)
+ ether_addr_copy(p->eth_src, saddr);
+ if (lflags & SKBMOD_F_ETYPE)
+ p->eth_type = htons(eth_type);
+
+ rcu_assign_pointer(d->skbmod_p, p);
+ if (ovr)
+ spin_unlock_bh(&d->tcf_lock);
+
+ if (p_old)
+ kfree_rcu(p_old, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(tn, *a);
+ return ret;
+}
+
+static void tcf_skbmod_cleanup(struct tc_action *a, int bind)
+{
+ struct tcf_skbmod *d = to_skbmod(a);
+ struct tcf_skbmod_params *p;
+
+ p = rcu_dereference_protected(d->skbmod_p, 1);
+ kfree_rcu(p, rcu);
+}
+
+static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ struct tcf_skbmod *d = to_skbmod(a);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p);
+ struct tc_skbmod opt = {
+ .index = d->tcf_index,
+ .refcnt = d->tcf_refcnt - ref,
+ .bindcnt = d->tcf_bindcnt - bind,
+ .action = d->tcf_action,
+ };
+ struct tcf_t t;
+
+ opt.flags = p->flags;
+ if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if ((p->flags & SKBMOD_F_DMAC) &&
+ nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, p->eth_dst))
+ goto nla_put_failure;
+ if ((p->flags & SKBMOD_F_SMAC) &&
+ nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, p->eth_src))
+ goto nla_put_failure;
+ if ((p->flags & SKBMOD_F_ETYPE) &&
+ nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(p->eth_type)))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &d->tcf_tm);
+ if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
+ goto nla_put_failure;
+
+ return skb->len;
+nla_put_failure:
+ rcu_read_unlock();
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_skbmod_ops = {
+ .kind = "skbmod",
+ .type = TCA_ACT_SKBMOD,
+ .owner = THIS_MODULE,
+ .act = tcf_skbmod_run,
+ .dump = tcf_skbmod_dump,
+ .init = tcf_skbmod_init,
+ .cleanup = tcf_skbmod_cleanup,
+ .walk = tcf_skbmod_walker,
+ .lookup = tcf_skbmod_search,
+ .size = sizeof(struct tcf_skbmod),
+};
+
+static __net_init int skbmod_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK);
+}
+
+static void __net_exit skbmod_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations skbmod_net_ops = {
+ .init = skbmod_init_net,
+ .exit = skbmod_exit_net,
+ .id = &skbmod_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim, <jhs@mojatatu.com>");
+MODULE_DESCRIPTION("SKB data mod-ing");
+MODULE_LICENSE("GPL");
+
+static int __init skbmod_init_module(void)
+{
+ return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+static void __exit skbmod_cleanup_module(void)
+{
+ tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+module_init(skbmod_init_module);
+module_exit(skbmod_cleanup_module);
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
new file mode 100644
index 000000000000..af47bdf2f483
--- /dev/null
+++ b/net/sched/act_tunnel_key.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+
+#include <linux/tc_act/tc_tunnel_key.h>
+#include <net/tc_act/tc_tunnel_key.h>
+
+#define TUNNEL_KEY_TAB_MASK 15
+
+static int tunnel_key_net_id;
+static struct tc_action_ops act_tunnel_key_ops;
+
+static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_tunnel_key *t = to_tunnel_key(a);
+ struct tcf_tunnel_key_params *params;
+ int action;
+
+ rcu_read_lock();
+
+ params = rcu_dereference(t->params);
+
+ tcf_lastuse_update(&t->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
+ action = params->action;
+
+ switch (params->tcft_action) {
+ case TCA_TUNNEL_KEY_ACT_RELEASE:
+ skb_dst_drop(skb);
+ break;
+ case TCA_TUNNEL_KEY_ACT_SET:
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst_clone(&params->tcft_enc_metadata->dst));
+ break;
+ default:
+ WARN_ONCE(1, "Bad tunnel_key action %d.\n",
+ params->tcft_action);
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return action;
+}
+
+static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
+ [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) },
+ [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+ [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
+ [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
+};
+
+static int tunnel_key_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+ struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
+ struct tcf_tunnel_key_params *params_old;
+ struct tcf_tunnel_key_params *params_new;
+ struct metadata_dst *metadata = NULL;
+ struct tc_tunnel_key *parm;
+ struct tcf_tunnel_key *t;
+ bool exists = false;
+ __be64 key_id;
+ int ret = 0;
+ int err;
+
+ if (!nla)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_PARMS])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
+ switch (parm->t_action) {
+ case TCA_TUNNEL_KEY_ACT_RELEASE:
+ break;
+ case TCA_TUNNEL_KEY_ACT_SET:
+ if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
+
+ if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
+ tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
+ __be32 saddr;
+ __be32 daddr;
+
+ saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
+ daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
+
+ metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
+ TUNNEL_KEY, key_id, 0);
+ } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
+ tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+
+ saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
+ daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
+
+ metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0,
+ TUNNEL_KEY, key_id, 0);
+ }
+
+ if (!metadata) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
+ break;
+ default:
+ goto err_out;
+ }
+
+ if (!exists) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ &act_tunnel_key_ops, bind, true);
+ if (ret)
+ return ret;
+
+ ret = ACT_P_CREATED;
+ } else {
+ tcf_hash_release(*a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+
+ t = to_tunnel_key(*a);
+
+ ASSERT_RTNL();
+ params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+ if (unlikely(!params_new)) {
+ if (ret == ACT_P_CREATED)
+ tcf_hash_release(*a, bind);
+ return -ENOMEM;
+ }
+
+ params_old = rtnl_dereference(t->params);
+
+ params_new->action = parm->action;
+ params_new->tcft_action = parm->t_action;
+ params_new->tcft_enc_metadata = metadata;
+
+ rcu_assign_pointer(t->params, params_new);
+
+ if (params_old)
+ kfree_rcu(params_old, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(tn, *a);
+
+ return ret;
+
+err_out:
+ if (exists)
+ tcf_hash_release(*a, bind);
+ return ret;
+}
+
+static void tunnel_key_release(struct tc_action *a, int bind)
+{
+ struct tcf_tunnel_key *t = to_tunnel_key(a);
+ struct tcf_tunnel_key_params *params;
+
+ params = rcu_dereference_protected(t->params, 1);
+
+ if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
+ dst_release(&params->tcft_enc_metadata->dst);
+
+ kfree_rcu(params, rcu);
+}
+
+static int tunnel_key_dump_addresses(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ unsigned short family = ip_tunnel_info_af(info);
+
+ if (family == AF_INET) {
+ __be32 saddr = info->key.u.ipv4.src;
+ __be32 daddr = info->key.u.ipv4.dst;
+
+ if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) &&
+ !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr))
+ return 0;
+ }
+
+ if (family == AF_INET6) {
+ const struct in6_addr *saddr6 = &info->key.u.ipv6.src;
+ const struct in6_addr *daddr6 = &info->key.u.ipv6.dst;
+
+ if (!nla_put_in6_addr(skb,
+ TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) &&
+ !nla_put_in6_addr(skb,
+ TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6))
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_tunnel_key *t = to_tunnel_key(a);
+ struct tcf_tunnel_key_params *params;
+ struct tc_tunnel_key opt = {
+ .index = t->tcf_index,
+ .refcnt = t->tcf_refcnt - ref,
+ .bindcnt = t->tcf_bindcnt - bind,
+ };
+ struct tcf_t tm;
+
+ params = rtnl_dereference(t->params);
+
+ opt.t_action = params->tcft_action;
+ opt.action = params->action;
+
+ if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
+ struct ip_tunnel_key *key =
+ &params->tcft_enc_metadata->u.tun_info.key;
+ __be32 key_id = tunnel_id_to_key32(key->tun_id);
+
+ if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
+ tunnel_key_dump_addresses(skb,
+ &params->tcft_enc_metadata->u.tun_info))
+ goto nla_put_failure;
+ }
+
+ tcf_tm_dump(&tm, &t->tcf_tm);
+ if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
+ &tm, TCA_TUNNEL_KEY_PAD))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_tunnel_key_ops = {
+ .kind = "tunnel_key",
+ .type = TCA_ACT_TUNNEL_KEY,
+ .owner = THIS_MODULE,
+ .act = tunnel_key_act,
+ .dump = tunnel_key_dump,
+ .init = tunnel_key_init,
+ .cleanup = tunnel_key_release,
+ .walk = tunnel_key_walker,
+ .lookup = tunnel_key_search,
+ .size = sizeof(struct tcf_tunnel_key),
+};
+
+static __net_init int tunnel_key_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tc_action_net_init(tn, &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK);
+}
+
+static void __net_exit tunnel_key_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations tunnel_key_net_ops = {
+ .init = tunnel_key_init_net,
+ .exit = tunnel_key_exit_net,
+ .id = &tunnel_key_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init tunnel_key_init_module(void)
+{
+ return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+}
+
+static void __exit tunnel_key_cleanup_module(void)
+{
+ tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+}
+
+module_init(tunnel_key_init_module);
+module_exit(tunnel_key_cleanup_module);
+
+MODULE_AUTHOR("Amir Vadai <amir@vadai.me>");
+MODULE_DESCRIPTION("ip tunnel manipulation actions");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 59a8d3150ae2..a95c00b119da 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -30,6 +30,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
struct tcf_vlan *v = to_vlan(a);
int action;
int err;
+ u16 tci;
spin_lock(&v->tcf_lock);
tcf_lastuse_update(&v->tcf_tm);
@@ -48,6 +49,30 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
if (err)
goto drop;
break;
+ case TCA_VLAN_ACT_MODIFY:
+ /* No-op if no vlan tag (either hw-accel or in-payload) */
+ if (!skb_vlan_tagged(skb))
+ goto unlock;
+ /* extract existing tag (and guarantee no hw-accel tag) */
+ if (skb_vlan_tag_present(skb)) {
+ tci = skb_vlan_tag_get(skb);
+ skb->vlan_tci = 0;
+ } else {
+ /* in-payload vlan tag, pop it */
+ err = __skb_vlan_pop(skb, &tci);
+ if (err)
+ goto drop;
+ }
+ /* replace the vid */
+ tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid;
+ /* replace prio bits, if tcfv_push_prio specified */
+ if (v->tcfv_push_prio) {
+ tci &= ~VLAN_PRIO_MASK;
+ tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT;
+ }
+ /* put updated tci as hwaccel tag */
+ __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci);
+ break;
default:
BUG();
}
@@ -102,6 +127,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
case TCA_VLAN_ACT_POP:
break;
case TCA_VLAN_ACT_PUSH:
+ case TCA_VLAN_ACT_MODIFY:
if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
if (exists)
tcf_hash_release(*a, bind);
@@ -185,7 +211,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- if (v->tcfv_action == TCA_VLAN_ACT_PUSH &&
+ if ((v->tcfv_action == TCA_VLAN_ACT_PUSH ||
+ v->tcfv_action == TCA_VLAN_ACT_MODIFY) &&
(nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) ||
nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
v->tcfv_push_proto) ||
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a7c5645373af..11da7da0b7c4 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -344,13 +344,15 @@ replay:
if (err == 0) {
struct tcf_proto *next = rtnl_dereference(tp->next);
- tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+ tfilter_notify(net, skb, n, tp, fh,
+ RTM_DELTFILTER);
if (tcf_destroy(tp, false))
RCU_INIT_POINTER(*back, next);
}
goto errout;
case RTM_GETTFILTER:
- err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
+ err = tfilter_notify(net, skb, n, tp, fh,
+ RTM_NEWTFILTER);
goto errout;
default:
err = -EINVAL;
@@ -448,7 +450,8 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
struct net *net = sock_net(a->skb->sk);
return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
- a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTFILTER);
}
/* called with RTNL */
@@ -552,7 +555,7 @@ void tcf_exts_destroy(struct tcf_exts *exts)
EXPORT_SYMBOL(tcf_exts_destroy);
int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
- struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
+ struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
{
#ifdef CONFIG_NET_CLS_ACT
{
@@ -560,8 +563,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
- "police", ovr,
- TCA_ACT_BIND);
+ "police", ovr, TCA_ACT_BIND);
if (IS_ERR(act))
return PTR_ERR(act);
@@ -573,8 +575,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
int err, i = 0;
err = tcf_action_init(net, tb[exts->action], rate_tlv,
- NULL, ovr,
- TCA_ACT_BIND, &actions);
+ NULL, ovr, TCA_ACT_BIND,
+ &actions);
if (err)
return err;
list_for_each_entry(act, &actions, list)
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 4742f415ee5b..bb1d5a487081 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");
#define CLS_BPF_NAME_LEN 256
+#define CLS_BPF_SUPPORTED_GEN_FLAGS \
+ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)
struct cls_bpf_head {
struct list_head plist;
@@ -39,6 +41,8 @@ struct cls_bpf_prog {
struct list_head link;
struct tcf_result res;
bool exts_integrated;
+ bool offloaded;
+ u32 gen_flags;
struct tcf_exts exts;
u32 handle;
union {
@@ -54,8 +58,10 @@ struct cls_bpf_prog {
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
[TCA_BPF_CLASSID] = { .type = NLA_U32 },
[TCA_BPF_FLAGS] = { .type = NLA_U32 },
+ [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 },
[TCA_BPF_FD] = { .type = NLA_U32 },
- [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN },
+ [TCA_BPF_NAME] = { .type = NLA_NUL_STRING,
+ .len = CLS_BPF_NAME_LEN },
[TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
[TCA_BPF_OPS] = { .type = NLA_BINARY,
.len = sizeof(struct sock_filter) * BPF_MAXINSNS },
@@ -83,9 +89,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct cls_bpf_prog *prog;
int ret = -1;
- if (unlikely(!skb_mac_header_was_set(skb)))
- return -1;
-
/* Needed here for accessing maps. */
rcu_read_lock();
list_for_each_entry_rcu(prog, &head->plist, link) {
@@ -93,7 +96,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
qdisc_skb_cb(skb)->tc_classid = prog->res.classid;
- if (at_ingress) {
+ if (tc_skip_sw(prog->gen_flags)) {
+ filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
+ } else if (at_ingress) {
/* It is safe to push/pull even if skb_shared() */
__skb_push(skb, skb->mac_len);
bpf_compute_data_end(skb);
@@ -140,6 +145,91 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
return !prog->bpf_ops;
}
+static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+ enum tc_clsbpf_command cmd)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_bpf_offload bpf_offload = {};
+ struct tc_to_netdev offload;
+
+ offload.type = TC_SETUP_CLSBPF;
+ offload.cls_bpf = &bpf_offload;
+
+ bpf_offload.command = cmd;
+ bpf_offload.exts = &prog->exts;
+ bpf_offload.prog = prog->filter;
+ bpf_offload.name = prog->bpf_name;
+ bpf_offload.exts_integrated = prog->exts_integrated;
+ bpf_offload.gen_flags = prog->gen_flags;
+
+ return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+}
+
+static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
+ struct cls_bpf_prog *oldprog)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct cls_bpf_prog *obj = prog;
+ enum tc_clsbpf_command cmd;
+ bool skip_sw;
+ int ret;
+
+ skip_sw = tc_skip_sw(prog->gen_flags) ||
+ (oldprog && tc_skip_sw(oldprog->gen_flags));
+
+ if (oldprog && oldprog->offloaded) {
+ if (tc_should_offload(dev, tp, prog->gen_flags)) {
+ cmd = TC_CLSBPF_REPLACE;
+ } else if (!tc_skip_sw(prog->gen_flags)) {
+ obj = oldprog;
+ cmd = TC_CLSBPF_DESTROY;
+ } else {
+ return -EINVAL;
+ }
+ } else {
+ if (!tc_should_offload(dev, tp, prog->gen_flags))
+ return skip_sw ? -EINVAL : 0;
+ cmd = TC_CLSBPF_ADD;
+ }
+
+ ret = cls_bpf_offload_cmd(tp, obj, cmd);
+ if (ret)
+ return skip_sw ? ret : 0;
+
+ obj->offloaded = true;
+ if (oldprog)
+ oldprog->offloaded = false;
+
+ return 0;
+}
+
+static void cls_bpf_stop_offload(struct tcf_proto *tp,
+ struct cls_bpf_prog *prog)
+{
+ int err;
+
+ if (!prog->offloaded)
+ return;
+
+ err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
+ if (err) {
+ pr_err("Stopping hardware offload failed: %d\n", err);
+ return;
+ }
+
+ prog->offloaded = false;
+}
+
+static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
+ struct cls_bpf_prog *prog)
+{
+ if (!prog->offloaded)
+ return;
+
+ cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
+}
+
static int cls_bpf_init(struct tcf_proto *tp)
{
struct cls_bpf_head *head;
@@ -179,6 +269,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
{
struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
+ cls_bpf_stop_offload(tp, prog);
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
call_rcu(&prog->rcu, __cls_bpf_delete_prog);
@@ -195,6 +286,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
return false;
list_for_each_entry_safe(prog, tmp, &head->plist, link) {
+ cls_bpf_stop_offload(tp, prog);
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
call_rcu(&prog->rcu, __cls_bpf_delete_prog);
@@ -304,6 +396,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
{
bool is_bpf, is_ebpf, have_exts = false;
struct tcf_exts exts;
+ u32 gen_flags = 0;
int ret;
is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
@@ -328,8 +421,17 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
}
+ if (tb[TCA_BPF_FLAGS_GEN]) {
+ gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
+ if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
+ !tc_flags_valid(gen_flags)) {
+ ret = -EINVAL;
+ goto errout;
+ }
+ }
prog->exts_integrated = have_exts;
+ prog->gen_flags = gen_flags;
ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
cls_bpf_prog_from_efd(tb, prog, tp);
@@ -412,10 +514,17 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
goto errout;
}
- ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
+ ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE],
+ ovr);
if (ret < 0)
goto errout;
+ ret = cls_bpf_offload(tp, prog, oldprog);
+ if (ret) {
+ cls_bpf_delete_prog(tp, prog);
+ return ret;
+ }
+
if (oldprog) {
list_replace_rcu(&oldprog->link, &prog->link);
tcf_unbind_filter(tp, &oldprog->res);
@@ -477,6 +586,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
tm->tcm_handle = prog->handle;
+ cls_bpf_offload_update_stats(tp, prog);
+
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -499,6 +610,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
goto nla_put_failure;
+ if (prog->gen_flags &&
+ nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 2c1ae549edbf..e39672394c7b 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -29,7 +29,7 @@
#include <net/route.h>
#include <net/flow_dissector.h>
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif
@@ -87,12 +87,14 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
}
-static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
return flow->basic.ip_proto;
}
-static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
if (flow->ports.ports)
return ntohs(flow->ports.src);
@@ -100,7 +102,8 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys
return addr_fold(skb->sk);
}
-static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_proto_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
if (flow->ports.ports)
return ntohs(flow->ports.dst);
@@ -125,14 +128,14 @@ static u32 flow_get_mark(const struct sk_buff *skb)
static u32 flow_get_nfct(const struct sk_buff *skb)
{
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
return addr_fold(skb->nfct);
#else
return 0;
#endif
}
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#define CTTUPLE(skb, member) \
({ \
enum ip_conntrack_info ctinfo; \
@@ -149,7 +152,8 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
})
#endif
-static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
@@ -161,7 +165,8 @@ fallback:
return flow_get_src(skb, flow);
}
-static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
@@ -173,14 +178,16 @@ fallback:
return flow_get_dst(skb, flow);
}
-static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, src.u.all));
fallback:
return flow_get_proto_src(skb, flow);
}
-static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb,
+ const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, dst.u.all));
fallback:
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index cf9ad5b50889..2af09c872a1a 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -23,18 +23,26 @@
#include <net/ip.h>
#include <net/flow_dissector.h>
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+
struct fl_flow_key {
int indev_ifindex;
struct flow_dissector_key_control control;
+ struct flow_dissector_key_control enc_control;
struct flow_dissector_key_basic basic;
struct flow_dissector_key_eth_addrs eth;
struct flow_dissector_key_vlan vlan;
- struct flow_dissector_key_addrs ipaddrs;
union {
struct flow_dissector_key_ipv4_addrs ipv4;
struct flow_dissector_key_ipv6_addrs ipv6;
};
struct flow_dissector_key_ports tp;
+ struct flow_dissector_key_keyid enc_key_id;
+ union {
+ struct flow_dissector_key_ipv4_addrs enc_ipv4;
+ struct flow_dissector_key_ipv6_addrs enc_ipv6;
+ };
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -124,11 +132,31 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct cls_fl_filter *f;
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
+ struct ip_tunnel_info *info;
if (!atomic_read(&head->ht.nelems))
return -1;
fl_clear_masked_range(&skb_key, &head->mask);
+
+ info = skb_tunnel_info(skb);
+ if (info) {
+ struct ip_tunnel_key *key = &info->key;
+
+ switch (ip_tunnel_info_af(info)) {
+ case AF_INET:
+ skb_key.enc_ipv4.src = key->u.ipv4.src;
+ skb_key.enc_ipv4.dst = key->u.ipv4.dst;
+ break;
+ case AF_INET6:
+ skb_key.enc_ipv6.src = key->u.ipv6.src;
+ skb_key.enc_ipv6.dst = key->u.ipv6.dst;
+ break;
+ }
+
+ skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
+ }
+
skb_key.indev_ifindex = skb->skb_iif;
/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
* so do it rather here.
@@ -213,7 +241,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
tc.type = TC_SETUP_CLSFLOWER;
tc.cls_flower = &offload;
- err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
+ &tc);
if (tc_skip_sw(flags))
return err;
@@ -297,7 +326,19 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 },
-
+ [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_DST] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
+ [TCA_FLOWER_KEY_TCP_SRC_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 },
};
static void fl_set_key_val(struct nlattr **tb,
@@ -395,20 +436,54 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (key->basic.ip_proto == IPPROTO_TCP) {
fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
- &mask->tp.src, TCA_FLOWER_UNSPEC,
+ &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
sizeof(key->tp.src));
fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
- &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
sizeof(key->tp.dst));
} else if (key->basic.ip_proto == IPPROTO_UDP) {
fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
- &mask->tp.src, TCA_FLOWER_UNSPEC,
+ &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
sizeof(key->tp.src));
fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
- &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
sizeof(key->tp.dst));
}
+ if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
+ tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ fl_set_key_val(tb, &key->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC,
+ &mask->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+ sizeof(key->enc_ipv4.src));
+ fl_set_key_val(tb, &key->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST,
+ &mask->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+ sizeof(key->enc_ipv4.dst));
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] ||
+ tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) {
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ fl_set_key_val(tb, &key->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC,
+ &mask->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+ sizeof(key->enc_ipv6.src));
+ fl_set_key_val(tb, &key->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST,
+ &mask->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+ sizeof(key->enc_ipv6.dst));
+ }
+
+ fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID,
+ &mask->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID,
+ sizeof(key->enc_key_id.keyid));
+
return 0;
}
@@ -806,21 +881,48 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
if (key->basic.ip_proto == IPPROTO_TCP &&
(fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
- &mask->tp.src, TCA_FLOWER_UNSPEC,
+ &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
sizeof(key->tp.src)) ||
fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
- &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
sizeof(key->tp.dst))))
goto nla_put_failure;
else if (key->basic.ip_proto == IPPROTO_UDP &&
(fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
- &mask->tp.src, TCA_FLOWER_UNSPEC,
+ &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
sizeof(key->tp.src)) ||
fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
- &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
sizeof(key->tp.dst))))
goto nla_put_failure;
+ if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
+ (fl_dump_key_val(skb, &key->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+ sizeof(key->enc_ipv4.src)) ||
+ fl_dump_key_val(skb, &key->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst,
+ TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+ sizeof(key->enc_ipv4.dst))))
+ goto nla_put_failure;
+ else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS &&
+ (fl_dump_key_val(skb, &key->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+ sizeof(key->enc_ipv6.src)) ||
+ fl_dump_key_val(skb, &key->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST,
+ &mask->enc_ipv6.dst,
+ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+ sizeof(key->enc_ipv6.dst))))
+ goto nla_put_failure;
+
+ if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
+ &mask->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
+ sizeof(key->enc_key_id)))
+ goto nla_put_failure;
+
nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
if (tcf_exts_dump(skb, &f->exts))
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index cc0bda945800..9dc63d54e167 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -57,7 +57,7 @@ static u32 fw_hash(u32 handle)
}
static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+ struct tcf_result *res)
{
struct fw_head *head = rcu_dereference_bh(tp->root);
struct fw_filter *f;
@@ -188,7 +188,8 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
static int
fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
- struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)
+ struct nlattr **tb, struct nlattr **tca, unsigned long base,
+ bool ovr)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct tcf_exts e;
@@ -237,9 +238,8 @@ errout:
static int fw_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
- u32 handle,
- struct nlattr **tca,
- unsigned long *arg, bool ovr)
+ u32 handle, struct nlattr **tca, unsigned long *arg,
+ bool ovr)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = (struct fw_filter *) *arg;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index c91e65d81a48..455fc8f83d0a 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -268,8 +268,7 @@ static int route4_init(struct tcf_proto *tp)
return 0;
}
-static void
-route4_delete_filter(struct rcu_head *head)
+static void route4_delete_filter(struct rcu_head *head)
{
struct route4_filter *f = container_of(head, struct route4_filter, rcu);
@@ -474,10 +473,8 @@ errout:
}
static int route4_change(struct net *net, struct sk_buff *in_skb,
- struct tcf_proto *tp, unsigned long base,
- u32 handle,
- struct nlattr **tca,
- unsigned long *arg, bool ovr)
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, unsigned long *arg, bool ovr)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter __rcu **fp;
@@ -562,7 +559,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
return 0;
errout:
- tcf_exts_destroy(&f->exts);
+ if (f)
+ tcf_exts_destroy(&f->exts);
kfree(f);
return err;
}
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index d9500709831f..96144bdf30db 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -50,14 +50,13 @@ struct tcindex_data {
struct rcu_head rcu;
};
-static inline int
-tcindex_filter_is_set(struct tcindex_filter_result *r)
+static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
{
return tcf_exts_is_predicative(&r->exts) || r->res.classid;
}
-static struct tcindex_filter_result *
-tcindex_lookup(struct tcindex_data *p, u16 key)
+static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
+ u16 key)
{
if (p->perfect) {
struct tcindex_filter_result *f = p->perfect + key;
@@ -144,7 +143,8 @@ static void tcindex_destroy_rexts(struct rcu_head *head)
static void tcindex_destroy_fexts(struct rcu_head *head)
{
- struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu);
+ struct tcindex_filter *f = container_of(head, struct tcindex_filter,
+ rcu);
tcf_exts_destroy(&f->result.exts);
kfree(f);
@@ -550,7 +550,7 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force)
static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index a29263a9d8c1..ae83c3aec308 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -104,7 +104,8 @@ static inline unsigned int u32_hash_fold(__be32 key,
return h;
}
-static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res)
+static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct {
struct tc_u_knode *knode;
@@ -256,8 +257,7 @@ deadloop:
return -1;
}
-static struct tc_u_hnode *
-u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
+static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
struct tc_u_hnode *ht;
@@ -270,8 +270,7 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
return ht;
}
-static struct tc_u_knode *
-u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
+static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
unsigned int sel;
struct tc_u_knode *n = NULL;
@@ -360,8 +359,7 @@ static int u32_init(struct tcf_proto *tp)
return 0;
}
-static int u32_destroy_key(struct tcf_proto *tp,
- struct tc_u_knode *n,
+static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
bool free_pf)
{
tcf_exts_destroy(&n->exts);
@@ -448,9 +446,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
}
}
-static int u32_replace_hw_hnode(struct tcf_proto *tp,
- struct tc_u_hnode *h,
- u32 flags)
+static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
+ u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_u32_offload u32_offload = {0};
@@ -496,9 +493,8 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
}
}
-static int u32_replace_hw_knode(struct tcf_proto *tp,
- struct tc_u_knode *n,
- u32 flags)
+static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+ u32 flags)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_u32_offload u32_offload = {0};
@@ -763,8 +759,7 @@ errout:
return err;
}
-static void u32_replace_knode(struct tcf_proto *tp,
- struct tc_u_common *tp_c,
+static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
struct tc_u_knode *n)
{
struct tc_u_knode __rcu **ins;
@@ -845,8 +840,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
static int u32_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca,
- unsigned long *arg, bool ovr)
+ struct nlattr **tca, unsigned long *arg, bool ovr)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
@@ -1088,7 +1082,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t)
{
struct tc_u_knode *n = (struct tc_u_knode *)fh;
struct tc_u_hnode *ht_up, *ht_down;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d677b3484d81..206dc24add3a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -389,7 +389,8 @@ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
static struct qdisc_rate_table *qdisc_rtab_list;
-struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
+struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
+ struct nlattr *tab)
{
struct qdisc_rate_table *rtab;
@@ -541,7 +542,8 @@ nla_put_failure:
return -1;
}
-void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
+void __qdisc_calculate_pkt_len(struct sk_buff *skb,
+ const struct qdisc_size_table *stab)
{
int pkt_len, slot;
@@ -888,10 +890,10 @@ static struct lock_class_key qdisc_rx_lock;
Parameters are passed via opt.
*/
-static struct Qdisc *
-qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
- struct Qdisc *p, u32 parent, u32 handle,
- struct nlattr **tca, int *errp)
+static struct Qdisc *qdisc_create(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ struct Qdisc *p, u32 parent, u32 handle,
+ struct nlattr **tca, int *errp)
{
int err;
struct nlattr *kind = tca[TCA_KIND];
@@ -1073,7 +1075,8 @@ struct check_loop_arg {
int depth;
};
-static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
+static int check_loop_fn(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *w);
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
@@ -1450,7 +1453,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
} else {
if (!tc_qdisc_dump_ignore(q) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWQDISC) <= 0)
goto done;
q_idx++;
}
@@ -1471,7 +1475,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
}
if (!tc_qdisc_dump_ignore(q) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWQDISC) <= 0)
goto done;
q_idx++;
}
@@ -1505,7 +1510,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
s_q_idx = 0;
q_idx = 0;
- if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, true) < 0)
+ if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
+ true) < 0)
goto done;
dev_queue = dev_ingress_queue(dev);
@@ -1640,7 +1646,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
if (cops->delete)
err = cops->delete(q, cl);
if (err == 0)
- tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
+ tclass_notify(net, skb, n, q, cl,
+ RTM_DELTCLASS);
goto out;
case RTM_GETTCLASS:
err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
@@ -1738,12 +1745,14 @@ struct qdisc_dump_args {
struct netlink_callback *cb;
};
-static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
+static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *arg)
{
struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
- a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTCLASS);
}
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
@@ -1976,10 +1985,12 @@ static int __init pktsched_init(void)
rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
+ NULL);
rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
+ NULL);
return 0;
}
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 4002df3c7d9f..5bfa79ee657c 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -69,7 +69,7 @@ struct codel_sched_data {
static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
{
struct Qdisc *sch = ctx;
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
if (skb)
sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -172,7 +172,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index baeed6a78d28..1e37247656f8 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -31,7 +31,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
- if (likely(skb_queue_len(&sch->q) < sch->limit))
+ if (likely(sch->q.qlen < sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_drop(skb, sch, to_free);
@@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
unsigned int prev_backlog;
- if (likely(skb_queue_len(&sch->q) < sch->limit))
+ if (likely(sch->q.qlen < sch->limit))
return qdisc_enqueue_tail(skb, sch);
prev_backlog = sch->qstats.backlog;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index e5458b99e09c..18e752439f6f 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -86,6 +86,7 @@ struct fq_sched_data {
struct rb_root delayed; /* for rate limited flows */
u64 time_next_delayed_flow;
+ unsigned long unthrottle_latency_ns;
struct fq_flow internal; /* for non classified or high prio packets */
u32 quantum;
@@ -94,6 +95,7 @@ struct fq_sched_data {
u32 flow_max_rate; /* optional max rate per flow */
u32 flow_plimit; /* max packets per flow */
u32 orphan_mask; /* mask for orphaned skb */
+ u32 low_rate_threshold;
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
@@ -407,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
+ unsigned long sample;
struct rb_node *p;
if (q->time_next_delayed_flow > now)
return;
+ /* Update unthrottle latency EWMA.
+ * This is cheap and can help diagnosing timer/latency problems.
+ */
+ sample = (unsigned long)(now - q->time_next_delayed_flow);
+ q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
+ q->unthrottle_latency_ns += sample >> 3;
+
q->time_next_delayed_flow = ~0ULL;
while ((p = rb_first(&q->delayed)) != NULL) {
struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
@@ -433,7 +443,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
- u32 rate;
+ u32 rate, plen;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
@@ -482,7 +492,7 @@ begin:
prefetch(&skb->end);
f->credit -= qdisc_pkt_len(skb);
- if (f->credit > 0 || !q->rate_enable)
+ if (!q->rate_enable)
goto out;
/* Do not pace locally generated ack packets */
@@ -493,8 +503,15 @@ begin:
if (skb->sk)
rate = min(skb->sk->sk_pacing_rate, rate);
+ if (rate <= q->low_rate_threshold) {
+ f->credit = 0;
+ plen = qdisc_pkt_len(skb);
+ } else {
+ plen = max(qdisc_pkt_len(skb), q->quantum);
+ if (f->credit > 0)
+ goto out;
+ }
if (rate != ~0U) {
- u32 plen = max(qdisc_pkt_len(skb), q->quantum);
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
@@ -507,7 +524,12 @@ begin:
len = NSEC_PER_SEC;
q->stat_pkts_too_long++;
}
-
+ /* Account for schedule/timers drifts.
+ * f->time_next_packet was set when prior packet was sent,
+ * and current time (@now) can be too late by tens of us.
+ */
+ if (f->time_next_packet)
+ len -= min(len/2, now - f->time_next_packet);
f->time_next_packet = now + len;
}
out:
@@ -662,6 +684,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+ [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -716,6 +739,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_FQ_FLOW_MAX_RATE])
q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+ if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
+ q->low_rate_threshold =
+ nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
+
if (tb[TCA_FQ_RATE_ENABLE]) {
u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
@@ -774,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
q->flow_refill_delay = msecs_to_jiffies(40);
q->flow_max_rate = ~0U;
+ q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
q->new_flows.first = NULL;
q->old_flows.first = NULL;
@@ -781,6 +809,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
q->fq_root = NULL;
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
+ q->low_rate_threshold = 550000 / 8;
qdisc_watchdog_init(&q->watchdog, sch);
if (opt)
@@ -811,6 +840,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
jiffies_to_usecs(q->flow_refill_delay)) ||
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
+ nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
+ q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
goto nla_put_failure;
@@ -823,20 +854,24 @@ nla_put_failure:
static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct fq_sched_data *q = qdisc_priv(sch);
- u64 now = ktime_get_ns();
- struct tc_fq_qd_stats st = {
- .gc_flows = q->stat_gc_flows,
- .highprio_packets = q->stat_internal_packets,
- .tcp_retrans = q->stat_tcp_retrans,
- .throttled = q->stat_throttled,
- .flows_plimit = q->stat_flows_plimit,
- .pkts_too_long = q->stat_pkts_too_long,
- .allocation_errors = q->stat_allocation_errors,
- .flows = q->flows,
- .inactive_flows = q->inactive_flows,
- .throttled_flows = q->throttled_flows,
- .time_next_delayed_flow = q->time_next_delayed_flow - now,
- };
+ struct tc_fq_qd_stats st;
+
+ sch_tree_lock(sch);
+
+ st.gc_flows = q->stat_gc_flows;
+ st.highprio_packets = q->stat_internal_packets;
+ st.tcp_retrans = q->stat_tcp_retrans;
+ st.throttled = q->stat_throttled;
+ st.flows_plimit = q->stat_flows_plimit;
+ st.pkts_too_long = q->stat_pkts_too_long;
+ st.allocation_errors = q->stat_allocation_errors;
+ st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+ st.flows = q->flows;
+ st.inactive_flows = q->inactive_flows;
+ st.throttled_flows = q->throttled_flows;
+ st.unthrottle_latency_ns = min_t(unsigned long,
+ q->unthrottle_latency_ns, ~0U);
+ sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 0d21b567ff27..6cfb6e9038c2 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -466,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
*/
struct pfifo_fast_priv {
u32 bitmap;
- struct sk_buff_head q[PFIFO_FAST_BANDS];
+ struct qdisc_skb_head q[PFIFO_FAST_BANDS];
};
/*
@@ -477,7 +477,7 @@ struct pfifo_fast_priv {
*/
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
+static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
int band)
{
return priv->q + band;
@@ -486,10 +486,10 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
struct sk_buff **to_free)
{
- if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
+ if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
- struct sk_buff_head *list = band2list(priv, band);
+ struct qdisc_skb_head *list = band2list(priv, band);
priv->bitmap |= (1 << band);
qdisc->q.qlen++;
@@ -505,11 +505,16 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
int band = bitmap2band[priv->bitmap];
if (likely(band >= 0)) {
- struct sk_buff_head *list = band2list(priv, band);
- struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
+ struct qdisc_skb_head *qh = band2list(priv, band);
+ struct sk_buff *skb = __qdisc_dequeue_head(qh);
+
+ if (likely(skb != NULL)) {
+ qdisc_qstats_backlog_dec(qdisc, skb);
+ qdisc_bstats_update(qdisc, skb);
+ }
qdisc->q.qlen--;
- if (skb_queue_empty(list))
+ if (qh->qlen == 0)
priv->bitmap &= ~(1 << band);
return skb;
@@ -524,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
int band = bitmap2band[priv->bitmap];
if (band >= 0) {
- struct sk_buff_head *list = band2list(priv, band);
+ struct qdisc_skb_head *qh = band2list(priv, band);
- return skb_peek(list);
+ return qh->head;
}
return NULL;
@@ -564,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
- __skb_queue_head_init(band2list(priv, prio));
+ qdisc_skb_head_init(band2list(priv, prio));
/* Can by-pass the queue discipline */
qdisc->flags |= TCQ_F_CAN_BYPASS;
@@ -612,7 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
sch->padded = (char *) sch - (char *) p;
}
- skb_queue_head_init(&sch->q);
+ qdisc_skb_head_init(&sch->q);
+ spin_lock_init(&sch->q.lock);
spin_lock_init(&sch->busylock);
lockdep_set_class(&sch->busylock,
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 53dbfa187870..c798d0de8a9d 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -162,7 +162,7 @@ struct htb_sched {
struct work_struct work;
/* non shaped skbs; let them go directly thru */
- struct sk_buff_head direct_queue;
+ struct qdisc_skb_head direct_queue;
long direct_pkts;
struct qdisc_watchdog watchdog;
@@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
list_del_init(&cl->un.leaf.drop_list);
}
+static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
+ struct qdisc_skb_head *qh)
+{
+ struct sk_buff *last = qh->tail;
+
+ if (last) {
+ skb->next = NULL;
+ last->next = skb;
+ qh->tail = skb;
+ } else {
+ qh->tail = skb;
+ qh->head = skb;
+ }
+ qh->qlen++;
+}
+
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
@@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (cl == HTB_DIRECT) {
/* enqueue to helper queue */
if (q->direct_queue.qlen < q->direct_qlen) {
- __skb_queue_tail(&q->direct_queue, skb);
+ htb_enqueue_tail(skb, sch, &q->direct_queue);
q->direct_pkts++;
} else {
return qdisc_drop(skb, sch, to_free);
@@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
unsigned long start_at;
/* try to dequeue direct packets as high prio (!) to minimize cpu work */
- skb = __skb_dequeue(&q->direct_queue);
+ skb = __qdisc_dequeue_head(&q->direct_queue);
if (skb != NULL) {
ok:
qdisc_bstats_update(sch, skb);
@@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
qdisc_watchdog_init(&q->watchdog, sch);
INIT_WORK(&q->work, htb_work_func);
- __skb_queue_head_init(&q->direct_queue);
+ qdisc_skb_head_init(&q->direct_queue);
if (tb[TCA_HTB_DIRECT_QLEN])
q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index aaaf02175338..9f7b380cf0a3 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
return segs;
}
+static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
+{
+ skb->next = qh->head;
+
+ if (!qh->head)
+ qh->tail = skb;
+ qh->head = skb;
+ qh->qlen++;
+}
+
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
@@ -502,7 +512,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1<<(prandom_u32() % 8);
}
- if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
+ if (unlikely(sch->q.qlen >= sch->limit))
return qdisc_drop(skb, sch, to_free);
qdisc_qstats_backlog_inc(sch, skb);
@@ -522,8 +532,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (q->rate) {
struct sk_buff *last;
- if (!skb_queue_empty(&sch->q))
- last = skb_peek_tail(&sch->q);
+ if (sch->q.qlen)
+ last = sch->q.tail;
else
last = netem_rb_to_skb(rb_last(&q->t_root));
if (last) {
@@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
cb->time_to_send = psched_get_time();
q->counter = 0;
- __skb_queue_head(&sch->q, skb);
+ netem_enqueue_skb_head(&sch->q, skb);
sch->qstats.requeues++;
}
@@ -587,7 +597,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
struct rb_node *p;
tfifo_dequeue:
- skb = __skb_dequeue(&sch->q);
+ skb = __qdisc_dequeue_head(&sch->q);
if (skb) {
qdisc_qstats_backlog_dec(sch, skb);
deliver:
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index a570b0bb254c..5c3a99d6aa82 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -231,7 +231,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __skb_dequeue(&sch->q);
+ struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
@@ -511,7 +511,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
- skb = __qdisc_dequeue_head(sch, &sch->q);
+ skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 1c23060c41a6..f10d3397f917 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1408,7 +1408,7 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
transports) {
if (t->pmtu_pending && t->dst) {
sctp_transport_update_pmtu(sk, t,
- WORD_TRUNC(dst_mtu(t->dst)));
+ SCTP_TRUNC4(dst_mtu(t->dst)));
t->pmtu_pending = 0;
}
if (!pmtu || (t->pathmtu < pmtu))
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 912eb1685a5d..f99d4855d3de 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -48,7 +48,7 @@ static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
/* id 2 is reserved as well */
.hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2,
},
-#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE)
+#if IS_ENABLED(CONFIG_CRYPTO_SHA256)
{
.hmac_id = SCTP_AUTH_HMAC_ID_SHA256,
.hmac_name = "hmac(sha256)",
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index a55e54738b81..8afe2e90d003 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -70,6 +70,19 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
return msg;
}
+void sctp_datamsg_free(struct sctp_datamsg *msg)
+{
+ struct sctp_chunk *chunk;
+
+ /* This doesn't have to be a _safe vairant because
+ * sctp_chunk_free() only drops the refs.
+ */
+ list_for_each_entry(chunk, &msg->chunks, frag_list)
+ sctp_chunk_free(chunk);
+
+ sctp_datamsg_put(msg);
+}
+
/* Final destructruction of datamsg memory. */
static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
{
@@ -182,9 +195,10 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
/* This is the biggest possible DATA chunk that can fit into
* the packet
*/
- max_data = (asoc->pathmtu -
- sctp_sk(asoc->base.sk)->pf->af->net_header_len -
- sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3;
+ max_data = asoc->pathmtu -
+ sctp_sk(asoc->base.sk)->pf->af->net_header_len -
+ sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
+ max_data = SCTP_TRUNC4(max_data);
max = asoc->frag_point;
/* If the the peer requested that we authenticate DATA chunks
@@ -195,8 +209,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
if (hmac_desc)
- max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) +
- hmac_desc->hmac_len);
+ max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) +
+ hmac_desc->hmac_len);
}
/* Now, check if we need to reduce our max */
@@ -216,7 +230,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
asoc->outqueue.out_qlen == 0 &&
list_empty(&asoc->outqueue.retransmit) &&
msg_len > max)
- max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t));
+ max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
/* Encourage Cookie-ECHO bundling. */
if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 69444d32ecda..a2ea1d1cc06a 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -605,7 +605,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
/* PMTU discovery (RFC1191) */
if (ICMP_FRAG_NEEDED == code) {
sctp_icmp_frag_needed(sk, asoc, transport,
- WORD_TRUNC(info));
+ SCTP_TRUNC4(info));
goto out_unlock;
} else {
if (ICMP_PROT_UNREACH == code) {
@@ -673,7 +673,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb)
if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
break;
- ch_end = offset + WORD_ROUND(ntohs(ch->length));
+ ch_end = offset + SCTP_PAD4(ntohs(ch->length));
if (ch_end > skb->len)
break;
@@ -796,27 +796,34 @@ struct sctp_hash_cmp_arg {
static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
+ struct sctp_transport *t = (struct sctp_transport *)ptr;
const struct sctp_hash_cmp_arg *x = arg->key;
- const struct sctp_transport *t = ptr;
- struct sctp_association *asoc = t->asoc;
- const struct net *net = x->net;
+ struct sctp_association *asoc;
+ int err = 1;
if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
- return 1;
- if (!net_eq(sock_net(asoc->base.sk), net))
- return 1;
+ return err;
+ if (!sctp_transport_hold(t))
+ return err;
+
+ asoc = t->asoc;
+ if (!net_eq(sock_net(asoc->base.sk), x->net))
+ goto out;
if (x->ep) {
if (x->ep != asoc->ep)
- return 1;
+ goto out;
} else {
if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
- return 1;
+ goto out;
if (!sctp_bind_addr_match(&asoc->base.bind_addr,
x->laddr, sctp_sk(asoc->base.sk)))
- return 1;
+ goto out;
}
- return 0;
+ err = 0;
+out:
+ sctp_transport_put(t);
+ return err;
}
static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
@@ -1121,7 +1128,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net,
if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
break;
- ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
if (ch_end > skb_tail_pointer(skb))
break;
@@ -1190,7 +1197,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
* that the chunk length doesn't cause overflow. Otherwise, we'll
* walk off the end.
*/
- if (WORD_ROUND(ntohs(ch->length)) > skb->len)
+ if (SCTP_PAD4(ntohs(ch->length)) > skb->len)
return NULL;
/* If this is INIT/INIT-ACK look inside the chunk too. */
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 6437aa97cfd7..f731de3e8428 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -213,7 +213,7 @@ new_skb:
}
chunk->chunk_hdr = ch;
- chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t));
chunk->subh.v = NULL; /* Subheader is no longer valid. */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 1f1682b9a6a8..2a5c1896d18f 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -180,7 +180,6 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
int one_packet, gfp_t gfp)
{
sctp_xmit_t retval;
- int error = 0;
pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__,
packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);
@@ -188,6 +187,8 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
case SCTP_XMIT_PMTU_FULL:
if (!packet->has_cookie_echo) {
+ int error = 0;
+
error = sctp_packet_transmit(packet, gfp);
if (error < 0)
chunk->skb->sk->sk_err = -error;
@@ -296,7 +297,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet,
struct sctp_chunk *chunk)
{
sctp_xmit_t retval = SCTP_XMIT_OK;
- __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length));
+ __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length));
/* Check to see if this chunk will fit into the packet */
retval = sctp_packet_will_fit(packet, chunk, chunk_len);
@@ -441,14 +442,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
* time. Application may notice this error.
*/
pr_err_once("Trying to GSO but underlying device doesn't support it.");
- goto nomem;
+ goto err;
}
} else {
pkt_size = packet->size;
}
head = alloc_skb(pkt_size + MAX_HEADER, gfp);
if (!head)
- goto nomem;
+ goto err;
if (gso) {
NAPI_GRO_CB(head)->last = head;
skb_shinfo(head)->gso_type = sk->sk_gso_type;
@@ -469,8 +470,12 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
}
}
dst = dst_clone(tp->dst);
- if (!dst)
- goto no_route;
+ if (!dst) {
+ if (asoc)
+ IP_INC_STATS(sock_net(asoc->base.sk),
+ IPSTATS_MIB_OUTNOROUTES);
+ goto nodst;
+ }
skb_dst_set(head, dst);
/* Build the SCTP header. */
@@ -503,7 +508,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
if (gso) {
pkt_size = packet->overhead;
list_for_each_entry(chunk, &packet->chunk_list, list) {
- int padded = WORD_ROUND(chunk->skb->len);
+ int padded = SCTP_PAD4(chunk->skb->len);
if (pkt_size + padded > tp->pathmtu)
break;
@@ -533,7 +538,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
* included in the chunk length field. The sender should
* never pad with more than 3 bytes.
*
- * [This whole comment explains WORD_ROUND() below.]
+ * [This whole comment explains SCTP_PAD4() below.]
*/
pkt_size -= packet->overhead;
@@ -555,7 +560,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
has_data = 1;
}
- padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
+ padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len;
if (padding)
memset(skb_put(chunk->skb, padding), 0, padding);
@@ -582,7 +587,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
* acknowledged or have failed.
* Re-queue auth chunks if needed.
*/
- pkt_size -= WORD_ROUND(chunk->skb->len);
+ pkt_size -= SCTP_PAD4(chunk->skb->len);
if (!sctp_chunk_is_data(chunk) && chunk != packet->auth)
sctp_chunk_free(chunk);
@@ -621,8 +626,10 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
if (!gso)
break;
- if (skb_gro_receive(&head, nskb))
+ if (skb_gro_receive(&head, nskb)) {
+ kfree_skb(nskb);
goto nomem;
+ }
nskb = NULL;
if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
sk->sk_gso_max_segs))
@@ -716,18 +723,13 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
}
head->ignore_df = packet->ipfragok;
tp->af_specific->sctp_xmit(head, tp);
+ goto out;
-out:
- sctp_packet_reset(packet);
- return err;
-no_route:
- kfree_skb(head);
- if (nskb != head)
- kfree_skb(nskb);
-
- if (asoc)
- IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES);
+nomem:
+ if (packet->auth && list_empty(&packet->auth->list))
+ sctp_chunk_free(packet->auth);
+nodst:
/* FIXME: Returning the 'err' will effect all the associations
* associated with a socket, although only one of the paths of the
* association is unreachable.
@@ -736,22 +738,18 @@ no_route:
* required.
*/
/* err = -EHOSTUNREACH; */
-err:
- /* Control chunks are unreliable so just drop them. DATA chunks
- * will get resent or dropped later.
- */
+ kfree_skb(head);
+err:
list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
list_del_init(&chunk->list);
if (!sctp_chunk_is_data(chunk))
sctp_chunk_free(chunk);
}
- goto out;
-nomem:
- if (packet->auth && list_empty(&packet->auth->list))
- sctp_chunk_free(packet->auth);
- err = -ENOMEM;
- goto err;
+
+out:
+ sctp_packet_reset(packet);
+ return err;
}
/********************************************************************
@@ -878,7 +876,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
struct sctp_chunk *chunk,
u16 chunk_len)
{
- size_t psize, pmtu;
+ size_t psize, pmtu, maxsize;
sctp_xmit_t retval = SCTP_XMIT_OK;
psize = packet->size;
@@ -906,6 +904,17 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
goto out;
}
+ /* Similarly, if this chunk was built before a PMTU
+ * reduction, we have to fragment it at IP level now. So
+ * if the packet already contains something, we need to
+ * flush.
+ */
+ maxsize = pmtu - packet->overhead;
+ if (packet->auth)
+ maxsize -= SCTP_PAD4(packet->auth->skb->len);
+ if (chunk_len > maxsize)
+ retval = SCTP_XMIT_PMTU_FULL;
+
/* It is also okay to fragment if the chunk we are
* adding is a control chunk, but only if current packet
* is not a GSO one otherwise it causes fragmentation of
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 72e54a416af6..3ec6da8bbb53 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q,
static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
+static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
/* Add data to the front of the queue. */
static inline void sctp_outq_head_data(struct sctp_outq *q,
@@ -285,10 +285,9 @@ void sctp_outq_free(struct sctp_outq *q)
}
/* Put a new chunk in an sctp_outq. */
-int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
+void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
{
struct net *net = sock_net(q->asoc->base.sk);
- int error = 0;
pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk,
chunk && chunk->chunk_hdr ?
@@ -299,54 +298,26 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
* immediately.
*/
if (sctp_chunk_is_data(chunk)) {
- /* Is it OK to queue data chunks? */
- /* From 9. Termination of Association
- *
- * When either endpoint performs a shutdown, the
- * association on each peer will stop accepting new
- * data from its user and only deliver data in queue
- * at the time of sending or receiving the SHUTDOWN
- * chunk.
- */
- switch (q->asoc->state) {
- case SCTP_STATE_CLOSED:
- case SCTP_STATE_SHUTDOWN_PENDING:
- case SCTP_STATE_SHUTDOWN_SENT:
- case SCTP_STATE_SHUTDOWN_RECEIVED:
- case SCTP_STATE_SHUTDOWN_ACK_SENT:
- /* Cannot send after transport endpoint shutdown */
- error = -ESHUTDOWN;
- break;
-
- default:
- pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
- __func__, q, chunk, chunk && chunk->chunk_hdr ?
- sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
- "illegal chunk");
-
- sctp_chunk_hold(chunk);
- sctp_outq_tail_data(q, chunk);
- if (chunk->asoc->prsctp_enable &&
- SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
- chunk->asoc->sent_cnt_removable++;
- if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
- SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
- else
- SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
- break;
- }
+ pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
+ __func__, q, chunk, chunk && chunk->chunk_hdr ?
+ sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
+ "illegal chunk");
+
+ sctp_outq_tail_data(q, chunk);
+ if (chunk->asoc->prsctp_enable &&
+ SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
+ chunk->asoc->sent_cnt_removable++;
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+ SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
+ else
+ SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
} else {
list_add_tail(&chunk->list, &q->control_chunk_list);
SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
}
- if (error < 0)
- return error;
-
if (!q->cork)
- error = sctp_outq_flush(q, 0, gfp);
-
- return error;
+ sctp_outq_flush(q, 0, gfp);
}
/* Insert a chunk into the sorted list based on the TSNs. The retransmit list
@@ -559,7 +530,6 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
sctp_retransmit_reason_t reason)
{
struct net *net = sock_net(q->asoc->base.sk);
- int error = 0;
switch (reason) {
case SCTP_RTXR_T3_RTX:
@@ -603,10 +573,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
* will be flushed at the end.
*/
if (reason != SCTP_RTXR_FAST_RTX)
- error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
-
- if (error)
- q->asoc->base.sk->sk_err = -error;
+ sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
}
/*
@@ -778,12 +745,12 @@ redo:
}
/* Cork the outqueue so queued chunks are really queued. */
-int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
+void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
{
if (q->cork)
q->cork = 0;
- return sctp_outq_flush(q, 0, gfp);
+ sctp_outq_flush(q, 0, gfp);
}
@@ -796,7 +763,7 @@ int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
* locking concerns must be made. Today we use the sock lock to protect
* this function.
*/
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
{
struct sctp_packet *packet;
struct sctp_packet singleton;
@@ -919,8 +886,10 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
sctp_packet_config(&singleton, vtag, 0);
sctp_packet_append_chunk(&singleton, chunk);
error = sctp_packet_transmit(&singleton, gfp);
- if (error < 0)
- return error;
+ if (error < 0) {
+ asoc->base.sk->sk_err = -error;
+ return;
+ }
break;
case SCTP_CID_ABORT:
@@ -1018,6 +987,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
retran:
error = sctp_outq_flush_rtx(q, packet,
rtx_timeout, &start_timer);
+ if (error < 0)
+ asoc->base.sk->sk_err = -error;
if (start_timer) {
sctp_transport_reset_t3_rtx(transport);
@@ -1192,14 +1163,15 @@ sctp_flush_out:
struct sctp_transport,
send_ready);
packet = &t->packet;
- if (!sctp_packet_empty(packet))
+ if (!sctp_packet_empty(packet)) {
error = sctp_packet_transmit(packet, gfp);
+ if (error < 0)
+ asoc->base.sk->sk_err = -error;
+ }
/* Clear the burst limited state, if any */
sctp_transport_burst_reset(t);
}
-
- return error;
}
/* Update unack_data based on the incoming SACK chunk */
@@ -1747,7 +1719,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
{
int i;
sctp_sack_variable_t *frags;
- __u16 gap;
+ __u16 tsn_offset, blocks;
__u32 ctsn = ntohl(sack->cum_tsn_ack);
if (TSN_lte(tsn, ctsn))
@@ -1766,10 +1738,11 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
*/
frags = sack->variable;
- gap = tsn - ctsn;
- for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) {
- if (TSN_lte(ntohs(frags[i].gab.start), gap) &&
- TSN_lte(gap, ntohs(frags[i].gab.end)))
+ blocks = ntohs(sack->num_gap_ack_blocks);
+ tsn_offset = tsn - ctsn;
+ for (i = 0; i < blocks; ++i) {
+ if (tsn_offset >= ntohs(frags[i].gab.start) &&
+ tsn_offset <= ntohs(frags[i].gab.end))
goto pass;
}
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index f3508aa75815..807158e32f5f 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -106,7 +106,8 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
int portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
struct sctp_endpoint *ep = sctp_sk(sk)->ep;
struct list_head *addr_list;
@@ -133,7 +134,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
r->idiag_retrans = 0;
}
- if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
goto errout;
if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) {
@@ -203,6 +204,7 @@ struct sctp_comm_param {
struct netlink_callback *cb;
const struct inet_diag_req_v2 *r;
const struct nlmsghdr *nlh;
+ bool net_admin;
};
static size_t inet_assoc_attr_size(struct sctp_association *asoc)
@@ -219,6 +221,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc)
+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ nla_total_size(1) /* INET_DIAG_TOS */
+ nla_total_size(1) /* INET_DIAG_TCLASS */
+ + nla_total_size(4) /* INET_DIAG_MARK */
+ nla_total_size(addrlen * asoc->peer.transport_count)
+ nla_total_size(addrlen * addrcnt)
+ nla_total_size(sizeof(struct inet_diag_meminfo))
@@ -256,7 +259,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
err = inet_sctp_diag_fill(sk, assoc, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ commp->net_admin);
release_sock(sk);
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
@@ -310,7 +314,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- NLM_F_MULTI, cb->nlh) < 0) {
+ NLM_F_MULTI, cb->nlh,
+ commp->net_admin) < 0) {
cb->args[3] = 1;
err = 2;
goto release;
@@ -320,7 +325,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
if (inet_sctp_diag_fill(sk, assoc, skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) {
+ cb->nlh->nlmsg_seq, 0, cb->nlh,
+ commp->net_admin) < 0) {
err = 2;
goto release;
}
@@ -375,7 +381,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh) < 0) {
+ cb->nlh, commp->net_admin) < 0) {
err = 2;
goto out;
}
@@ -412,6 +418,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb,
.skb = in_skb,
.r = req,
.nlh = nlh,
+ .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN),
};
if (req->sdiag_family == AF_INET) {
@@ -447,6 +454,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
.skb = skb,
.cb = cb,
.r = r,
+ .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN),
};
/* eps hashtable dumps
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 8c77b87a8565..79dd66079dd7 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -253,7 +253,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
num_types = sp->pf->supported_addrs(sp, types);
chunksize = sizeof(init) + addrs_len;
- chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types));
+ chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));
chunksize += sizeof(ecap_param);
if (asoc->prsctp_enable)
@@ -283,14 +283,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
/* Add HMACS parameter length if any were defined */
auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
if (auth_hmacs->length)
- chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+ chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
else
auth_hmacs = NULL;
/* Add CHUNKS parameter length */
auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
if (auth_chunks->length)
- chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+ chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
else
auth_chunks = NULL;
@@ -300,8 +300,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
/* If we have any extensions to report, account for that */
if (num_ext)
- chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
- num_ext);
+ chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
+ num_ext);
/* RFC 2960 3.3.2 Initiation (INIT) (1)
*
@@ -443,13 +443,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
if (auth_hmacs->length)
- chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+ chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
else
auth_hmacs = NULL;
auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
if (auth_chunks->length)
- chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+ chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
else
auth_chunks = NULL;
@@ -458,8 +458,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
}
if (num_ext)
- chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
- num_ext);
+ chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
+ num_ext);
/* Now allocate and fill out the chunk. */
retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
@@ -1390,7 +1390,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
struct sock *sk;
/* No need to allocate LL here, as this is only a chunk. */
- skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp);
+ skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp);
if (!skb)
goto nodata;
@@ -1482,7 +1482,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
void *target;
void *padding;
int chunklen = ntohs(chunk->chunk_hdr->length);
- int padlen = WORD_ROUND(chunklen) - chunklen;
+ int padlen = SCTP_PAD4(chunklen) - chunklen;
padding = skb_put(chunk->skb, padlen);
target = skb_put(chunk->skb, len);
@@ -1900,7 +1900,7 @@ static int sctp_process_missing_param(const struct sctp_association *asoc,
struct __sctp_missing report;
__u16 len;
- len = WORD_ROUND(sizeof(report));
+ len = SCTP_PAD4(sizeof(report));
/* Make an ERROR chunk, preparing enough room for
* returning multiple unknown parameters.
@@ -2098,9 +2098,9 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc,
if (*errp) {
if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
- WORD_ROUND(ntohs(param.p->length))))
+ SCTP_PAD4(ntohs(param.p->length))))
sctp_addto_chunk_fixed(*errp,
- WORD_ROUND(ntohs(param.p->length)),
+ SCTP_PAD4(ntohs(param.p->length)),
param.v);
} else {
/* If there is no memory for generating the ERROR
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 12d45193357c..c345bf153bed 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1020,19 +1020,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
* This way the whole message is queued up and bundling if
* encouraged for small fragments.
*/
-static int sctp_cmd_send_msg(struct sctp_association *asoc,
- struct sctp_datamsg *msg, gfp_t gfp)
+static void sctp_cmd_send_msg(struct sctp_association *asoc,
+ struct sctp_datamsg *msg, gfp_t gfp)
{
struct sctp_chunk *chunk;
- int error = 0;
-
- list_for_each_entry(chunk, &msg->chunks, frag_list) {
- error = sctp_outq_tail(&asoc->outqueue, chunk, gfp);
- if (error)
- break;
- }
- return error;
+ list_for_each_entry(chunk, &msg->chunks, frag_list)
+ sctp_outq_tail(&asoc->outqueue, chunk, gfp);
}
@@ -1427,8 +1421,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
local_cork = 1;
}
/* Send a chunk to our peer. */
- error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk,
- gfp);
+ sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp);
break;
case SCTP_CMD_SEND_PKT:
@@ -1682,7 +1675,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
case SCTP_CMD_FORCE_PRIM_RETRAN:
t = asoc->peer.retran_path;
asoc->peer.retran_path = asoc->peer.primary_path;
- error = sctp_outq_uncork(&asoc->outqueue, gfp);
+ sctp_outq_uncork(&asoc->outqueue, gfp);
local_cork = 0;
asoc->peer.retran_path = t;
break;
@@ -1709,7 +1702,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
sctp_outq_cork(&asoc->outqueue);
local_cork = 1;
}
- error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp);
+ sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp);
break;
case SCTP_CMD_SEND_NEXT_ASCONF:
sctp_cmd_send_asconf(asoc);
@@ -1739,9 +1732,9 @@ out:
*/
if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) {
if (chunk->end_of_packet || chunk->singleton)
- error = sctp_outq_uncork(&asoc->outqueue, gfp);
+ sctp_outq_uncork(&asoc->outqueue, gfp);
} else if (local_cork)
- error = sctp_outq_uncork(&asoc->outqueue, gfp);
+ sctp_outq_uncork(&asoc->outqueue, gfp);
if (sp->data_ready_signalled)
sp->data_ready_signalled = 0;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index d88bb2b0b699..026e3bca4a94 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3454,7 +3454,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net,
}
/* Report violation if chunk len overflows */
- ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
if (ch_end > skb_tail_pointer(skb))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
commands);
@@ -4185,7 +4185,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net,
hdr = unk_chunk->chunk_hdr;
err_chunk = sctp_make_op_error(asoc, unk_chunk,
SCTP_ERROR_UNKNOWN_CHUNK, hdr,
- WORD_ROUND(ntohs(hdr->length)),
+ SCTP_PAD4(ntohs(hdr->length)),
0);
if (err_chunk) {
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
@@ -4203,7 +4203,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net,
hdr = unk_chunk->chunk_hdr;
err_chunk = sctp_make_op_error(asoc, unk_chunk,
SCTP_ERROR_UNKNOWN_CHUNK, hdr,
- WORD_ROUND(ntohs(hdr->length)),
+ SCTP_PAD4(ntohs(hdr->length)),
0);
if (err_chunk) {
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9fc417a8b476..6cdc61c21438 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1958,6 +1958,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
/* Now send the (possibly) fragmented message. */
list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
+ sctp_chunk_hold(chunk);
+
/* Do accounting for the write space. */
sctp_set_owner_w(chunk);
@@ -1970,13 +1972,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
* breaks.
*/
err = sctp_primitive_SEND(net, asoc, datamsg);
- sctp_datamsg_put(datamsg);
/* Did the lower layer accept the chunk? */
- if (err)
+ if (err) {
+ sctp_datamsg_free(datamsg);
goto out_free;
+ }
pr_debug("%s: we sent primitively\n", __func__);
+ sctp_datamsg_put(datamsg);
err = msg_len;
if (unlikely(wait_connect)) {
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 81b86678be4d..ce54dce13ddb 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -233,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
}
if (transport->dst) {
- transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst));
+ transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
} else
transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
@@ -287,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport,
return;
}
if (transport->dst) {
- transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst));
+ transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
/* Initialize sk->sk_rcv_saddr, if the transport is the
* association's active path for getsockname().
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index d85b803da11d..bea00058ce35 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -383,7 +383,7 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc,
ch = (sctp_errhdr_t *)(chunk->skb->data);
cause = ch->cause;
- elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t);
+ elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t);
/* Pull off the ERROR header. */
skb_pull(chunk->skb, sizeof(sctp_errhdr_t));
@@ -688,7 +688,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
* MUST ignore the padding bytes.
*/
len = ntohs(chunk->chunk_hdr->length);
- padding = WORD_ROUND(len) - len;
+ padding = SCTP_PAD4(len) - len;
/* Fixup cloned skb with just this chunks data. */
skb_trim(skb, chunk->chunk_end - padding - skb->data);
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 877e55066f89..84d0fdaf7de9 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -140,11 +140,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc)
* we can go ahead and clear out the lobby in one shot
*/
if (!skb_queue_empty(&sp->pd_lobby)) {
- struct list_head *list;
skb_queue_splice_tail_init(&sp->pd_lobby,
&sk->sk_receive_queue);
- list = (struct list_head *)&sctp_sk(sk)->pd_lobby;
- INIT_LIST_HEAD(list);
return 1;
}
} else {
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 1d281816f2bf..d8582028b346 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -569,9 +569,10 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
struct rsc *found;
memset(&rsci, 0, sizeof(rsci));
- rsci.handle.data = handle->data;
- rsci.handle.len = handle->len;
+ if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
+ return NULL;
found = rsc_lookup(cd, &rsci);
+ rsc_free(&rsci);
if (!found)
return NULL;
if (cache_check(cd, &found->h, NULL))
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7f79fb7dc6a0..66f23b376fa0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -453,7 +453,7 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
struct rpc_xprt_switch *xps;
if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) {
- WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+ WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
xps = args->bc_xprt->xpt_bc_xps;
xprt_switch_get(xps);
} else {
@@ -520,7 +520,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
char servername[48];
if (args->bc_xprt) {
- WARN_ON(args->protocol != XPRT_TRANSPORT_BC_TCP);
+ WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
xprt = args->bc_xprt->xpt_bc_xprt;
if (xprt) {
xprt_get(xprt);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 536d0be3f61b..799cce6cbe45 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>
#include <linux/module.h> /* try_module_get()/module_put() */
@@ -923,7 +924,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
}
INIT_LIST_HEAD(&buf->rb_recv_bufs);
- for (i = 0; i < buf->rb_max_requests; i++) {
+ for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) {
struct rpcrdma_rep *rep;
rep = rpcrdma_create_rep(r_xprt);
@@ -1018,6 +1019,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
rep = rpcrdma_buffer_get_rep_locked(buf);
rpcrdma_destroy_rep(ia, rep);
}
+ buf->rb_send_count = 0;
spin_lock(&buf->rb_reqslock);
while (!list_empty(&buf->rb_allreqs)) {
@@ -1032,6 +1034,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
spin_lock(&buf->rb_reqslock);
}
spin_unlock(&buf->rb_reqslock);
+ buf->rb_recv_count = 0;
rpcrdma_destroy_mrs(buf);
}
@@ -1074,8 +1077,27 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
spin_unlock(&buf->rb_mwlock);
}
+static struct rpcrdma_rep *
+rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers)
+{
+ /* If an RPC previously completed without a reply (say, a
+ * credential problem or a soft timeout occurs) then hold off
+ * on supplying more Receive buffers until the number of new
+ * pending RPCs catches up to the number of posted Receives.
+ */
+ if (unlikely(buffers->rb_send_count < buffers->rb_recv_count))
+ return NULL;
+
+ if (unlikely(list_empty(&buffers->rb_recv_bufs)))
+ return NULL;
+ buffers->rb_recv_count++;
+ return rpcrdma_buffer_get_rep_locked(buffers);
+}
+
/*
* Get a set of request/reply buffers.
+ *
+ * Reply buffer (if available) is attached to send buffer upon return.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@@ -1085,21 +1107,15 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
spin_lock(&buffers->rb_lock);
if (list_empty(&buffers->rb_send_bufs))
goto out_reqbuf;
+ buffers->rb_send_count++;
req = rpcrdma_buffer_get_req_locked(buffers);
- if (list_empty(&buffers->rb_recv_bufs))
- goto out_repbuf;
- req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ req->rl_reply = rpcrdma_buffer_get_rep(buffers);
spin_unlock(&buffers->rb_lock);
return req;
out_reqbuf:
spin_unlock(&buffers->rb_lock);
- pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
- return NULL;
-out_repbuf:
- list_add(&req->rl_free, &buffers->rb_send_bufs);
- spin_unlock(&buffers->rb_lock);
- pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
+ pr_warn("RPC: %s: out of request buffers\n", __func__);
return NULL;
}
@@ -1117,9 +1133,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
req->rl_reply = NULL;
spin_lock(&buffers->rb_lock);
+ buffers->rb_send_count--;
list_add_tail(&req->rl_free, &buffers->rb_send_bufs);
- if (rep)
+ if (rep) {
+ buffers->rb_recv_count--;
list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ }
spin_unlock(&buffers->rb_lock);
}
@@ -1133,8 +1152,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
struct rpcrdma_buffer *buffers = req->rl_buffer;
spin_lock(&buffers->rb_lock);
- if (!list_empty(&buffers->rb_recv_bufs))
- req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ req->rl_reply = rpcrdma_buffer_get_rep(buffers);
spin_unlock(&buffers->rb_lock);
}
@@ -1148,6 +1166,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
spin_lock(&buffers->rb_lock);
+ buffers->rb_recv_count--;
list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
spin_unlock(&buffers->rb_lock);
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 670fad57153a..a71b0f5897d8 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -321,6 +321,7 @@ struct rpcrdma_buffer {
char *rb_pool;
spinlock_t rb_lock; /* protect buf lists */
+ int rb_send_count, rb_recv_count;
struct list_head rb_send_bufs;
struct list_head rb_recv_bufs;
u32 rb_max_requests;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8ede3bc52481..bf168838a029 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1074,7 +1074,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
skb = skb_recv_datagram(sk, 0, 1, &err);
if (skb != NULL) {
xs_udp_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram(sk, skb);
+ skb_free_datagram_locked(sk, skb);
continue;
}
if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 6b626a64b517..a04fe9be1c60 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -62,6 +62,8 @@ static void publ_to_item(struct distr_item *i, struct publication *p)
/**
* named_prepare_buf - allocate & initialize a publication message
+ *
+ * The buffer returned is of size INT_H_SIZE + payload size
*/
static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
u32 dest)
@@ -141,9 +143,9 @@ static void named_distribute(struct net *net, struct sk_buff_head *list,
struct publication *publ;
struct sk_buff *skb = NULL;
struct distr_item *item = NULL;
- uint msg_dsz = (tipc_node_get_mtu(net, dnode, 0) / ITEM_SIZE) *
- ITEM_SIZE;
- uint msg_rem = msg_dsz;
+ u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) /
+ ITEM_SIZE) * ITEM_SIZE;
+ u32 msg_rem = msg_dsz;
list_for_each_entry(publ, pls, local_list) {
/* Prepare next buffer: */
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index dd274687a53d..d80cd3f7503f 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -665,7 +665,8 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) {
pr_err("Invalid UDP bearer configuration");
- return -EINVAL;
+ err = -EINVAL;
+ goto err;
}
err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f1dffe84f0d5..8309687a56b0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -661,11 +661,11 @@ static int unix_set_peek_off(struct sock *sk, int val)
{
struct unix_sock *u = unix_sk(sk);
- if (mutex_lock_interruptible(&u->readlock))
+ if (mutex_lock_interruptible(&u->iolock))
return -EINTR;
sk->sk_peek_off = val;
- mutex_unlock(&u->readlock);
+ mutex_unlock(&u->iolock);
return 0;
}
@@ -779,7 +779,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
spin_lock_init(&u->lock);
atomic_long_set(&u->inflight, 0);
INIT_LIST_HEAD(&u->link);
- mutex_init(&u->readlock); /* single task reading lock */
+ mutex_init(&u->iolock); /* single task reading lock */
+ mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
unix_insert_socket(unix_sockets_unbound(sk), sk);
@@ -848,7 +849,7 @@ static int unix_autobind(struct socket *sock)
int err;
unsigned int retries = 0;
- err = mutex_lock_interruptible(&u->readlock);
+ err = mutex_lock_interruptible(&u->bindlock);
if (err)
return err;
@@ -895,7 +896,7 @@ retry:
spin_unlock(&unix_table_lock);
err = 0;
-out: mutex_unlock(&u->readlock);
+out: mutex_unlock(&u->bindlock);
return err;
}
@@ -954,20 +955,32 @@ fail:
return NULL;
}
-static int unix_mknod(struct dentry *dentry, const struct path *path, umode_t mode,
- struct path *res)
+static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
- int err;
+ struct dentry *dentry;
+ struct path path;
+ int err = 0;
+ /*
+ * Get the parent directory, calculate the hash for last
+ * component.
+ */
+ dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ return err;
- err = security_path_mknod(path, dentry, mode, 0);
+ /*
+ * All right, let's create it.
+ */
+ err = security_path_mknod(&path, dentry, mode, 0);
if (!err) {
- err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0);
+ err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
if (!err) {
- res->mnt = mntget(path->mnt);
+ res->mnt = mntget(path.mnt);
res->dentry = dget(dentry);
}
}
-
+ done_path_create(&path, dentry);
return err;
}
@@ -978,12 +991,10 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct unix_sock *u = unix_sk(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
char *sun_path = sunaddr->sun_path;
- int err, name_err;
+ int err;
unsigned int hash;
struct unix_address *addr;
struct hlist_head *list;
- struct path path;
- struct dentry *dentry;
err = -EINVAL;
if (sunaddr->sun_family != AF_UNIX)
@@ -999,34 +1010,14 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
addr_len = err;
- name_err = 0;
- dentry = NULL;
- if (sun_path[0]) {
- /* Get the parent directory, calculate the hash for last
- * component.
- */
- dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
-
- if (IS_ERR(dentry)) {
- /* delay report until after 'already bound' check */
- name_err = PTR_ERR(dentry);
- dentry = NULL;
- }
- }
-
- err = mutex_lock_interruptible(&u->readlock);
+ err = mutex_lock_interruptible(&u->bindlock);
if (err)
- goto out_path;
+ goto out;
err = -EINVAL;
if (u->addr)
goto out_up;
- if (name_err) {
- err = name_err == -EEXIST ? -EADDRINUSE : name_err;
- goto out_up;
- }
-
err = -ENOMEM;
addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
if (!addr)
@@ -1037,11 +1028,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
addr->hash = hash ^ sk->sk_type;
atomic_set(&addr->refcnt, 1);
- if (dentry) {
- struct path u_path;
+ if (sun_path[0]) {
+ struct path path;
umode_t mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current_umask());
- err = unix_mknod(dentry, &path, mode, &u_path);
+ err = unix_mknod(sun_path, mode, &path);
if (err) {
if (err == -EEXIST)
err = -EADDRINUSE;
@@ -1049,9 +1040,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out_up;
}
addr->hash = UNIX_HASH_SIZE;
- hash = d_real_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+ hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
- u->path = u_path;
+ u->path = path;
list = &unix_socket_table[hash];
} else {
spin_lock(&unix_table_lock);
@@ -1073,11 +1064,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
out_unlock:
spin_unlock(&unix_table_lock);
out_up:
- mutex_unlock(&u->readlock);
-out_path:
- if (dentry)
- done_path_create(&path, dentry);
-
+ mutex_unlock(&u->bindlock);
out:
return err;
}
@@ -1969,17 +1956,17 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
if (false) {
alloc_skb:
unix_state_unlock(other);
- mutex_unlock(&unix_sk(other)->readlock);
+ mutex_unlock(&unix_sk(other)->iolock);
newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
&err, 0);
if (!newskb)
goto err;
}
- /* we must acquire readlock as we modify already present
+ /* we must acquire iolock as we modify already present
* skbs in the sk_receive_queue and mess with skb->len
*/
- err = mutex_lock_interruptible(&unix_sk(other)->readlock);
+ err = mutex_lock_interruptible(&unix_sk(other)->iolock);
if (err) {
err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
goto err;
@@ -2046,7 +2033,7 @@ alloc_skb:
}
unix_state_unlock(other);
- mutex_unlock(&unix_sk(other)->readlock);
+ mutex_unlock(&unix_sk(other)->iolock);
other->sk_data_ready(other);
scm_destroy(&scm);
@@ -2055,7 +2042,7 @@ alloc_skb:
err_state_unlock:
unix_state_unlock(other);
err_unlock:
- mutex_unlock(&unix_sk(other)->readlock);
+ mutex_unlock(&unix_sk(other)->iolock);
err:
kfree_skb(newskb);
if (send_sigpipe && !(flags & MSG_NOSIGNAL))
@@ -2123,7 +2110,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- mutex_lock(&u->readlock);
+ mutex_lock(&u->iolock);
skip = sk_peek_offset(sk, flags);
skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
@@ -2131,14 +2118,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
if (skb)
break;
- mutex_unlock(&u->readlock);
+ mutex_unlock(&u->iolock);
if (err != -EAGAIN)
break;
} while (timeo &&
!__skb_wait_for_more_packets(sk, &err, &timeo, last));
- if (!skb) { /* implies readlock unlocked */
+ if (!skb) { /* implies iolock unlocked */
unix_state_lock(sk);
/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
@@ -2203,7 +2190,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
out_free:
skb_free_datagram(sk, skb);
- mutex_unlock(&u->readlock);
+ mutex_unlock(&u->iolock);
out:
return err;
}
@@ -2298,7 +2285,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
/* Lock the socket to prevent queue disordering
* while sleeps in memcpy_tomsg
*/
- mutex_lock(&u->readlock);
+ mutex_lock(&u->iolock);
if (flags & MSG_PEEK)
skip = sk_peek_offset(sk, flags);
@@ -2340,7 +2327,7 @@ again:
break;
}
- mutex_unlock(&u->readlock);
+ mutex_unlock(&u->iolock);
timeo = unix_stream_data_wait(sk, timeo, last,
last_len);
@@ -2351,7 +2338,7 @@ again:
goto out;
}
- mutex_lock(&u->readlock);
+ mutex_lock(&u->iolock);
goto redo;
unlock:
unix_state_unlock(sk);
@@ -2454,7 +2441,7 @@ unlock:
}
} while (size);
- mutex_unlock(&u->readlock);
+ mutex_unlock(&u->iolock);
if (state->msg)
scm_recv(sock, state->msg, &scm, flags);
else
@@ -2495,9 +2482,9 @@ static ssize_t skb_unix_socket_splice(struct sock *sk,
int ret;
struct unix_sock *u = unix_sk(sk);
- mutex_unlock(&u->readlock);
+ mutex_unlock(&u->iolock);
ret = splice_to_pipe(pipe, spd);
- mutex_lock(&u->readlock);
+ mutex_lock(&u->iolock);
return ret;
}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 2029b49a1df3..4911cd997b9a 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1252,7 +1252,7 @@ static int __init cfg80211_init(void)
if (err)
goto out_fail_reg;
- cfg80211_wq = create_singlethread_workqueue("cfg80211");
+ cfg80211_wq = alloc_ordered_workqueue("cfg80211", WQ_MEM_RECLAIM);
if (!cfg80211_wq) {
err = -ENOMEM;
goto out_fail_wq;
diff --git a/net/wireless/core.h b/net/wireless/core.h
index eee91443924d..5555e3c13ae9 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -249,9 +249,9 @@ struct cfg80211_event {
};
struct cfg80211_cached_keys {
- struct key_params params[6];
- u8 data[6][WLAN_MAX_KEY_LEN];
- int def, defmgmt;
+ struct key_params params[4];
+ u8 data[4][WLAN_KEY_LEN_WEP104];
+ int def;
};
enum cfg80211_chan_mode {
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 4a4dda53bdf1..eafdfa5798ae 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -114,6 +114,9 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
}
}
+ if (WARN_ON(connkeys && connkeys->def < 0))
+ return -EINVAL;
+
if (WARN_ON(wdev->connect_keys))
kzfree(wdev->connect_keys);
wdev->connect_keys = connkeys;
@@ -284,18 +287,16 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
if (!netif_running(wdev->netdev))
return 0;
- if (wdev->wext.keys) {
+ if (wdev->wext.keys)
wdev->wext.keys->def = wdev->wext.default_key;
- wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
- }
wdev->wext.ibss.privacy = wdev->wext.default_key != -1;
- if (wdev->wext.keys) {
+ if (wdev->wext.keys && wdev->wext.keys->def != -1) {
ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
if (!ck)
return -ENOMEM;
- for (i = 0; i < 6; i++)
+ for (i = 0; i < 4; i++)
ck->params[i].key = ck->data[i];
}
err = __cfg80211_join_ibss(rdev, wdev->netdev,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index c284d883c349..d6abb0704db5 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -222,7 +222,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
ASSERT_WDEV_LOCK(wdev);
if (auth_type == NL80211_AUTHTYPE_SHARED_KEY)
- if (!key || !key_len || key_idx < 0 || key_idx > 4)
+ if (!key || !key_len || key_idx < 0 || key_idx > 3)
return -EINVAL;
if (wdev->current_bss &&
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 499785778983..fd111e2b559d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -848,13 +848,21 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
struct nlattr *key;
struct cfg80211_cached_keys *result;
int rem, err, def = 0;
+ bool have_key = false;
+
+ nla_for_each_nested(key, keys, rem) {
+ have_key = true;
+ break;
+ }
+
+ if (!have_key)
+ return NULL;
result = kzalloc(sizeof(*result), GFP_KERNEL);
if (!result)
return ERR_PTR(-ENOMEM);
result->def = -1;
- result->defmgmt = -1;
nla_for_each_nested(key, keys, rem) {
memset(&parse, 0, sizeof(parse));
@@ -866,7 +874,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
err = -EINVAL;
if (!parse.p.key)
goto error;
- if (parse.idx < 0 || parse.idx > 4)
+ if (parse.idx < 0 || parse.idx > 3)
goto error;
if (parse.def) {
if (def)
@@ -881,16 +889,24 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
parse.idx, false, NULL);
if (err)
goto error;
+ if (parse.p.cipher != WLAN_CIPHER_SUITE_WEP40 &&
+ parse.p.cipher != WLAN_CIPHER_SUITE_WEP104) {
+ err = -EINVAL;
+ goto error;
+ }
result->params[parse.idx].cipher = parse.p.cipher;
result->params[parse.idx].key_len = parse.p.key_len;
result->params[parse.idx].key = result->data[parse.idx];
memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len);
- if (parse.p.cipher == WLAN_CIPHER_SUITE_WEP40 ||
- parse.p.cipher == WLAN_CIPHER_SUITE_WEP104) {
- if (no_ht)
- *no_ht = true;
- }
+ /* must be WEP key if we got here */
+ if (no_ht)
+ *no_ht = true;
+ }
+
+ if (result->def < 0) {
+ err = -EINVAL;
+ goto error;
}
return result;
@@ -2525,10 +2541,35 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
int if_idx = 0;
int wp_start = cb->args[0];
int if_start = cb->args[1];
+ int filter_wiphy = -1;
struct cfg80211_registered_device *rdev;
struct wireless_dev *wdev;
rtnl_lock();
+ if (!cb->args[2]) {
+ struct nl80211_dump_wiphy_state state = {
+ .filter_wiphy = -1,
+ };
+ int ret;
+
+ ret = nl80211_dump_wiphy_parse(skb, cb, &state);
+ if (ret)
+ return ret;
+
+ filter_wiphy = state.filter_wiphy;
+
+ /*
+ * if filtering, set cb->args[2] to +1 since 0 is the default
+ * value needed to determine that parsing is necessary.
+ */
+ if (filter_wiphy >= 0)
+ cb->args[2] = filter_wiphy + 1;
+ else
+ cb->args[2] = -1;
+ } else if (cb->args[2] > 0) {
+ filter_wiphy = cb->args[2] - 1;
+ }
+
list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk)))
continue;
@@ -2536,6 +2577,10 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
wp_idx++;
continue;
}
+
+ if (filter_wiphy >= 0 && filter_wiphy != rdev->wiphy_idx)
+ continue;
+
if_idx = 0;
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
@@ -6969,7 +7014,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
params.n_counter_offsets_presp = len / sizeof(u16);
if (rdev->wiphy.max_num_csa_counters &&
- (params.n_counter_offsets_beacon >
+ (params.n_counter_offsets_presp >
rdev->wiphy.max_num_csa_counters))
return -EINVAL;
@@ -7359,7 +7404,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
(key.p.cipher != WLAN_CIPHER_SUITE_WEP104 ||
key.p.key_len != WLAN_KEY_LEN_WEP104))
return -EINVAL;
- if (key.idx > 4)
+ if (key.idx > 3)
return -EINVAL;
} else {
key.p.key_len = 0;
@@ -7977,6 +8022,8 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev,
}
data = nla_nest_start(skb, attr);
+ if (!data)
+ goto nla_put_failure;
((void **)skb->cb)[0] = rdev;
((void **)skb->cb)[1] = hdr;
@@ -9406,18 +9453,27 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
if (!freqs)
return -ENOBUFS;
- for (i = 0; i < req->n_channels; i++)
- nla_put_u32(msg, i, req->channels[i]->center_freq);
+ for (i = 0; i < req->n_channels; i++) {
+ if (nla_put_u32(msg, i, req->channels[i]->center_freq))
+ return -ENOBUFS;
+ }
nla_nest_end(msg, freqs);
if (req->n_match_sets) {
matches = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_MATCH);
+ if (!matches)
+ return -ENOBUFS;
+
for (i = 0; i < req->n_match_sets; i++) {
match = nla_nest_start(msg, i);
- nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID,
- req->match_sets[i].ssid.ssid_len,
- req->match_sets[i].ssid.ssid);
+ if (!match)
+ return -ENOBUFS;
+
+ if (nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID,
+ req->match_sets[i].ssid.ssid_len,
+ req->match_sets[i].ssid.ssid))
+ return -ENOBUFS;
nla_nest_end(msg, match);
}
nla_nest_end(msg, matches);
@@ -9429,6 +9485,9 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
for (i = 0; i < req->n_scan_plans; i++) {
scan_plan = nla_nest_start(msg, i + 1);
+ if (!scan_plan)
+ return -ENOBUFS;
+
if (!scan_plan ||
nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL,
req->scan_plans[i].interval) ||
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 0358e12be54b..b5bd58d0f731 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -352,52 +352,48 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev)
__cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE);
}
-const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
+const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len,
+ const u8 *match, int match_len,
+ int match_offset)
{
- while (len > 2 && ies[0] != eid) {
+ /* match_offset can't be smaller than 2, unless match_len is
+ * zero, in which case match_offset must be zero as well.
+ */
+ if (WARN_ON((match_len && match_offset < 2) ||
+ (!match_len && match_offset)))
+ return NULL;
+
+ while (len >= 2 && len >= ies[1] + 2) {
+ if ((ies[0] == eid) &&
+ (ies[1] + 2 >= match_offset + match_len) &&
+ !memcmp(ies + match_offset, match, match_len))
+ return ies;
+
len -= ies[1] + 2;
ies += ies[1] + 2;
}
- if (len < 2)
- return NULL;
- if (len < 2 + ies[1])
- return NULL;
- return ies;
+
+ return NULL;
}
-EXPORT_SYMBOL(cfg80211_find_ie);
+EXPORT_SYMBOL(cfg80211_find_ie_match);
const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
const u8 *ies, int len)
{
- struct ieee80211_vendor_ie *ie;
- const u8 *pos = ies, *end = ies + len;
- int ie_oui;
+ const u8 *ie;
+ u8 match[] = { oui >> 16, oui >> 8, oui, oui_type };
+ int match_len = (oui_type < 0) ? 3 : sizeof(match);
if (WARN_ON(oui_type > 0xff))
return NULL;
- while (pos < end) {
- pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos,
- end - pos);
- if (!pos)
- return NULL;
-
- ie = (struct ieee80211_vendor_ie *)pos;
-
- /* make sure we can access ie->len */
- BUILD_BUG_ON(offsetof(struct ieee80211_vendor_ie, len) != 1);
+ ie = cfg80211_find_ie_match(WLAN_EID_VENDOR_SPECIFIC, ies, len,
+ match, match_len, 2);
- if (ie->len < sizeof(*ie))
- goto cont;
+ if (ie && (ie[1] < 4))
+ return NULL;
- ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2];
- if (ie_oui == oui &&
- (oui_type < 0 || ie->oui_type == oui_type))
- return pos;
-cont:
- pos += 2 + ie->len;
- }
- return NULL;
+ return ie;
}
EXPORT_SYMBOL(cfg80211_find_vendor_ie);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index add6824c44fd..c08a3b57dca1 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -1043,6 +1043,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
connect->crypto.ciphers_pairwise[0] = cipher;
}
}
+ } else {
+ if (WARN_ON(connkeys))
+ return -EINVAL;
}
wdev->connect_keys = connkeys;
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index e46469bc130f..0082f4b01795 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -57,7 +57,7 @@ static ssize_t addresses_show(struct device *dev,
return sprintf(buf, "%pM\n", wiphy->perm_addr);
for (i = 0; i < wiphy->n_addresses; i++)
- buf += sprintf(buf, "%pM\n", &wiphy->addresses[i].addr);
+ buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr);
return buf - start;
}
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 0675f513e7b9..9e6e2aaa7766 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -218,7 +218,7 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
struct key_params *params, int key_idx,
bool pairwise, const u8 *mac_addr)
{
- if (key_idx > 5)
+ if (key_idx < 0 || key_idx > 5)
return -EINVAL;
if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
@@ -249,7 +249,13 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
/* Disallow BIP (group-only) cipher as pairwise cipher */
if (pairwise)
return -EINVAL;
+ if (key_idx < 4)
+ return -EINVAL;
break;
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ if (key_idx > 3)
+ return -EINVAL;
default:
break;
}
@@ -906,7 +912,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
if (!wdev->connect_keys)
return;
- for (i = 0; i < 6; i++) {
+ for (i = 0; i < 4; i++) {
if (!wdev->connect_keys->params[i].cipher)
continue;
if (rdev_add_key(rdev, dev, i, false, NULL,
@@ -919,9 +925,6 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
netdev_err(dev, "failed to set defkey %d\n", i);
continue;
}
- if (wdev->connect_keys->defmgmt == i)
- if (rdev_set_default_mgmt_key(rdev, dev, i))
- netdev_err(dev, "failed to set mgtdef %d\n", i);
}
kzfree(wdev->connect_keys);
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 9f27221c8913..7b97d43b27e1 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -408,10 +408,10 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
if (!wdev->wext.keys) {
wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys),
- GFP_KERNEL);
+ GFP_KERNEL);
if (!wdev->wext.keys)
return -ENOMEM;
- for (i = 0; i < 6; i++)
+ for (i = 0; i < 4; i++)
wdev->wext.keys->params[i].key =
wdev->wext.keys->data[i];
}
@@ -460,7 +460,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
if (err == -ENOENT)
err = 0;
if (!err) {
- if (!addr) {
+ if (!addr && idx < 4) {
memset(wdev->wext.keys->data[idx], 0,
sizeof(wdev->wext.keys->data[idx]));
wdev->wext.keys->params[idx].key_len = 0;
@@ -487,6 +487,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
err = 0;
if (wdev->current_bss)
err = rdev_add_key(rdev, dev, idx, pairwise, addr, params);
+ else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 &&
+ params->cipher != WLAN_CIPHER_SUITE_WEP104)
+ return -EINVAL;
if (err)
return err;
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index dbb2738e356a..6250b1cfcde5 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -958,29 +958,8 @@ static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
return private(dev, iwr, cmd, info, handler);
}
/* Old driver API : call driver ioctl handler */
- if (dev->netdev_ops->ndo_do_ioctl) {
-#ifdef CONFIG_COMPAT
- if (info->flags & IW_REQUEST_FLAG_COMPAT) {
- int ret = 0;
- struct iwreq iwr_lcl;
- struct compat_iw_point *iwp_compat = (void *) &iwr->u.data;
-
- memcpy(&iwr_lcl, iwr, sizeof(struct iwreq));
- iwr_lcl.u.data.pointer = compat_ptr(iwp_compat->pointer);
- iwr_lcl.u.data.length = iwp_compat->length;
- iwr_lcl.u.data.flags = iwp_compat->flags;
-
- ret = dev->netdev_ops->ndo_do_ioctl(dev, (void *) &iwr_lcl, cmd);
-
- iwp_compat->pointer = ptr_to_compat(iwr_lcl.u.data.pointer);
- iwp_compat->length = iwr_lcl.u.data.length;
- iwp_compat->flags = iwr_lcl.u.data.flags;
-
- return ret;
- } else
-#endif
- return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
- }
+ if (dev->netdev_ops->ndo_do_ioctl)
+ return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
return -EOPNOTSUPP;
}
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index a4e8af3321d2..88f1f6931ab8 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -35,7 +35,6 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
if (wdev->wext.keys) {
wdev->wext.keys->def = wdev->wext.default_key;
- wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
if (wdev->wext.default_key != -1)
wdev->wext.connect.privacy = true;
}
@@ -43,11 +42,11 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
if (!wdev->wext.connect.ssid_len)
return 0;
- if (wdev->wext.keys) {
+ if (wdev->wext.keys && wdev->wext.keys->def != -1) {
ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
if (!ck)
return -ENOMEM;
- for (i = 0; i < 6; i++)
+ for (i = 0; i < 4; i++)
ck->params[i].key = ck->data[i];
}
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index a750f330b8dd..f83b74d3e2ac 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1500,12 +1500,8 @@ out_fac_release:
goto out_dtefac_release;
if (dtefacs.calling_len > X25_MAX_AE_LEN)
goto out_dtefac_release;
- if (dtefacs.calling_ae == NULL)
- goto out_dtefac_release;
if (dtefacs.called_len > X25_MAX_AE_LEN)
goto out_dtefac_release;
- if (dtefacs.called_ae == NULL)
- goto out_dtefac_release;
x25->dte_facilities = dtefacs;
rc = 0;
out_dtefac_release:
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 250e567ba3d6..44ac85fe2bc9 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -17,7 +17,7 @@
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <net/xfrm.h>
-#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
+#if IS_ENABLED(CONFIG_INET_ESP) || IS_ENABLED(CONFIG_INET6_ESP)
#include <net/esp.h>
#endif
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 1c4ad477ce93..6e3f0254d8a1 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -207,15 +207,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
family = XFRM_SPI_SKB_CB(skb)->family;
/* if tunnel is present override skb->mark value with tunnel i_key */
- if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) {
- switch (family) {
- case AF_INET:
+ switch (family) {
+ case AF_INET:
+ if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4)
mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key);
- break;
- case AF_INET6:
+ break;
+ case AF_INET6:
+ if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6)
mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key);
- break;
- }
+ break;
}
/* Allocate new secpath or COW existing one. */
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b5e665b3cfb0..fd6986634e6f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -49,6 +49,7 @@ static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
__read_mostly;
static struct kmem_cache *xfrm_dst_cache __read_mostly;
+static __read_mostly seqcount_t xfrm_policy_hash_generation;
static void xfrm_init_pmtu(struct dst_entry *dst);
static int stale_bundle(struct dst_entry *dst);
@@ -59,6 +60,11 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
int dir);
+static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
+{
+ return atomic_inc_not_zero(&policy->refcnt);
+}
+
static inline bool
__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
@@ -385,9 +391,11 @@ static struct hlist_head *policy_hash_bysel(struct net *net,
__get_hash_thresh(net, family, dir, &dbits, &sbits);
hash = __sel_hash(sel, family, hmask, dbits, sbits);
- return (hash == hmask + 1 ?
- &net->xfrm.policy_inexact[dir] :
- net->xfrm.policy_bydst[dir].table + hash);
+ if (hash == hmask + 1)
+ return &net->xfrm.policy_inexact[dir];
+
+ return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}
static struct hlist_head *policy_hash_direct(struct net *net,
@@ -403,7 +411,8 @@ static struct hlist_head *policy_hash_direct(struct net *net,
__get_hash_thresh(net, family, dir, &dbits, &sbits);
hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);
- return net->xfrm.policy_bydst[dir].table + hash;
+ return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}
static void xfrm_dst_hash_transfer(struct net *net,
@@ -426,14 +435,14 @@ redo:
h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
pol->family, nhashmask, dbits, sbits);
if (!entry0) {
- hlist_del(&pol->bydst);
- hlist_add_head(&pol->bydst, ndsttable+h);
+ hlist_del_rcu(&pol->bydst);
+ hlist_add_head_rcu(&pol->bydst, ndsttable + h);
h0 = h;
} else {
if (h != h0)
continue;
- hlist_del(&pol->bydst);
- hlist_add_behind(&pol->bydst, entry0);
+ hlist_del_rcu(&pol->bydst);
+ hlist_add_behind_rcu(&pol->bydst, entry0);
}
entry0 = &pol->bydst;
}
@@ -468,22 +477,32 @@ static void xfrm_bydst_resize(struct net *net, int dir)
unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
unsigned int nhashmask = xfrm_new_hash_mask(hmask);
unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
- struct hlist_head *odst = net->xfrm.policy_bydst[dir].table;
struct hlist_head *ndst = xfrm_hash_alloc(nsize);
+ struct hlist_head *odst;
int i;
if (!ndst)
return;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ write_seqcount_begin(&xfrm_policy_hash_generation);
+
+ odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock));
+
+ odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock));
for (i = hmask; i >= 0; i--)
xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);
- net->xfrm.policy_bydst[dir].table = ndst;
+ rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
net->xfrm.policy_bydst[dir].hmask = nhashmask;
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ write_seqcount_end(&xfrm_policy_hash_generation);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ synchronize_rcu();
xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}
@@ -500,7 +519,7 @@ static void xfrm_byidx_resize(struct net *net, int total)
if (!nidx)
return;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
for (i = hmask; i >= 0; i--)
xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
@@ -508,7 +527,7 @@ static void xfrm_byidx_resize(struct net *net, int total)
net->xfrm.policy_byidx = nidx;
net->xfrm.policy_idx_hmask = nhashmask;
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}
@@ -541,7 +560,6 @@ static inline int xfrm_byidx_should_resize(struct net *net, int total)
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
- read_lock_bh(&net->xfrm.xfrm_policy_lock);
si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
@@ -550,7 +568,6 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
si->spdhcnt = net->xfrm.policy_idx_hmask;
si->spdhmcnt = xfrm_policy_hashmax;
- read_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_spd_getinfo);
@@ -600,7 +617,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
rbits6 = net->xfrm.policy_hthresh.rbits6;
} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
/* reset the bydst and inexact table in all directions */
for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
@@ -626,6 +643,10 @@ static void xfrm_hash_rebuild(struct work_struct *work)
/* re-insert all policies by order of creation */
list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+ if (xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) {
+ /* skip socket policies */
+ continue;
+ }
newpos = NULL;
chain = policy_hash_bysel(net, &policy->selector,
policy->family,
@@ -642,7 +663,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
hlist_add_head(&policy->bydst, chain);
}
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
mutex_unlock(&hash_resize_mutex);
}
@@ -753,7 +774,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
struct hlist_head *chain;
struct hlist_node *newpos;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
delpol = NULL;
newpos = NULL;
@@ -764,7 +785,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
xfrm_sec_ctx_match(pol->security, policy->security) &&
!WARN_ON(delpol)) {
if (excl) {
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return -EEXIST;
}
delpol = pol;
@@ -800,7 +821,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
policy->curlft.use_time = 0;
if (!mod_timer(&policy->timer, jiffies + HZ))
xfrm_pol_hold(policy);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (delpol)
xfrm_policy_kill(delpol);
@@ -820,7 +841,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
struct hlist_head *chain;
*err = 0;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, sel, sel->family, dir);
ret = NULL;
hlist_for_each_entry(pol, chain, bydst) {
@@ -833,7 +854,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
*err = security_xfrm_policy_delete(
pol->security);
if (*err) {
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return pol;
}
__xfrm_policy_unlink(pol, dir);
@@ -842,7 +863,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
break;
}
}
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (ret && delete)
xfrm_policy_kill(ret);
@@ -861,7 +882,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
return NULL;
*err = 0;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = net->xfrm.policy_byidx + idx_hash(net, id);
ret = NULL;
hlist_for_each_entry(pol, chain, byidx) {
@@ -872,7 +893,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
*err = security_xfrm_policy_delete(
pol->security);
if (*err) {
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return pol;
}
__xfrm_policy_unlink(pol, dir);
@@ -881,7 +902,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
break;
}
}
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (ret && delete)
xfrm_policy_kill(ret);
@@ -939,7 +960,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
int dir, err = 0, cnt = 0;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
err = xfrm_policy_flush_secctx_check(net, type, task_valid);
if (err)
@@ -955,14 +976,14 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
if (pol->type != type)
continue;
__xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
cnt++;
xfrm_audit_policy_delete(pol, 1, task_valid);
xfrm_policy_kill(pol);
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
goto again1;
}
@@ -974,13 +995,13 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
if (pol->type != type)
continue;
__xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
cnt++;
xfrm_audit_policy_delete(pol, 1, task_valid);
xfrm_policy_kill(pol);
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
goto again2;
}
}
@@ -989,7 +1010,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
if (!cnt)
err = -ESRCH;
out:
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);
@@ -1009,7 +1030,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
if (list_empty(&walk->walk.all) && walk->seq != 0)
return 0;
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
if (list_empty(&walk->walk.all))
x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
else
@@ -1037,7 +1058,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
}
list_del_init(&walk->walk.all);
out:
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);
@@ -1056,9 +1077,9 @@ void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
if (list_empty(&walk->walk.all))
return;
- write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
list_del(&walk->walk.all);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_policy_walk_done);
@@ -1096,17 +1117,24 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
struct xfrm_policy *pol, *ret;
const xfrm_address_t *daddr, *saddr;
struct hlist_head *chain;
- u32 priority = ~0U;
+ unsigned int sequence;
+ u32 priority;
daddr = xfrm_flowi_daddr(fl, family);
saddr = xfrm_flowi_saddr(fl, family);
if (unlikely(!daddr || !saddr))
return NULL;
- read_lock_bh(&net->xfrm.xfrm_policy_lock);
- chain = policy_hash_direct(net, daddr, saddr, family, dir);
+ rcu_read_lock();
+ retry:
+ do {
+ sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+ chain = policy_hash_direct(net, daddr, saddr, family, dir);
+ } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+ priority = ~0U;
ret = NULL;
- hlist_for_each_entry(pol, chain, bydst) {
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
err = xfrm_policy_match(pol, fl, type, family, dir);
if (err) {
if (err == -ESRCH)
@@ -1122,7 +1150,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
}
chain = &net->xfrm.policy_inexact[dir];
- hlist_for_each_entry(pol, chain, bydst) {
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
if ((pol->priority >= priority) && ret)
break;
@@ -1140,9 +1168,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
}
- xfrm_pol_hold(ret);
+ if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
+ goto retry;
+
+ if (ret && !xfrm_pol_hold_rcu(ret))
+ goto retry;
fail:
- read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ rcu_read_unlock();
return ret;
}
@@ -1219,10 +1251,9 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
const struct flowi *fl)
{
struct xfrm_policy *pol;
- struct net *net = sock_net(sk);
rcu_read_lock();
- read_lock_bh(&net->xfrm.xfrm_policy_lock);
+ again:
pol = rcu_dereference(sk->sk_policy[dir]);
if (pol != NULL) {
bool match = xfrm_selector_match(&pol->selector, fl,
@@ -1237,8 +1268,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
err = security_xfrm_policy_lookup(pol->security,
fl->flowi_secid,
policy_to_flow_dir(dir));
- if (!err)
- xfrm_pol_hold(pol);
+ if (!err && !xfrm_pol_hold_rcu(pol))
+ goto again;
else if (err == -ESRCH)
pol = NULL;
else
@@ -1247,7 +1278,6 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
pol = NULL;
}
out:
- read_unlock_bh(&net->xfrm.xfrm_policy_lock);
rcu_read_unlock();
return pol;
}
@@ -1271,7 +1301,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
/* Socket policies are not hashed. */
if (!hlist_unhashed(&pol->bydst)) {
- hlist_del(&pol->bydst);
+ hlist_del_rcu(&pol->bydst);
hlist_del(&pol->byidx);
}
@@ -1295,9 +1325,9 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
struct net *net = xp_net(pol);
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
pol = __xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (pol) {
xfrm_policy_kill(pol);
return 0;
@@ -1316,7 +1346,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
return -EINVAL;
#endif
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
old_pol = rcu_dereference_protected(sk->sk_policy[dir],
lockdep_is_held(&net->xfrm.xfrm_policy_lock));
if (pol) {
@@ -1334,7 +1364,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
*/
xfrm_sk_policy_unlink(old_pol, dir);
}
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (old_pol) {
xfrm_policy_kill(old_pol);
@@ -1364,9 +1394,9 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
newp->type = old->type;
memcpy(newp->xfrm_vec, old->xfrm_vec,
newp->xfrm_nr*sizeof(struct xfrm_tmpl));
- write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
xfrm_sk_policy_link(newp, dir);
- write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
xfrm_pol_put(newp);
}
return newp;
@@ -3048,7 +3078,7 @@ static int __net_init xfrm_net_init(struct net *net)
/* Initialize the per-net locks here */
spin_lock_init(&net->xfrm.xfrm_state_lock);
- rwlock_init(&net->xfrm.xfrm_policy_lock);
+ spin_lock_init(&net->xfrm.xfrm_policy_lock);
mutex_init(&net->xfrm.xfrm_cfg_mutex);
return 0;
@@ -3082,6 +3112,7 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
void __init xfrm_init(void)
{
register_pernet_subsys(&xfrm_net_ops);
+ seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
}
@@ -3179,7 +3210,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
struct hlist_head *chain;
u32 priority = ~0U;
- read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
hlist_for_each_entry(pol, chain, bydst) {
if (xfrm_migrate_selector_match(sel, &pol->selector) &&
@@ -3203,7 +3234,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
xfrm_pol_hold(ret);
- read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return ret;
}
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 4fd725a0c500..cdc2e2e71bff 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -558,7 +558,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
x->repl->notify(x, XFRM_REPLAY_UPDATE);
}
-static struct xfrm_replay xfrm_replay_legacy = {
+static const struct xfrm_replay xfrm_replay_legacy = {
.advance = xfrm_replay_advance,
.check = xfrm_replay_check,
.recheck = xfrm_replay_check,
@@ -566,7 +566,7 @@ static struct xfrm_replay xfrm_replay_legacy = {
.overflow = xfrm_replay_overflow,
};
-static struct xfrm_replay xfrm_replay_bmp = {
+static const struct xfrm_replay xfrm_replay_bmp = {
.advance = xfrm_replay_advance_bmp,
.check = xfrm_replay_check_bmp,
.recheck = xfrm_replay_check_bmp,
@@ -574,7 +574,7 @@ static struct xfrm_replay xfrm_replay_bmp = {
.overflow = xfrm_replay_overflow_bmp,
};
-static struct xfrm_replay xfrm_replay_esn = {
+static const struct xfrm_replay xfrm_replay_esn = {
.advance = xfrm_replay_advance_esn,
.check = xfrm_replay_check_esn,
.recheck = xfrm_replay_recheck_esn,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 9895a8c56d8c..419bf5d463bd 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -28,6 +28,11 @@
#include "xfrm_hash.h"
+#define xfrm_state_deref_prot(table, net) \
+ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock))
+
+static void xfrm_state_gc_task(struct work_struct *work);
+
/* Each xfrm_state may be linked to two tables:
1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
@@ -36,6 +41,15 @@
*/
static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
+static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation);
+
+static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
+static HLIST_HEAD(xfrm_state_gc_list);
+
+static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x)
+{
+ return atomic_inc_not_zero(&x->refcnt);
+}
static inline unsigned int xfrm_dst_hash(struct net *net,
const xfrm_address_t *daddr,
@@ -76,18 +90,18 @@ static void xfrm_hash_transfer(struct hlist_head *list,
h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
x->props.reqid, x->props.family,
nhashmask);
- hlist_add_head(&x->bydst, ndsttable+h);
+ hlist_add_head_rcu(&x->bydst, ndsttable + h);
h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr,
x->props.family,
nhashmask);
- hlist_add_head(&x->bysrc, nsrctable+h);
+ hlist_add_head_rcu(&x->bysrc, nsrctable + h);
if (x->id.spi) {
h = __xfrm_spi_hash(&x->id.daddr, x->id.spi,
x->id.proto, x->props.family,
nhashmask);
- hlist_add_head(&x->byspi, nspitable+h);
+ hlist_add_head_rcu(&x->byspi, nspitable + h);
}
}
}
@@ -122,25 +136,29 @@ static void xfrm_hash_resize(struct work_struct *work)
}
spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ write_seqcount_begin(&xfrm_state_hash_generation);
nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
+ odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net);
for (i = net->xfrm.state_hmask; i >= 0; i--)
- xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi,
- nhashmask);
+ xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nhashmask);
- odst = net->xfrm.state_bydst;
- osrc = net->xfrm.state_bysrc;
- ospi = net->xfrm.state_byspi;
+ osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net);
+ ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net);
ohashmask = net->xfrm.state_hmask;
- net->xfrm.state_bydst = ndst;
- net->xfrm.state_bysrc = nsrc;
- net->xfrm.state_byspi = nspi;
+ rcu_assign_pointer(net->xfrm.state_bydst, ndst);
+ rcu_assign_pointer(net->xfrm.state_bysrc, nsrc);
+ rcu_assign_pointer(net->xfrm.state_byspi, nspi);
net->xfrm.state_hmask = nhashmask;
+ write_seqcount_end(&xfrm_state_hash_generation);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
osize = (ohashmask + 1) * sizeof(struct hlist_head);
+
+ synchronize_rcu();
+
xfrm_hash_free(odst, osize);
xfrm_hash_free(osrc, osize);
xfrm_hash_free(ospi, osize);
@@ -332,6 +350,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
{
tasklet_hrtimer_cancel(&x->mtimer);
del_timer_sync(&x->rtimer);
+ kfree(x->aead);
kfree(x->aalg);
kfree(x->ealg);
kfree(x->calg);
@@ -355,15 +374,16 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
static void xfrm_state_gc_task(struct work_struct *work)
{
- struct net *net = container_of(work, struct net, xfrm.state_gc_work);
struct xfrm_state *x;
struct hlist_node *tmp;
struct hlist_head gc_list;
spin_lock_bh(&xfrm_state_gc_lock);
- hlist_move_list(&net->xfrm.state_gc_list, &gc_list);
+ hlist_move_list(&xfrm_state_gc_list, &gc_list);
spin_unlock_bh(&xfrm_state_gc_lock);
+ synchronize_rcu();
+
hlist_for_each_entry_safe(x, tmp, &gc_list, gclist)
xfrm_state_gc_destroy(x);
}
@@ -500,14 +520,12 @@ EXPORT_SYMBOL(xfrm_state_alloc);
void __xfrm_state_destroy(struct xfrm_state *x)
{
- struct net *net = xs_net(x);
-
WARN_ON(x->km.state != XFRM_STATE_DEAD);
spin_lock_bh(&xfrm_state_gc_lock);
- hlist_add_head(&x->gclist, &net->xfrm.state_gc_list);
+ hlist_add_head(&x->gclist, &xfrm_state_gc_list);
spin_unlock_bh(&xfrm_state_gc_lock);
- schedule_work(&net->xfrm.state_gc_work);
+ schedule_work(&xfrm_state_gc_work);
}
EXPORT_SYMBOL(__xfrm_state_destroy);
@@ -520,10 +538,10 @@ int __xfrm_state_delete(struct xfrm_state *x)
x->km.state = XFRM_STATE_DEAD;
spin_lock(&net->xfrm.xfrm_state_lock);
list_del(&x->km.all);
- hlist_del(&x->bydst);
- hlist_del(&x->bysrc);
+ hlist_del_rcu(&x->bydst);
+ hlist_del_rcu(&x->bysrc);
if (x->id.spi)
- hlist_del(&x->byspi);
+ hlist_del_rcu(&x->byspi);
net->xfrm.state_num--;
spin_unlock(&net->xfrm.xfrm_state_lock);
@@ -659,7 +677,7 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family);
struct xfrm_state *x;
- hlist_for_each_entry(x, net->xfrm.state_byspi+h, byspi) {
+ hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) {
if (x->props.family != family ||
x->id.spi != spi ||
x->id.proto != proto ||
@@ -668,7 +686,8 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
if ((mark & x->mark.m) != x->mark.v)
continue;
- xfrm_state_hold(x);
+ if (!xfrm_state_hold_rcu(x))
+ continue;
return x;
}
@@ -683,7 +702,7 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
unsigned int h = xfrm_src_hash(net, daddr, saddr, family);
struct xfrm_state *x;
- hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) {
+ hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) {
if (x->props.family != family ||
x->id.proto != proto ||
!xfrm_addr_equal(&x->id.daddr, daddr, family) ||
@@ -692,7 +711,8 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
if ((mark & x->mark.m) != x->mark.v)
continue;
- xfrm_state_hold(x);
+ if (!xfrm_state_hold_rcu(x))
+ continue;
return x;
}
@@ -775,13 +795,16 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
struct xfrm_state *best = NULL;
u32 mark = pol->mark.v & pol->mark.m;
unsigned short encap_family = tmpl->encap_family;
+ unsigned int sequence;
struct km_event c;
to_put = NULL;
- spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ sequence = read_seqcount_begin(&xfrm_state_hash_generation);
+
+ rcu_read_lock();
h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
- hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
+ hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) {
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
@@ -797,7 +820,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
goto found;
h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
- hlist_for_each_entry(x, net->xfrm.state_bydst+h_wildcard, bydst) {
+ hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) {
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
@@ -850,19 +873,21 @@ found:
}
if (km_query(x, tmpl, pol) == 0) {
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
x->km.state = XFRM_STATE_ACQ;
list_add(&x->km.all, &net->xfrm.state_all);
- hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+ hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
h = xfrm_src_hash(net, daddr, saddr, encap_family);
- hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+ hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
if (x->id.spi) {
h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
- hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+ hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
}
x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
net->xfrm.state_num++;
xfrm_hash_grow_check(net, x->bydst.next != NULL);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
} else {
x->km.state = XFRM_STATE_DEAD;
to_put = x;
@@ -871,13 +896,26 @@ found:
}
}
out:
- if (x)
- xfrm_state_hold(x);
- else
+ if (x) {
+ if (!xfrm_state_hold_rcu(x)) {
+ *err = -EAGAIN;
+ x = NULL;
+ }
+ } else {
*err = acquire_in_progress ? -EAGAIN : error;
- spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ }
+ rcu_read_unlock();
if (to_put)
xfrm_state_put(to_put);
+
+ if (read_seqcount_retry(&xfrm_state_hash_generation, sequence)) {
+ *err = -EAGAIN;
+ if (x) {
+ xfrm_state_put(x);
+ x = NULL;
+ }
+ }
+
return x;
}
@@ -945,16 +983,16 @@ static void __xfrm_state_insert(struct xfrm_state *x)
h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
x->props.reqid, x->props.family);
- hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+ hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family);
- hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+ hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
if (x->id.spi) {
h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto,
x->props.family);
- hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+ hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
}
tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
@@ -1063,9 +1101,9 @@ static struct xfrm_state *__find_acq_core(struct net *net,
xfrm_state_hold(x);
tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
list_add(&x->km.all, &net->xfrm.state_all);
- hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+ hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
h = xfrm_src_hash(net, daddr, saddr, family);
- hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+ hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
net->xfrm.state_num++;
@@ -1394,9 +1432,9 @@ xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32
{
struct xfrm_state *x;
- spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ rcu_read_lock();
x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family);
- spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ rcu_read_unlock();
return x;
}
EXPORT_SYMBOL(xfrm_state_lookup);
@@ -1581,7 +1619,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high)
if (x->id.spi) {
spin_lock_bh(&net->xfrm.xfrm_state_lock);
h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
- hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+ hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
err = 0;
@@ -2099,8 +2137,6 @@ int __net_init xfrm_state_init(struct net *net)
net->xfrm.state_num = 0;
INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize);
- INIT_HLIST_HEAD(&net->xfrm.state_gc_list);
- INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task);
spin_lock_init(&net->xfrm.xfrm_state_lock);
return 0;
@@ -2118,7 +2154,7 @@ void xfrm_state_fini(struct net *net)
flush_work(&net->xfrm.state_hash_work);
xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
- flush_work(&net->xfrm.state_gc_work);
+ flush_work(&xfrm_state_gc_work);
WARN_ON(!list_empty(&net->xfrm.state_all));
diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c
index 05a6e3d9c258..35a7e794ad04 100644
--- a/net/xfrm/xfrm_sysctl.c
+++ b/net/xfrm/xfrm_sysctl.c
@@ -17,13 +17,13 @@ static struct ctl_table xfrm_table[] = {
.procname = "xfrm_aevent_etime",
.maxlen = sizeof(u32),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_douintvec
},
{
.procname = "xfrm_aevent_rseqth",
.maxlen = sizeof(u32),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_douintvec
},
{
.procname = "xfrm_larval_drop",
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index d516845e16e3..08892091cfe3 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -581,9 +581,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
if (err)
goto error;
- if (attrs[XFRMA_SEC_CTX] &&
- security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX])))
- goto error;
+ if (attrs[XFRMA_SEC_CTX]) {
+ err = security_xfrm_state_alloc(x,
+ nla_data(attrs[XFRMA_SEC_CTX]));
+ if (err)
+ goto error;
+ }
if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
attrs[XFRMA_REPLAY_ESN_VAL])))
@@ -896,7 +899,8 @@ static int xfrm_dump_sa_done(struct netlink_callback *cb)
struct sock *sk = cb->skb->sk;
struct net *net = sock_net(sk);
- xfrm_state_walk_done(walk, net);
+ if (cb->args[0])
+ xfrm_state_walk_done(walk, net);
return 0;
}
@@ -921,8 +925,6 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
u8 proto = 0;
int err;
- cb->args[0] = 1;
-
err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX,
xfrma_policy);
if (err < 0)
@@ -939,6 +941,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
proto = nla_get_u8(attrs[XFRMA_PROTO]);
xfrm_state_walk_init(walk, proto, filter);
+ cb->args[0] = 1;
}
(void) xfrm_state_walk(net, walk, dump_one_state, &info);
@@ -2051,9 +2054,6 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
if (up->hard) {
xfrm_policy_delete(xp, p->dir);
xfrm_audit_policy_delete(xp, 1, true);
- } else {
- // reset the timers here?
- WARN(1, "Don't know what to do with soft policy expire\n");
}
km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid);
@@ -2117,7 +2117,7 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
err = verify_newpolicy_info(&ua->policy);
if (err)
- goto bad_policy;
+ goto free_state;
/* build an XP */
xp = xfrm_policy_construct(net, &ua->policy, attrs, &err);
@@ -2149,8 +2149,6 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
-bad_policy:
- WARN(1, "BAD policy passed\n");
free_state:
kfree(x);
nomem: