diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 1 | ||||
-rw-r--r-- | net/ipv4/arp.c | 9 | ||||
-rw-r--r-- | net/ipv4/devinet.c | 2 | ||||
-rw-r--r-- | net/ipv4/esp4.c | 3 | ||||
-rw-r--r-- | net/ipv4/fou_bpf.c | 2 | ||||
-rw-r--r-- | net/ipv4/gre_demux.c | 2 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 3 | ||||
-rw-r--r-- | net/ipv4/inet_fragment.c | 4 | ||||
-rw-r--r-- | net/ipv4/inet_timewait_sock.c | 16 | ||||
-rw-r--r-- | net/ipv4/ip_fragment.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 144 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 111 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 82 | ||||
-rw-r--r-- | net/ipv4/ip_vti.c | 41 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 33 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 2 | ||||
-rw-r--r-- | net/ipv4/route.c | 16 | ||||
-rw-r--r-- | net/ipv4/syncookies.c | 3 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 39 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 71 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 36 | ||||
-rw-r--r-- | net/ipv4/tcp_metrics.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp_offload.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 108 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 4 | ||||
-rw-r--r-- | net/ipv4/udp.c | 55 | ||||
-rw-r--r-- | net/ipv4/udp_tunnel_core.c | 5 | ||||
-rw-r--r-- | net/ipv4/xfrm4_policy.c | 2 |
30 files changed, 506 insertions, 314 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 55bd72997b31..a7cfeda28bb2 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1072,6 +1072,7 @@ const struct proto_ops inet_stream_ops = { #endif .splice_eof = inet_splice_eof, .splice_read = tcp_splice_read, + .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 0d0d725b46ad..ab82ca104496 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -456,7 +456,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) /*unsigned long now; */ struct net *net = dev_net(dev); - rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev)); + rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), + RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { @@ -1056,7 +1057,8 @@ static int arp_req_set(struct net *net, struct arpreq *r, if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (!dev) { - struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + struct rtable *rt = ip_route_output(net, ip, 0, 0, 0, + RT_SCOPE_LINK); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -1188,7 +1190,8 @@ static int arp_req_delete(struct net *net, struct arpreq *r, ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (!dev) { - struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + struct rtable *rt = ip_route_output(net, ip, 0, 0, 0, + RT_SCOPE_LINK); if (IS_ERR(rt)) return PTR_ERR(rt); dev = rt->dst.dev; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7a437f0d4190..7592f242336b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2749,7 +2749,7 @@ err_alloc_all: static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d33d12421814..dff04580318f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -20,6 +20,7 @@ #include <net/udp.h> #include <net/tcp.h> #include <net/espintcp.h> +#include <linux/skbuff_ref.h> #include <linux/highmem.h> @@ -114,7 +115,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(sg_page(sg), skb->pp_recycle); } #ifdef CONFIG_INET_ESPINTCP diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c index 06e5572f296f..54984f3170a8 100644 --- a/net/ipv4/fou_bpf.c +++ b/net/ipv4/fou_bpf.c @@ -64,7 +64,7 @@ __bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, info->encap.type = TUNNEL_ENCAP_NONE; } - if (info->key.tun_flags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)) info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM; info->encap.sport = encap->sport; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 3757fd93523f..6701a98d9a9f 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -73,7 +73,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; - tpi->flags = gre_flags_to_tnl_flags(greh->flags); + gre_flags_to_tnl_flags(tpi->flags, greh->flags); hdr_len = gre_calc_hlen(tpi->flags); if (!pskb_may_pull(skb, nhs + hdr_len)) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 717e97a389a8..9bf09de6a2e7 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1842,7 +1842,8 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) if (!dev) { struct rtable *rt = ip_route_output(net, imr->imr_multiaddr.s_addr, - 0, 0, 0); + 0, 0, 0, + RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { dev = rt->dst.dev; ip_rt_put(rt); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c88c9034d630..faaec92a46ac 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -175,7 +175,7 @@ static void fqdir_free_fn(struct work_struct *work) } } -static DECLARE_WORK(fqdir_free_work, fqdir_free_fn); +static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn); static void fqdir_work_fn(struct work_struct *work) { @@ -184,7 +184,7 @@ static void fqdir_work_fn(struct work_struct *work) rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); if (llist_add(&fqdir->free_list, &fqdir_free_list)) - queue_work(system_wq, &fqdir_free_work); + queue_delayed_work(system_wq, &fqdir_free_work, HZ); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index e8de45d34d56..e28075f0006e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -264,14 +264,18 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) +void inet_twsk_purge(struct inet_hashinfo *hashinfo) { + struct inet_ehash_bucket *head = &hashinfo->ehash[0]; + unsigned int ehash_mask = hashinfo->ehash_mask; struct hlist_nulls_node *node; unsigned int slot; struct sock *sk; - for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + for (slot = 0; slot <= ehash_mask; slot++, head++) { + if (hlist_nulls_empty(&head->chain)) + continue; + restart_rcu: cond_resched(); rcu_read_lock(); @@ -283,15 +287,13 @@ restart: TCPF_NEW_SYN_RECV)) continue; - if (sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count)) + if (refcount_read(&sock_net(sk)->ns.count)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (unlikely(sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count))) { + if (refcount_read(&sock_net(sk)->ns.count)) { sock_gen_put(sk); goto restart; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fb947d1613fe..534b98a0744a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -632,7 +632,7 @@ err_alloc: static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv4.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.frags_hdr); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 57ddcd8c62f6..c3af965dc407 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -265,6 +265,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; struct erspan_base_hdr *ershdr; + IP_TUNNEL_DECLARE_FLAGS(flags); struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; @@ -272,12 +273,14 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int ver; int len; + ip_tunnel_flags_copy(flags, tpi->flags); + itn = net_generic(net, erspan_net_id); iph = ip_hdr(skb); if (is_erspan_type1(gre_hdr_len)) { ver = 0; - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_NO_KEY, + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); } else { if (unlikely(!pskb_may_pull(skb, @@ -287,8 +290,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; iph = ip_hdr(skb); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_KEY, + __set_bit(IP_TUNNEL_KEY_BIT, flags); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, tpi->key); } @@ -312,10 +315,9 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; - __be16 flags; - tpi->flags |= TUNNEL_KEY; - flags = tpi->flags; + __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, @@ -338,7 +340,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, ERSPAN_V2_MDSIZE); info = &tun_dst->u.tun_info; - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + info->key.tun_flags); info->options_len = sizeof(*md); } @@ -381,10 +384,13 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, tnl_params = &tunnel->parms.iph; if (tunnel->collect_md || tnl_params->daddr == 0) { - __be16 flags; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __be64 tun_id; - flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + ip_tunnel_flags_and(flags, tpi->flags, flags); + tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) @@ -464,12 +470,15 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - __be16 flags = tunnel->parms.o_flags; + IP_TUNNEL_DECLARE_FLAGS(flags); + + ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, flags, proto, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -483,10 +492,10 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; int tunnel_hlen; - __be16 flags; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -500,14 +509,19 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; /* Push Tunnel header. */ - if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) goto err_free_skb; - flags = tun_info->key.tun_flags & - (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags); + gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -521,6 +535,7 @@ err_free_skb: static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct erspan_metadata *md; @@ -536,7 +551,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) goto err_free_skb; if (tun_info->options_len < sizeof(*md)) goto err_free_skb; @@ -589,8 +604,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; } - gre_build_header(skb, 8, TUNNEL_SEQ, - proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno))); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + gre_build_header(skb, 8, flags, proto, 0, + htonl(atomic_fetch_inc(&tunnel->o_seqno))); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -664,7 +680,8 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, tnl_params = &tunnel->parms.iph; } - if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) goto free_skb; __gre_xmit(skb, dev, tnl_params, skb->protocol); @@ -706,7 +723,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, /* Push ERSPAN header */ if (tunnel->erspan_ver == 0) { proto = htons(ETH_P_ERSPAN); - tunnel->parms.o_flags &= ~TUNNEL_SEQ; + __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags); } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, @@ -721,7 +738,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, goto free_skb; } - tunnel->parms.o_flags &= ~TUNNEL_KEY; + __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags); __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; @@ -744,7 +761,8 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } - if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) @@ -762,7 +780,6 @@ free_skb: static void ipgre_link_update(struct net_device *dev, bool set_mtu) { struct ip_tunnel *tunnel = netdev_priv(dev); - __be16 flags; int len; len = tunnel->tun_hlen; @@ -778,10 +795,9 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) if (set_mtu) dev->mtu = max_t(int, dev->mtu - len, 68); - flags = tunnel->parms.o_flags; - - if (flags & TUNNEL_SEQ || - (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)) { + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) || + (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && + tunnel->encap.type != TUNNEL_ENCAP_NONE)) { dev->features &= ~NETIF_F_GSO_SOFTWARE; dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; } else { @@ -790,20 +806,29 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) } } -static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, +static int ipgre_tunnel_ctl(struct net_device *dev, + struct ip_tunnel_parm_kern *p, int cmd) { + __be16 i_flags, o_flags; int err; + if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || + !ip_tunnel_flags_is_be16_compat(p->o_flags)) + return -EOVERFLOW; + + i_flags = ip_tunnel_flags_to_be16(p->i_flags); + o_flags = ip_tunnel_flags_to_be16(p->o_flags); + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || - ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING))) + ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } - p->i_flags = gre_flags_to_tnl_flags(p->i_flags); - p->o_flags = gre_flags_to_tnl_flags(p->o_flags); + gre_flags_to_tnl_flags(p->i_flags, i_flags); + gre_flags_to_tnl_flags(p->o_flags, o_flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) @@ -812,15 +837,18 @@ static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); - t->parms.i_flags = p->i_flags; - t->parms.o_flags = p->o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } - p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); - p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); + i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + ip_tunnel_flags_from_be16(p->i_flags, i_flags); + o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); + ip_tunnel_flags_from_be16(p->o_flags, o_flags); + return 0; } @@ -960,7 +988,6 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; - __be16 flags; tunnel = netdev_priv(dev); tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); @@ -972,14 +999,13 @@ static void __gre_tunnel_init(struct net_device *dev) dev->features |= GRE_FEATURES | NETIF_F_LLTX; dev->hw_features |= GRE_FEATURES; - flags = tunnel->parms.o_flags; - /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. */ - if (flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags)) return; - if (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE) + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && + tunnel->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; @@ -1136,7 +1162,7 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -1152,10 +1178,12 @@ static int ipgre_netlink_parms(struct net_device *dev, parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS])); + gre_flags_to_tnl_flags(parms->i_flags, + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS])); + gre_flags_to_tnl_flags(parms->o_flags, + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1203,7 +1231,7 @@ static int ipgre_netlink_parms(struct net_device *dev, static int erspan_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -1362,7 +1390,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1380,7 +1408,7 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1399,8 +1427,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; int err; err = ipgre_newlink_encap_setup(dev, data); @@ -1415,8 +1443,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], if (err < 0) return err; - t->parms.i_flags = p.i_flags; - t->parms.o_flags = p.o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); ipgre_link_update(dev, !tb[IFLA_MTU]); @@ -1428,8 +1456,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; int err; err = ipgre_newlink_encap_setup(dev, data); @@ -1444,8 +1472,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], if (err < 0) return err; - t->parms.i_flags = p.i_flags; - t->parms.o_flags = p.o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); return 0; } @@ -1501,8 +1529,10 @@ static size_t ipgre_get_size(const struct net_device *dev) static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; - __be16 o_flags = p->o_flags; + struct ip_tunnel_parm_kern *p = &t->parms; + IP_TUNNEL_DECLARE_FLAGS(o_flags); + + ip_tunnel_flags_copy(o_flags, p->o_flags); if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, @@ -1550,7 +1580,7 @@ static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev) if (t->erspan_ver <= 2) { if (t->erspan_ver != 0 && !t->collect_md) - t->parms.o_flags |= TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) goto nla_put_failure; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 1b8d8ff9a237..177f40c3a8e8 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,17 +56,13 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, - __be16 flags, __be32 key) +static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p, + const unsigned long *flags, __be32 key) { - if (p->i_flags & TUNNEL_KEY) { - if (flags & TUNNEL_KEY) - return key == p->i_key; - else - /* key expected, none present */ - return false; - } else - return !(flags & TUNNEL_KEY); + if (!test_bit(IP_TUNNEL_KEY_BIT, flags)) + return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags); + + return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key; } /* Fallback tunnel: no source, no destination, no key, no options @@ -81,7 +77,7 @@ static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, Given src, dst and key, find appropriate for input tunnel. */ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, - int link, __be16 flags, + int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key) { @@ -143,7 +139,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, } hlist_for_each_entry_rcu(t, head, hash_node) { - if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) || + if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) && + t->parms.i_key != key) || t->parms.iph.saddr != 0 || t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) @@ -171,7 +168,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, EXPORT_SYMBOL_GPL(ip_tunnel_lookup); static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { unsigned int h; __be32 remote; @@ -182,7 +179,8 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, else remote = 0; - if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) + if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) && + test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags)) i_key = 0; h = ip_tunnel_hash(i_key, remote); @@ -206,17 +204,19 @@ static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) } static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, int type) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; + IP_TUNNEL_DECLARE_FLAGS(flags); __be32 key = parms->i_key; - __be16 flags = parms->i_flags; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); + ip_tunnel_flags_copy(flags, parms->i_flags); + hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && @@ -230,7 +230,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, static struct net_device *__ip_tunnel_create(struct net *net, const struct rtnl_link_ops *ops, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { int err; struct ip_tunnel *tunnel; @@ -326,7 +326,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { struct ip_tunnel *nt; struct net_device *dev; @@ -386,15 +386,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, } #endif - if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || - ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != + test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } - if (tunnel->parms.i_flags&TUNNEL_SEQ) { - if (!(tpi->flags&TUNNEL_SEQ) || + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { + if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); @@ -638,7 +638,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (key->tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) df = htons(IP_DF); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, key->u.ipv4.dst, true)) { @@ -871,7 +871,7 @@ EXPORT_SYMBOL_GPL(ip_tunnel_xmit); static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, - struct ip_tunnel_parm *p, + struct ip_tunnel_parm_kern *p, bool set_mtu, __u32 fwmark) { @@ -903,7 +903,8 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, netdev_state_change(dev); } -int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); @@ -927,10 +928,10 @@ int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); - if (!(p->i_flags & VTI_ISVTI)) { - if (!(p->i_flags & TUNNEL_KEY)) + if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) { + if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags)) p->i_key = 0; - if (!(p->o_flags & TUNNEL_KEY)) + if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags)) p->o_key = 0; } @@ -1005,16 +1006,58 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); +bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, + const void __user *data) +{ + struct ip_tunnel_parm p; + + if (copy_from_user(&p, data, sizeof(p))) + return false; + + strscpy(kp->name, p.name); + kp->link = p.link; + ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags); + ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags); + kp->i_key = p.i_key; + kp->o_key = p.o_key; + memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph))); + + return true; +} +EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user); + +bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp) +{ + struct ip_tunnel_parm p; + + if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) || + !ip_tunnel_flags_is_be16_compat(kp->o_flags)) + return false; + + memset(&p, 0, sizeof(p)); + + strscpy(p.name, kp->name); + p.link = kp->link; + p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags); + p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags); + p.i_key = kp->i_key; + p.o_key = kp->o_key; + memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph))); + + return !copy_to_user(data, &p, sizeof(p)); +} +EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user); + int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; int err; - if (copy_from_user(&p, data, sizeof(p))) + if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); - if (!err && copy_to_user(data, &p, sizeof(p))) + if (!err && !ip_tunnel_parm_to_user(data, &p)) return -EFAULT; return err; } @@ -1093,7 +1136,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; unsigned int i; itn->rtnl_link_ops = ops; @@ -1171,7 +1214,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark) + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *nt; struct net *net = dev_net(dev); @@ -1225,7 +1268,7 @@ err_register_netdevice: EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark) + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 80ccd6661aa3..a3676155be78 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -125,6 +125,7 @@ EXPORT_SYMBOL_GPL(__iptunnel_pull_header); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags) { + IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { }; struct metadata_dst *res; struct ip_tunnel_info *dst, *src; @@ -144,10 +145,10 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, sizeof(struct in6_addr)); else dst->key.u.ipv4.dst = src->key.u.ipv4.src; - dst->key.tun_flags = src->key.tun_flags; + ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags); dst->mode = src->mode | IP_TUNNEL_INFO_TX; ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), - src->options_len, 0); + src->options_len, tun_flags); return res; } @@ -497,7 +498,7 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr, opt->opt_class = nla_get_be16(attr); attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; opt->type = nla_get_u8(attr); - info->key.tun_flags |= TUNNEL_GENEVE_OPT; + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags); } return sizeof(struct geneve_opt) + data_len; @@ -525,7 +526,7 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr, attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; md->gbp = nla_get_u32(attr); md->gbp &= VXLAN_GBP_MASK; - info->key.tun_flags |= TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags); } return sizeof(struct vxlan_metadata); @@ -574,7 +575,7 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr, set_hwid(&md->u.md2, nla_get_u8(attr)); } - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); } return sizeof(struct erspan_metadata); @@ -585,7 +586,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, { int err, rem, opt_len, opts_len = 0; struct nlattr *nla; - __be16 type = 0; + u32 type = 0; if (!attr) return 0; @@ -598,7 +599,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { switch (nla_type(nla)) { case LWTUNNEL_IP_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, extack); @@ -607,7 +608,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, opts_len += opt_len; if (opts_len > IP_TUNNEL_OPTS_MAX) return -EINVAL; - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; case LWTUNNEL_IP_OPTS_VXLAN: if (type) @@ -617,7 +618,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case LWTUNNEL_IP_OPTS_ERSPAN: if (type) @@ -627,7 +628,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; default: return -EINVAL; @@ -705,10 +706,16 @@ static int ip_tun_build_state(struct net *net, struct nlattr *attr, if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[LWTUNNEL_IP_FLAGS]) - tun_info->key.tun_flags |= - (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) & - ~TUNNEL_OPTIONS_PRESENT); + if (tb[LWTUNNEL_IP_FLAGS]) { + IP_TUNNEL_DECLARE_FLAGS(flags); + + ip_tunnel_flags_from_be16(flags, + nla_get_be16(tb[LWTUNNEL_IP_FLAGS])); + ip_tunnel_clear_options_present(flags); + + ip_tunnel_flags_or(tun_info->key.tun_flags, + tun_info->key.tun_flags, flags); + } tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->options_len = opt_len; @@ -812,18 +819,18 @@ static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, struct nlattr *nest; int err = 0; - if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + if (!ip_tunnel_is_options_present(tun_info->key.tun_flags)) return 0; nest = nla_nest_start_noflag(skb, type); if (!nest) return -ENOMEM; - if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_geneve(skb, tun_info); - else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT) + else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); - else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT) + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_erspan(skb, tun_info); if (err) { @@ -846,7 +853,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) || + nla_put_be16(skb, LWTUNNEL_IP_FLAGS, + ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; @@ -857,11 +865,11 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) { int opt_len; - if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + if (!ip_tunnel_is_options_present(info->key.tun_flags)) return 0; opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ - if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) { struct geneve_opt *opt; int offset = 0; @@ -874,10 +882,10 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) /* OPT_GENEVE_DATA */ offset += sizeof(*opt) + opt->length * 4; } - } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + nla_total_size(4); /* OPT_VXLAN_GBP */ - } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) { struct erspan_metadata *md = ip_tunnel_info_opts(info); opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ @@ -984,10 +992,17 @@ static int ip6_tun_build_state(struct net *net, struct nlattr *attr, if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); - if (tb[LWTUNNEL_IP6_FLAGS]) - tun_info->key.tun_flags |= - (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) & - ~TUNNEL_OPTIONS_PRESENT); + if (tb[LWTUNNEL_IP6_FLAGS]) { + IP_TUNNEL_DECLARE_FLAGS(flags); + __be16 data; + + data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); + ip_tunnel_flags_from_be16(flags, data); + ip_tunnel_clear_options_present(flags); + + ip_tunnel_flags_or(tun_info->key.tun_flags, + tun_info->key.tun_flags, flags); + } tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; tun_info->options_len = opt_len; @@ -1008,7 +1023,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) || + nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, + ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; @@ -1116,7 +1132,7 @@ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms); void ip_tunnel_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); @@ -1139,8 +1155,12 @@ void ip_tunnel_netlink_parms(struct nlattr *data[], if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); - if (data[IFLA_IPTUN_FLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + if (data[IFLA_IPTUN_FLAGS]) { + __be16 flags; + + flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + ip_tunnel_flags_from_be16(parms->i_flags, flags); + } if (data[IFLA_IPTUN_PROTO]) parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index ee587adb169f..14536da9f5dc 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -51,8 +51,11 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -167,7 +170,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parms = &tunnel->parms; + struct ip_tunnel_parm_kern *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int pkt_len = skb->len; @@ -322,8 +325,11 @@ static int vti4_err(struct sk_buff *skb, u32 info) const struct iphdr *iph = (const struct iphdr *)skb->data; int protocol = iph->protocol; struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!tunnel) return -1; @@ -373,8 +379,9 @@ static int vti4_err(struct sk_buff *skb, u32 info) } static int -vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; int err = 0; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { @@ -383,20 +390,26 @@ vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) return -EINVAL; } - if (!(p->i_flags & GRE_KEY)) + if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || + !ip_tunnel_flags_is_be16_compat(p->o_flags)) + return -EOVERFLOW; + + if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY)) p->i_key = 0; - if (!(p->o_flags & GRE_KEY)) + if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY)) p->o_key = 0; - p->i_flags = VTI_ISVTI; + __set_bit(IP_TUNNEL_VTI_BIT, flags); + ip_tunnel_flags_copy(p->i_flags, flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p->i_flags |= GRE_KEY; - p->o_flags |= GRE_KEY; + ip_tunnel_flags_from_be16(flags, GRE_KEY); + ip_tunnel_flags_or(p->i_flags, p->i_flags, flags); + ip_tunnel_flags_or(p->o_flags, p->o_flags, flags); } return 0; } @@ -531,7 +544,7 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], } static void vti_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -541,7 +554,7 @@ static void vti_netlink_parms(struct nlattr *data[], if (!data) return; - parms->i_flags = VTI_ISVTI; + __set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags); if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); @@ -566,7 +579,7 @@ static int vti_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); @@ -578,8 +591,8 @@ static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; vti_netlink_parms(data, &p, &fwmark); return ip_tunnel_changelink(dev, tb, &p, fwmark); @@ -606,7 +619,7 @@ static size_t vti_get_size(const struct net_device *dev) static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; + struct ip_tunnel_parm_kern *p = &t->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) || nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) || diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index f2696eaadbe6..923a2ef68c2f 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -130,13 +130,16 @@ static int ipip_err(struct sk_buff *skb, u32 info) struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err = 0; - t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->daddr, iph->saddr, 0); + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + + t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, + iph->saddr, 0); if (!t) { err = -ENOENT; goto out; @@ -213,13 +216,16 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + iph = ip_hdr(skb); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->saddr, iph->daddr, 0); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, + iph->daddr, 0); if (tunnel) { const struct tnl_ptk_info *tpi; @@ -238,7 +244,9 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (tunnel->collect_md) { - tun_dst = ip_tun_rx_dst(skb, 0, 0, 0); + ip_tunnel_flags_zero(flags); + + tun_dst = ip_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) return 0; ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info); @@ -330,7 +338,7 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) } static int -ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || @@ -340,7 +348,8 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) } p->i_key = p->o_key = 0; - p->i_flags = p->o_flags = 0; + ip_tunnel_flags_zero(p->i_flags); + ip_tunnel_flags_zero(p->o_flags); return ip_tunnel_ctl(dev, p, cmd); } @@ -405,8 +414,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], } static void ipip_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, bool *collect_md, - __u32 *fwmark) + struct ip_tunnel_parm_kern *parms, + bool *collect_md, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -432,8 +441,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { @@ -452,8 +461,8 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; bool collect_md; __u32 fwmark = t->fwmark; @@ -510,7 +519,7 @@ static size_t ipip_get_size(const struct net_device *dev) static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index fd5c01c8489f..6c750bd13dd8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -441,7 +441,7 @@ static bool ipmr_init_vif_indev(const struct net_device *dev) static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *tunnel_dev, *new_dev; - struct ip_tunnel_parm p = { }; + struct ip_tunnel_parm_kern p = { }; int err; tunnel_dev = __dev_get_by_name(net, "tunl0"); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b814fdab19f7..f89ff2e5a05b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -106,9 +106,6 @@ #include "fib_lookup.h" -#define RT_FL_TOS(oldflp4) \ - ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) - #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) @@ -498,15 +495,6 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void ip_rt_fix_tos(struct flowi4 *fl4) -{ - __u8 tos = RT_FL_TOS(fl4); - - fl4->flowi4_tos = tos & IPTOS_RT_MASK; - if (tos & RTO_ONLINK) - fl4->flowi4_scope = RT_SCOPE_LINK; -} - static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, __u8 tos, u8 prot, u32 mark, @@ -2639,7 +2627,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - ip_rt_fix_tos(fl4); + fl4->flowi4_tos &= IPTOS_RT_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -3591,7 +3579,7 @@ err_dup: static __net_exit void sysctl_route_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 500f665f98cb..b61d36810fe3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -462,7 +462,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } /* Try to redo what tcp_v4_send_synack did. */ - req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? : + dst_metric(&rt->dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7e4f16a7dcc1..ce5d19978a26 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1554,7 +1554,7 @@ err_alloc: static __net_exit void ipv4_sysctl_exit_net(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e767721b3a58..f23b97777ea5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -290,6 +290,9 @@ enum { DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); +DEFINE_PER_CPU(u32, tcp_tw_isn); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); + long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); @@ -1416,8 +1419,6 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) struct sk_buff *skb; int copied = 0, err = 0; - /* XXX -- need to support SO_PEEK_OFF */ - skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -1721,7 +1722,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); - tcp_sk(sk)->window_clamp = val; + WRITE_ONCE(tcp_sk(sk)->window_clamp, val); } return 0; } @@ -2328,6 +2329,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; + u32 peek_offset = 0; u32 urg_hole = 0; err = -ENOTCONN; @@ -2361,7 +2363,8 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, seq = &tp->copied_seq; if (flags & MSG_PEEK) { - peek_seq = tp->copied_seq; + peek_offset = max(sk_peek_offset(sk, flags), 0); + peek_seq = tp->copied_seq + peek_offset; seq = &peek_seq; } @@ -2464,11 +2467,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, } if ((flags & MSG_PEEK) && - (peek_seq - copied - urg_hole != tp->copied_seq)) { + (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); - peek_seq = tp->copied_seq; + peek_seq = tp->copied_seq + peek_offset; } continue; @@ -2509,7 +2512,10 @@ found_ok_skb: WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; - + if (flags & MSG_PEEK) + sk_peek_offset_fwd(sk, used); + else + sk_peek_offset_bwd(sk, used); tcp_rcv_space_adjust(sk); skip_copy: @@ -3010,6 +3016,7 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); WRITE_ONCE(tp->urg_data, 0); + sk_set_peek_off(sk, -1); tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); @@ -3379,7 +3386,7 @@ int tcp_set_window_clamp(struct sock *sk, int val) if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; - tp->window_clamp = 0; + WRITE_ONCE(tp->window_clamp, 0); } else { u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp; u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ? @@ -3388,7 +3395,7 @@ int tcp_set_window_clamp(struct sock *sk, int val) if (new_window_clamp == old_window_clamp) return 0; - tp->window_clamp = new_window_clamp; + WRITE_ONCE(tp->window_clamp, new_window_clamp); if (new_window_clamp < old_window_clamp) { /* need to apply the reserved mem provisioning only * when shrinking the window clamp @@ -4057,7 +4064,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: - val = tp->window_clamp; + val = READ_ONCE(tp->window_clamp); break; case TCP_INFO: { struct tcp_info info; @@ -4648,16 +4655,16 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 105); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); @@ -4670,7 +4677,11 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76); + + /* 32bit arches with 8byte alignment on u64 fields might need padding + * before tcp_clock_cache. + */ + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d874817a78d..384fa5e2f065 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -563,19 +563,20 @@ static void tcp_init_buffer_space(struct sock *sk) maxwin = tcp_full_space(sk); if (tp->window_clamp >= maxwin) { - tp->window_clamp = maxwin; + WRITE_ONCE(tp->window_clamp, maxwin); if (tcp_app_win && maxwin > 4 * tp->advmss) - tp->window_clamp = max(maxwin - - (maxwin >> tcp_app_win), - 4 * tp->advmss); + WRITE_ONCE(tp->window_clamp, + max(maxwin - (maxwin >> tcp_app_win), + 4 * tp->advmss)); } /* Force reservation of one segment. */ if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) - tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); + WRITE_ONCE(tp->window_clamp, + max(2 * tp->advmss, maxwin - tp->advmss)); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; @@ -773,7 +774,8 @@ void tcp_rcv_space_adjust(struct sock *sk) WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make the window clamp follow along. */ - tp->window_clamp = tcp_win_from_space(sk, rcvbuf); + WRITE_ONCE(tp->window_clamp, + tcp_win_from_space(sk, rcvbuf)); } } tp->rcvq_space.space = copied; @@ -4803,10 +4805,8 @@ static bool tcp_try_coalesce(struct sock *sk, if (!mptcp_skb_can_collapse(to, from)) return false; -#ifdef CONFIG_TLS_DEVICE - if (from->decrypted != to->decrypted) + if (skb_cmp_decrypted(from, to)) return false; -#endif if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -5174,6 +5174,16 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (tcp_receive_window(tp) == 0) { + /* Some stacks are known to send bare FIN packets + * in a loop even if we send RWIN 0 in our ACK. + * Accepting this FIN does not hurt memory pressure + * because the FIN flag will simply be merged to the + * receive queue tail skb in most cases. + */ + if (!skb->len && + (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + goto queue_and_out; + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; @@ -5188,7 +5198,7 @@ queue_and_out: inet_csk_schedule_ack(sk); sk->sk_data_ready(sk); - if (skb_queue_len(&sk->sk_receive_queue)) { + if (skb_queue_len(&sk->sk_receive_queue) && skb->len) { reason = SKB_DROP_REASON_PROTO_MEM; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto drop; @@ -5375,9 +5385,7 @@ restart: break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); -#ifdef CONFIG_TLS_DEVICE - nskb->decrypted = skb->decrypted; -#endif + skb_copy_decrypted(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); @@ -5407,10 +5415,8 @@ restart: !mptcp_skb_can_collapse(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; -#ifdef CONFIG_TLS_DEVICE - if (skb->decrypted != nskb->decrypted) + if (skb_cmp_decrypted(skb, nskb)) goto end; -#endif } } } @@ -6426,7 +6432,8 @@ consume: if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; - tp->window_clamp = min(tp->window_clamp, 65535U); + WRITE_ONCE(tp->window_clamp, + min(tp->window_clamp, 65535U)); } if (tp->rx_opt.saw_tstamp) { @@ -6999,7 +7006,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) +static bool tcp_syn_flood_action(struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; @@ -7100,7 +7107,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; - __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -7110,21 +7116,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct dst_entry *dst; struct flowi fl; u8 syncookies; + u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif - syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); + isn = __this_cpu_read(tcp_tw_isn); + if (isn) { + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + __this_cpu_write(tcp_tw_isn, 0); + } else { + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); - if (!want_cookie) - goto drop; + if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { + want_cookie = tcp_syn_flood_action(sk, + rsk_ops->slab_name); + if (!want_cookie) + goto drop; + } } if (sk_acceptq_is_full(sk)) { @@ -7163,7 +7176,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - dst = af_ops->route_req(sk, skb, &fl, req); + dst = af_ops->route_req(sk, skb, &fl, req, isn); if (!dst) goto drop_and_free; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a22ee5838751..e06f0cd04f7e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -866,11 +866,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) { + if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; - if (sk_fullsock(sk)) - trace_tcp_send_reset(sk, skb); - } + + trace_tcp_send_reset(sk, skb); BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); @@ -1667,7 +1666,8 @@ static void tcp_v4_init_req(struct request_sock *req, static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { tcp_v4_init_req(req, sk, skb); @@ -1995,7 +1995,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { - u32 limit, tail_gso_size, tail_gso_segs; + u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -2004,6 +2004,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, bool fragstolen; u32 gso_segs; u32 gso_size; + u64 limit; int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), @@ -2045,10 +2046,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || -#ifdef CONFIG_TLS_DEVICE - tail->decrypted != skb->decrypted || -#endif !mptcp_skb_can_collapse(tail, skb) || + skb_cmp_decrypted(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) goto no_coalesce; @@ -2101,7 +2100,13 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, __skb_push(skb, hdrlen); no_coalesce: - limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); + /* sk->sk_backlog.len is reset only at the end of __release_sock(). + * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach + * sk_rcvbuf in normal conditions. + */ + limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; + + limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; /* Only socket owner can try to collapse/prune rx queues * to reduce memory overhead, so add a little headroom here. @@ -2109,6 +2114,8 @@ no_coalesce: */ limit += 64 * 1024; + limit = min_t(u64, limit, UINT_MAX); + if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); *reason = SKB_DROP_REASON_SOCKET_BACKLOG; @@ -2148,7 +2155,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -2170,6 +2176,7 @@ int tcp_v4_rcv(struct sk_buff *skb) bool refcounted; struct sock *sk; int ret; + u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -2207,7 +2214,6 @@ lookup: if (!sk) goto no_tcp_socket; -process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; @@ -2287,6 +2293,7 @@ process: } } +process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { @@ -2385,7 +2392,7 @@ do_time_wait: inet_twsk_put(inet_twsk(sk)); goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, @@ -2399,6 +2406,7 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } @@ -3501,7 +3509,7 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; - tcp_twsk_purge(net_exit_list, AF_INET); + tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c2a925538542..e93df98de3f4 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -766,6 +766,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, unsigned int max_rows = 1U << tcp_metrics_hash_log; unsigned int row, s_row = cb->args[0]; int s_col = cb->args[1], col = s_col; + int res = 0; for (row = s_row; row < max_rows; row++, s_col = 0) { struct tcp_metrics_block *tm; @@ -778,7 +779,8 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, continue; if (col < s_col) continue; - if (tcp_metrics_dump_info(skb, cb, tm) < 0) { + res = tcp_metrics_dump_info(skb, cb, tm); + if (res < 0) { rcu_read_unlock(); goto done; } @@ -789,7 +791,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, done: cb->args[0] = row; cb->args[1] = col; - return skb->len; + return res; } static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, @@ -986,6 +988,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = { .maxattr = TCP_METRICS_ATTR_MAX, .policy = tcp_metrics_nl_policy, .netnsok = true, + .parallel_ops = true, .module = THIS_MODULE, .small_ops = tcp_metrics_nl_ops, .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f0761f060a83..146c061145b4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -95,7 +95,7 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq) */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th) + const struct tcphdr *th, u32 *tw_isn) { struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -228,7 +228,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->tcp_tw_isn = isn; + *tw_isn = isn; return TCP_TW_SYN; } @@ -388,7 +388,7 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); -void tcp_twsk_purge(struct list_head *net_exit_list, int family) +void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; @@ -396,14 +396,13 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family) list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ - inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); + inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { - inet_twsk_purge(&tcp_hashinfo, family); + inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } } -EXPORT_SYMBOL_GPL(tcp_twsk_purge); /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index ebe4722bb020..fab0973f995b 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -265,9 +265,7 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); -#ifdef CONFIG_TLS_DEVICE - flush |= p->decrypted ^ skb->decrypted; -#endif + flush |= skb_cmp_decrypted(p, skb); if (flush || skb_gro_receive(p, skb)) { mss = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e3167ad96567..ce59e4499b66 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -44,6 +44,7 @@ #include <linux/gfp.h> #include <linux/module.h> #include <linux/static_key.h> +#include <linux/skbuff_ref.h> #include <trace/events/tcp.h> @@ -203,16 +204,17 @@ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, - __u32 *rcv_wnd, __u32 *window_clamp, + __u32 *rcv_wnd, __u32 *__window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); + u32 window_clamp = READ_ONCE(*__window_clamp); /* If no clamp set the clamp to the max possible scaled window */ - if (*window_clamp == 0) - (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); - space = min(*window_clamp, space); + if (window_clamp == 0) + window_clamp = (U16_MAX << TCP_MAX_WSCALE); + space = min(window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) @@ -239,12 +241,13 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); - space = min_t(u32, space, *window_clamp); + space = min_t(u32, space, window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ - (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); + WRITE_ONCE(*__window_clamp, + min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } EXPORT_SYMBOL(tcp_select_initial_window); @@ -1499,18 +1502,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { + int tso_segs; + if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ - tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->tcp_gso_size = 0; - } else { - tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tcp_skb_pcount_set(skb, 1); + return 1; } + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, tso_segs); + return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various @@ -2070,16 +2077,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, - const struct sk_buff *skb) +static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; - /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && - tcp_skb_pcount(skb) == 1) - return 1; - in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) @@ -2100,10 +2101,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); - if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(skb, mss_now); - tso_segs = tcp_skb_pcount(skb); - } + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) + return tcp_set_skb_tso_segs(skb, mss_now); + return tso_segs; } @@ -2683,6 +2683,36 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } +/* First skb in the write queue is smaller than ideal packet size. + * Check if we can move payload from the second skb in the queue. + */ +static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) +{ + struct sk_buff *next_skb = skb->next; + unsigned int nlen; + + if (tcp_skb_is_last(sk, skb)) + return; + + if (!tcp_skb_can_collapse(skb, next_skb)) + return; + + nlen = min_t(u32, amount, next_skb->len); + if (!nlen || !skb_shift(skb, next_skb, nlen)) + return; + + TCP_SKB_CB(skb)->end_seq += nlen; + TCP_SKB_CB(next_skb)->seq += nlen; + + if (!next_skb->len) { + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; + TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; + tcp_unlink_write_queue(next_skb, sk); + tcp_wmem_free_skb(sk, next_skb); + } +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -2703,10 +2733,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; - int cwnd_quota; + u32 cwnd_quota, max_segs; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; - u32 max_segs; sent_pkts = 0; @@ -2724,6 +2753,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; + int missing_bytes; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ @@ -2737,10 +2767,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (tcp_pacing_check(sk)) break; - tso_segs = tcp_init_tso_segs(skb, mss_now); - BUG_ON(!tso_segs); - - cwnd_quota = tcp_cwnd_test(tp, skb); + cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ @@ -2748,6 +2775,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, else break; } + cwnd_quota = min(cwnd_quota, max_segs); + missing_bytes = cwnd_quota * mss_now - skb->len; + if (missing_bytes > 0) + tcp_grow_skb(sk, skb, missing_bytes); + + tso_segs = tcp_set_skb_tso_segs(skb, mss_now); if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; @@ -2769,9 +2802,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - min_t(unsigned int, - cwnd_quota, - max_segs), + cwnd_quota, nonagle); if (skb->len > limit && @@ -3387,11 +3418,6 @@ start: err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } - /* To avoid taking spuriously low RTT samples based on a timestamp - * for a transmit that never happened, always mark EVER_RETRANS - */ - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; - if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); @@ -3401,6 +3427,12 @@ start: } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } + + /* To avoid taking spuriously low RTT samples based on a timestamp + * for a transmit that never happened, always mark EVER_RETRANS + */ + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + return err; } @@ -3855,7 +3887,7 @@ static void tcp_connect_init(struct sock *sk) tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) - tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); @@ -3863,7 +3895,7 @@ static void tcp_connect_init(struct sock *sk) /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) - tp->window_clamp = tcp_full_space(sk); + WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d1ad20ce1c8c..976db57b95d4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -25,7 +25,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); u32 elapsed, user_timeout; s32 remaining; @@ -47,7 +47,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); u32 remaining, user_timeout; s32 elapsed; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 420905be5f30..6e2446295089 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -427,15 +427,21 @@ static struct sock *udp4_lib_lookup2(struct net *net, { struct sock *sk, *result; int score, badness; + bool need_rescore; result = NULL; badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + need_rescore = false; +rescore: + score = compute_score(need_rescore ? result : sk, net, saddr, + sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; + if (need_rescore) + continue; + if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; @@ -456,9 +462,14 @@ static struct sock *udp4_lib_lookup2(struct net *net, if (IS_ERR(result)) continue; - badness = compute_score(result, net, saddr, sport, - daddr, hnum, dif, sdif); - + /* compute_score is too long of a function to be + * inlined, and calling it again here yields + * measureable overhead for some + * workloads. Work around it by jumping + * backwards to rescore 'result'. + */ + need_rescore = true; + goto rescore; } } return result; @@ -1500,13 +1511,15 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; - int size; + bool becomes_readable; + int size, rcvbuf; - /* try to avoid the costly atomic add/sub pair when the receive - * queue is full; always allow at least a packet + /* Immediately drop when the receive queue is full. + * Always allow at least one packet. */ rmem = atomic_read(&sk->sk_rmem_alloc); - if (rmem > sk->sk_rcvbuf) + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + if (rmem > rcvbuf) goto drop; /* Under mem pressure, it might be helpful to help udp_recvmsg() @@ -1515,7 +1528,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ - if (rmem > (sk->sk_rcvbuf >> 1)) { + if (rmem > (rcvbuf >> 1)) { skb_condense(skb); busy = busylock_acquire(sk); @@ -1523,12 +1536,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) size = skb->truesize; udp_set_dev_scratch(skb); - /* we drop only if the receive buf is full and the receive - * queue contains some other skb - */ - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) - goto uncharge_drop; + atomic_add(size, &sk->sk_rmem_alloc); spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); @@ -1544,12 +1552,19 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ sock_skb_set_dropcount(sk, skb); + becomes_readable = skb_queue_empty(list); __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); - + if (!sock_flag(sk, SOCK_DEAD)) { + if (becomes_readable || + sk->sk_data_ready != sock_def_readable || + READ_ONCE(sk->sk_peek_off) >= 0) + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + else + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); + } busylock_release(busy); return 0; @@ -2057,8 +2072,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); kfree_skb_reason(skb, drop_reason); - trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 860aff5f8599..e4e0fa869fa4 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -183,7 +183,8 @@ void udp_tunnel_sock_release(struct socket *sock) EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, - __be16 flags, __be64 tunnel_id, int md_size) + const unsigned long *flags, + __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; @@ -199,7 +200,7 @@ struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; if (udp_hdr(skb)->check) - info->key.tun_flags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); return tun_dst; } EXPORT_SYMBOL_GPL(udp_tun_rx_dst); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c33bca2c3841..1dda59e0aeab 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -186,7 +186,7 @@ err_alloc: static __net_exit void xfrm4_net_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; if (!net->ipv4.xfrm4_hdr) return; |