diff options
Diffstat (limited to 'net/ipv4')
35 files changed, 786 insertions, 276 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1fbe2f815474..326c422c22f8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -208,6 +208,7 @@ int inet_listen(struct socket *sock, int backlog) if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) goto out; + sk->sk_max_ack_backlog = backlog; /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ @@ -231,7 +232,6 @@ int inet_listen(struct socket *sock, int backlog) goto out; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } - sk->sk_max_ack_backlog = backlog; err = 0; out: @@ -1964,6 +1964,8 @@ static int __init inet_init(void) /* Add UDP-Lite (RFC 3828) */ udplite4_register(); + raw_init(); + ping_init(); /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a34602ae27de..5b9b6d497f71 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1100,7 +1100,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) inet_del_ifa(in_dev, ifap, 1); break; } - ret = dev_change_flags(dev, ifr->ifr_flags); + ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b5c3937ca6ec..5022bc63863a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, if (!fi) goto failure; fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, - cfg->fc_mx_len); + cfg->fc_mx_len, extack); if (unlikely(IS_ERR(fi->fib_metrics))) { err = PTR_ERR(fi->fib_metrics); kfree(fi); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 500a59906b87..0d0ad19ecb87 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -3,6 +3,7 @@ #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/ip.h> +#include <linux/icmp.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> @@ -1003,15 +1004,82 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } +static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info) +{ + const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]); + + if (ipprot && ipprot->err_handler) { + if (!ipprot->err_handler(skb, info)) + return 0; + } + + return -ENOENT; +} + +static int gue_err(struct sk_buff *skb, u32 info) +{ + int transport_offset = skb_transport_offset(skb); + struct guehdr *guehdr; + size_t optlen; + int ret; + + if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); + + switch (((struct iphdr *)guehdr)->version) { + case 4: + ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info); + goto out; +#if IS_ENABLED(CONFIG_IPV6) + case 6: + ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info); + goto out; +#endif + default: + ret = -EOPNOTSUPP; + goto out; + } + } + default: /* Undefined version */ + return -EOPNOTSUPP; + } + + if (guehdr->control) + return -ENOENT; + + optlen = guehdr->hlen << 2; + + if (validate_gue_flags(guehdr, optlen)) + return -EINVAL; + + skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); + ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info); + +out: + skb_set_transport_header(skb, transport_offset); + return ret; +} + static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou_build_header, + .err_handler = gue_err, }; static const struct ip_tunnel_encap_ops gue_iptun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue_build_header, + .err_handler = gue_err, }; static int ip_tunnel_encap_add_fou_ops(void) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7efe740c06eb..a4bf22ee3aed 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -151,20 +151,25 @@ drop: return NET_RX_DROP; } -static void gre_err(struct sk_buff *skb, u32 info) +static int gre_err(struct sk_buff *skb, u32 info) { const struct gre_protocol *proto; const struct iphdr *iph = (const struct iphdr *)skb->data; u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; + int err = 0; if (ver >= GREPROTO_MAX) - return; + return -EINVAL; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (proto && proto->err_handler) proto->err_handler(skb, info); + else + err = -EPROTONOSUPPORT; rcu_read_unlock(); + + return err; } static const struct net_protocol net_gre_protocol = { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d832beed6e3a..065997f414e6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1079,7 +1079,7 @@ error: goto drop; } -void icmp_err(struct sk_buff *skb, u32 info) +int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; @@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info) */ if (icmph->type != ICMP_ECHOREPLY) { ping_err(skb, offset, info); - return; + return 0; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP); else if (type == ICMP_REDIRECT) ipv4_redirect(skb, net, 0, IPPROTO_ICMP); + + return 0; } /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 15e7f7915a21..6ea523d71947 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -183,7 +183,9 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int * int i, low, high, attempt_half; struct inet_bind_bucket *tb; u32 remaining, offset; + int l3mdev; + l3mdev = inet_sk_bound_l3mdev(sk); attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_get_local_port_range(net, &low, &high); @@ -219,7 +221,8 @@ other_parity_scan: hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) { + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { if (!inet_csk_bind_conflict(sk, tb, false, false)) goto success; goto next_port; @@ -293,6 +296,9 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct net *net = sock_net(sk); struct inet_bind_bucket *tb = NULL; kuid_t uid = sock_i_uid(sk); + int l3mdev; + + l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { head = inet_csk_find_open_port(sk, &tb, &port); @@ -306,11 +312,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) goto tb_found; tb_not_found: tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); + net, head, port, l3mdev); if (!tb) goto fail_unlock; tb_found: @@ -874,7 +881,6 @@ int inet_csk_listen_start(struct sock *sk, int backlog) reqsk_queue_alloc(&icsk->icsk_accept_queue); - sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 411dd7a90046..13890d5bfc34 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -65,12 +65,14 @@ static u32 sk_ehashfn(const struct sock *sk) struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum) + const unsigned short snum, + int l3mdev) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) { write_pnet(&tb->ib_net, net); + tb->l3mdev = l3mdev; tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; @@ -135,6 +137,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; + int l3mdev; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; @@ -143,6 +146,8 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) return -ENOENT; } if (tb->port != port) { + l3mdev = inet_sk_bound_l3mdev(sk); + /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption * that the listener socket's icsk_bind_hash is the same @@ -150,12 +155,13 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), sock_net(sk)) && - tb->port == port) + tb->l3mdev == l3mdev && tb->port == port) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, - sock_net(sk), head, port); + sock_net(sk), head, port, + l3mdev); if (!tb) { spin_unlock(&head->lock); return -ENOMEM; @@ -229,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net *net, { int score = -1; struct inet_sock *inet = inet_sk(sk); + bool dev_match; if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { @@ -239,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if (!dev_match) + return -1; + score += 4; - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } @@ -675,6 +679,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 remaining, offset; int ret, i, low, high; static u32 hint; + int l3mdev; if (port) { head = &hinfo->bhash[inet_bhashfn(net, port, @@ -693,6 +698,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, return ret; } + l3mdev = inet_sk_bound_l3mdev(sk); + inet_get_local_port_range(net, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; @@ -719,7 +726,8 @@ other_parity_scan: * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && tb->port == port) { + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; @@ -732,7 +740,7 @@ other_parity_scan: } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); + net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 32662e9e5d21..06ee4696703c 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -69,6 +69,13 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); +#ifdef CONFIG_NET_SWITCHDEV + if (skb->offload_l3_fwd_mark) { + consume_skb(skb); + return 0; + } +#endif + if (unlikely(opt->optlen)) ip_forward_options(skb); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 38befe829caf..76a9a5f7a40e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; static unsigned int erspan_net_id __read_mostly; -static void ipgre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) +static int ipgre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only @@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info, unsigned int data_len = 0; struct ip_tunnel *t; + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else if (tpi->proto == htons(ETH_P_ERSPAN) || + tpi->proto == htons(ETH_P_ERSPAN2)) + itn = net_generic(net, erspan_net_id); + else + itn = net_generic(net, ipgre_net_id); + + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); + t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, + iph->daddr, iph->saddr, tpi->key); + + if (!t) + return -ENOENT; + switch (type) { default: case ICMP_PARAMETERPROB: - return; + return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ - return; + return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return; + return 0; data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; @@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info, break; } - if (tpi->proto == htons(ETH_P_TEB)) - itn = net_generic(net, gre_tap_net_id); - else if (tpi->proto == htons(ETH_P_ERSPAN) || - tpi->proto == htons(ETH_P_ERSPAN2)) - itn = net_generic(net, erspan_net_id); - else - itn = net_generic(net, ipgre_net_id); - - iph = (const struct iphdr *)(icmp_hdr(skb) + 1); - t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, - iph->daddr, iph->saddr, tpi->key); - - if (!t) - return; - #if IS_ENABLED(CONFIG_IPV6) if (tpi->proto == htons(ETH_P_IPV6) && !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type, data_len)) - return; + return 0; #endif if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - return; + return 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - return; + return 0; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; + + return 0; } static void gre_err(struct sk_buff *skb, u32 info) @@ -1601,7 +1603,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, memset(&tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, - &ipgre_tap_ops, tb); + &ipgre_tap_ops, tb, NULL); if (IS_ERR(dev)) return dev; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e609b08c9df4..26921f6b3b92 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { - __skb_pull(skb, skb_network_header_len(skb)); - - rcu_read_lock(); - { - int protocol = ip_hdr(skb)->protocol; - const struct net_protocol *ipprot; - int raw; + const struct net_protocol *ipprot; + int raw, ret; - resubmit: - raw = raw_local_deliver(skb, protocol); +resubmit: + raw = raw_local_deliver(skb, protocol); - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot) { - int ret; - - if (!ipprot->no_policy) { - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - kfree_skb(skb); - goto out; - } - nf_reset(skb); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot) { + if (!ipprot->no_policy) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return; } - ret = ipprot->handler(skb); - if (ret < 0) { - protocol = -ret; - goto resubmit; + nf_reset(skb); + } + ret = ipprot->handler(skb); + if (ret < 0) { + protocol = -ret; + goto resubmit; + } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + } else { + if (!raw) { + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PROT_UNREACH, 0); } - __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); } else { - if (!raw) { - if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_PROT_UNREACH, 0); - } - kfree_skb(skb); - } else { - __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); - consume_skb(skb); - } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + consume_skb(skb); } } - out: +} + +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + __skb_pull(skb, skb_network_header_len(skb)); + + rcu_read_lock(); + ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5dbec21856f4..ab6618036afe 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; @@ -880,8 +881,8 @@ static int __ip_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; + bool paged, extra_uref; u32 tskey = 0; - bool paged; skb = skb_peek_tail(queue); @@ -916,6 +917,20 @@ static int __ip_append_data(struct sock *sk, (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + extra_uref = true; + if (rt->dst.dev->features & NETIF_F_SG && + csummode == CHECKSUM_PARTIAL) { + paged = true; + } else { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + } + } + cork->length += length; /* So, what's going on in the loop below? @@ -1001,12 +1016,6 @@ alloc_new_skb: skb->csum = 0; skb_reserve(skb, hh_len); - /* only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - /* * Find where to start putting bytes. */ @@ -1039,6 +1048,13 @@ alloc_new_skb: exthdrlen = 0; csummode = CHECKSUM_NONE; + /* only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1068,7 +1084,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1098,6 +1114,10 @@ alloc_new_skb: skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; + } else { + err = skb_zerocopy_iter_dgram(skb, from, copy); + if (err < 0) + goto error; } offset += copy; length -= copy; @@ -1110,6 +1130,8 @@ alloc_new_skb: error_efault: err = -EFAULT; error: + if (uarg) + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c248e0dccbe1..c857ec6b9784 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -120,7 +120,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, } skb_clear_hash_if_not_l4(skb); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, xnet); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 88212615bf4c..55757764c381 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -220,7 +220,7 @@ static int __init ic_open_devs(void) for_each_netdev(&init_net, dev) { if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev)) continue; - if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) + if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0) pr_err("IP-Config: Failed to open %s\n", dev->name); } @@ -238,7 +238,7 @@ static int __init ic_open_devs(void) if (ic_proto_enabled && !able) continue; oflags = dev->flags; - if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) { pr_err("IP-Config: Failed to open %s\n", dev->name); continue; @@ -315,7 +315,7 @@ static void __init ic_close_devs(void) dev = d->dev; if (d != ic_dev && !netdev_uses_dsa(dev)) { pr_debug("IP-Config: Downing %s\n", dev->name); - dev_change_flags(dev, d->flags); + dev_change_flags(dev, d->flags, NULL); } kfree(d); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e65287c27e3d..57c5dd283a2c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) struct ip_tunnel *t; int err = 0; + t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (!t) { + err = -ENOENT; + goto out; + } + switch (type) { case ICMP_DEST_UNREACH: switch (code) { @@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info) goto out; } - t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->daddr, iph->saddr, 0); - if (!t) { - err = -ENOENT; - goto out; - } - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a6defbec4f1b..ea04e38f56e9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -506,7 +506,7 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) dev->flags |= IFF_MULTICAST; if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); } @@ -589,7 +589,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) if (!ipmr_init_vif_indev(dev)) goto failure; - if (dev_open(dev)) + if (dev_open(dev, NULL)) goto failure; dev_hold(dev); @@ -1802,7 +1802,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi]; - if (!skb->offload_mr_fwd_mark) + if (!skb->offload_l3_fwd_mark) return false; if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) return false; diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 6d218f5a2e71..ca9a5fefdefa 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -6,7 +6,8 @@ #include <net/tcp.h> static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, - int fc_mx_len, u32 *metrics) + int fc_mx_len, u32 *metrics, + struct netlink_ext_ack *extack) { bool ecn_ca = false; struct nlattr *nla; @@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, if (!type) continue; - if (type > RTAX_MAX) + if (type > RTAX_MAX) { + NL_SET_ERR_MSG(extack, "Invalid metric type"); return -EINVAL; + } if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) + if (val == TCP_CA_UNSPEC) { + NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); return -EINVAL; + } } else { - if (nla_len(nla) != sizeof(u32)) + if (nla_len(nla) != sizeof(u32)) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Invalid attribute in metrics"); return -EINVAL; + } val = nla_get_u32(nla); } if (type == RTAX_ADVMSS && val > 65535 - 40) @@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, val = 65535 - 15; if (type == RTAX_HOPLIMIT && val > 255) val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) { + NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute"); return -EINVAL; + } metrics[type - 1] = val; } @@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, } struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, - int fc_mx_len) + int fc_mx_len, + struct netlink_ext_ack *extack) { struct dst_metrics *fib_metrics; int err; @@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); - err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics); + err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics, + extack); if (!err) { refcount_set(&fib_metrics->refcnt, 1); } else { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 70289682a670..c3610b37bb4c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), + SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 32a691b7ce2c..92d249e053be 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -29,6 +29,7 @@ #include <net/protocol.h> struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet_protos); const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; EXPORT_SYMBOL(inet_offloads); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8ca3eb06ba04..076f51646d26 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif)) + raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) goto found; /* gotcha */ } sk = NULL; @@ -805,7 +804,7 @@ out: return copied; } -static int raw_init(struct sock *sk) +static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); @@ -970,7 +969,7 @@ struct proto raw_prot = { .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .ioctl = raw_ioctl, - .init = raw_init, + .init = raw_sk_init, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, @@ -1134,3 +1133,27 @@ void __init raw_proc_exit(void) unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ + +static void raw_sysctl_init_net(struct net *net) +{ +#ifdef CONFIG_NET_L3_MASTER_DEV + net->ipv4.sysctl_raw_l3mdev_accept = 1; +#endif +} + +static int __net_init raw_sysctl_init(struct net *net) +{ + raw_sysctl_init_net(net); + return 0; +} + +static struct pernet_operations __net_initdata raw_sysctl_ops = { + .init = raw_sysctl_init, +}; + +void __init raw_init(void) +{ + raw_sysctl_init_net(&init_net); + if (register_pernet_subsys(&raw_sysctl_ops)) + panic("RAW: failed to init sysctl parameters.\n"); +} diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c0a9d26c06ce..c4ddbc5f01fc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1677,7 +1677,7 @@ static void ip_handle_martian_source(struct net_device *dev, print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), - dev->hard_header_len, true); + dev->hard_header_len, false); } } #endif diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 891ed2f91467..ba0fc4b18465 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -602,6 +602,17 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_ping_group_range, }, +#ifdef CONFIG_NET_L3_MASTER_DEV + { + .procname = "raw_l3mdev_accept", + .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { .procname = "tcp_ecn", .data = &init_net.ipv4.sysctl_tcp_ecn, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9e6bc4d6daa7..27e2f6837062 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1423,7 +1423,7 @@ do_error: if (copied + copied_syn) goto out; out_err: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && @@ -2088,7 +2088,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, } continue; - found_ok_skb: +found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) @@ -2147,7 +2147,7 @@ skip_copy: sk_eat_skb(sk, skb); continue; - found_fin_ok: +found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags & MSG_PEEK)) @@ -2241,10 +2241,6 @@ void tcp_set_state(struct sock *sk, int state) * socket sitting in hash tables. */ inet_sk_state_store(sk, state); - -#ifdef STATE_TRACE - SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); -#endif } EXPORT_SYMBOL_GPL(tcp_set_state); @@ -3246,6 +3242,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ 0; } @@ -3299,6 +3296,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); + nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); return stats; } @@ -3658,8 +3656,11 @@ bool tcp_alloc_md5sig_pool(void) if (unlikely(!tcp_md5sig_pool_populated)) { mutex_lock(&tcp_md5sig_mutex); - if (!tcp_md5sig_pool_populated) + if (!tcp_md5sig_pool_populated) { __tcp_alloc_md5sig_pool(); + if (tcp_md5sig_pool_populated) + static_key_slow_inc(&tcp_md5_needed); + } mutex_unlock(&tcp_md5sig_mutex); } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 9277abdd822a..0f497fc49c3f 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -128,7 +128,12 @@ static const u32 bbr_probe_rtt_mode_ms = 200; /* Skip TSO below the following bandwidth (bits/sec): */ static const int bbr_min_tso_rate = 1200000; -/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while + * maintaining high utilization, the average pacing rate aims to be slightly + * lower than the estimated bandwidth. This is an important aspect of the + * design. + */ static const int bbr_pacing_margin_percent = 1; /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain @@ -247,13 +252,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); } -/* Pace using current bw estimate and a gain factor. In order to help drive the - * network toward lower queues while maintaining high utilization and low - * latency, the average pacing rate aims to be slightly (~1%) lower than the - * estimated bandwidth. This is an important aspect of the design. In this - * implementation this slightly lower pacing rate is achieved implicitly by not - * including link-layer headers in the packet size used for the pacing rate. - */ +/* Pace using current bw estimate and a gain factor. */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3b45fe530f91..a47c1cdf90fc 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -289,12 +289,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = msg->sg.start == msg->sg.end; struct sock *sk_redir; - u32 tosend; + u32 tosend, delta = 0; int ret; more_data: - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + /* Track delta in msg size to add/subtract it on SK_DROP from + * returned to user copied size. This ensures user doesn't + * get a positive return code with msg_cut_data and SK_DROP + * verdict. + */ + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->sg.size < delta) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { @@ -350,7 +361,7 @@ more_data: default: sk_msg_free_partial(sk, msg, tosend); sk_msg_apply_bytes(psock, tosend); - *copied -= tosend; + *copied -= (tosend + delta); return -EACCES; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a9d9555a973f..76858b14ebe9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1865,16 +1865,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack) { - struct tcp_sock *tp = tcp_sk(sk); - u32 prior_sacked = tp->sacked_out; + if (num_dupack) { + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + s32 delivered; - tp->sacked_out++; - tcp_check_reno_reordering(sk, 0); - if (tp->sacked_out > prior_sacked) - tp->delivered++; /* Some out-of-order packet is delivered */ - tcp_verify_left_out(tp); + tp->sacked_out += num_dupack; + tcp_check_reno_reordering(sk, 0); + delivered = tp->sacked_out - prior_sacked; + if (delivered > 0) + tp->delivered += delivered; + tcp_verify_left_out(tp); + } } /* Account for ACK, ACKing some data in Reno Recovery phase. */ @@ -2459,8 +2463,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else if ((flag & FLAG_RETRANS_DATA_ACKED) && - !(flag & FLAG_LOST_RETRANS)) { + } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == + FLAG_RETRANS_DATA_ACKED) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); @@ -2636,7 +2640,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, +static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, int *rexmit) { struct tcp_sock *tp = tcp_sk(sk); @@ -2655,7 +2659,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, return; if (after(tp->snd_nxt, tp->high_seq)) { - if (flag & FLAG_DATA_SACKED || is_dupack) + if (flag & FLAG_DATA_SACKED || num_dupack) tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; @@ -2681,8 +2685,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, /* A Reno DUPACK means new data in F-RTO step 2.b above are * delivered. Lower inflight to clock out (re)tranmissions. */ - if (after(tp->snd_nxt, tp->high_seq) && is_dupack) - tcp_add_reno_sack(sk); + if (after(tp->snd_nxt, tp->high_seq) && num_dupack) + tcp_add_reno_sack(sk, num_dupack); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } @@ -2759,13 +2763,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, - bool is_dupack, int *ack_flag, int *rexmit) + int num_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; - bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - tcp_force_fast_retransmit(sk)); + bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && + tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; @@ -2812,8 +2816,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { - if (tcp_is_reno(tp) && is_dupack) - tcp_add_reno_sack(sk); + if (tcp_is_reno(tp)) + tcp_add_reno_sack(sk, num_dupack); } else { if (tcp_try_undo_partial(sk, prior_snd_una)) return; @@ -2828,7 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tcp_identify_packet_loss(sk, ack_flag); break; case TCP_CA_Loss: - tcp_process_loss(sk, flag, is_dupack, rexmit); + tcp_process_loss(sk, flag, num_dupack, rexmit); tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) @@ -2839,8 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - if (is_dupack) - tcp_add_reno_sack(sk); + tcp_add_reno_sack(sk, num_dupack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -3562,7 +3565,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; - bool is_dupack = false; + int num_dupack = 0; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; @@ -3614,7 +3617,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == + FLAG_SND_UNA_ADVANCED) { /* Window is constant, pure forward advance. * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. @@ -3672,8 +3676,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_set_xmit_timer(sk); if (tcp_ack_is_dubious(sk, flag)) { - is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { + num_dupack = 1; + /* Consider if pure acks were aggregated in tcp_add_backlog() */ + if (!(flag & FLAG_DATA)) + num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); + } + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); } @@ -3691,7 +3700,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); } @@ -3716,7 +3725,7 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); tcp_newly_delivered(sk, delivered, flag); tcp_xmit_recovery(sk, rexmit); @@ -4606,13 +4615,12 @@ end: } } -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, - bool *fragstolen) +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, + bool *fragstolen) { int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); - __skb_pull(skb, hdrlen); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; @@ -4663,7 +4671,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; - if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { + if (tcp_queue_rcv(sk, skb, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } @@ -4723,7 +4731,7 @@ queue_and_out: goto drop; } - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); + eaten = tcp_queue_rcv(sk, skb, &fragstolen); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -5599,8 +5607,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); + __skb_pull(skb, tcp_header_len); + eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index de47038afdf0..efc6fef692ff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err); * */ -void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); @@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) inet_iif(icmp_skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = ntohl(th->seq); - if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq, - type == ICMP_PARAMETERPROB || - type == ICMP_TIME_EXCEEDED || - (type == ICMP_DEST_UNREACH && - (code == ICMP_NET_UNREACH || - code == ICMP_HOST_UNREACH))); + if (sk->sk_state == TCP_NEW_SYN_RECV) { + tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || + type == ICMP_TIME_EXCEEDED || + (type == ICMP_DEST_UNREACH && + (code == ICMP_NET_UNREACH || + code == ICMP_HOST_UNREACH))); + return 0; + } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -541,7 +542,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); skb = tcp_rtx_queue_head(sk); - BUG_ON(!skb); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); @@ -613,6 +613,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) out: bh_unlock_sock(sk); sock_put(sk); + return 0; } void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) @@ -969,10 +970,13 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) * We need to maintain these in the sk structure. */ +struct static_key tcp_md5_needed __read_mostly; +EXPORT_SYMBOL(tcp_md5_needed); + /* Find the Key structure for an address. */ -struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, - const union tcp_md5_addr *addr, - int family) +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, + const union tcp_md5_addr *addr, + int family) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1010,7 +1014,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, } return best_match; } -EXPORT_SYMBOL(tcp_md5_do_lookup); +EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, @@ -1618,12 +1622,14 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; - - /* Only socket owner can try to collapse/prune rx queues - * to reduce memory overhead, so add a little headroom here. - * Few sockets backlog are possibly concurrently non empty. - */ - limit += 64*1024; + struct skb_shared_info *shinfo; + const struct tcphdr *th; + struct tcphdr *thtail; + struct sk_buff *tail; + unsigned int hdrlen; + bool fragstolen; + u32 gso_segs; + int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. @@ -1633,6 +1639,86 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) */ skb_condense(skb); + skb_dst_drop(skb); + + if (unlikely(tcp_checksum_complete(skb))) { + bh_unlock_sock(sk); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); + return true; + } + + /* Attempt coalescing to last skb in backlog, even if we are + * above the limits. + * This is okay because skb capacity is limited to MAX_SKB_FRAGS. + */ + th = (const struct tcphdr *)skb->data; + hdrlen = th->doff * 4; + shinfo = skb_shinfo(skb); + + if (!shinfo->gso_size) + shinfo->gso_size = skb->len - hdrlen; + + if (!shinfo->gso_segs) + shinfo->gso_segs = 1; + + tail = sk->sk_backlog.tail; + if (!tail) + goto no_coalesce; + thtail = (struct tcphdr *)tail->data; + + if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || + TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || + ((TCP_SKB_CB(tail)->tcp_flags | + TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) || + ((TCP_SKB_CB(tail)->tcp_flags ^ + TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || +#ifdef CONFIG_TLS_DEVICE + tail->decrypted != skb->decrypted || +#endif + thtail->doff != th->doff || + memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) + goto no_coalesce; + + __skb_pull(skb, hdrlen); + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { + thtail->window = th->window; + + TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; + + if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) + TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + + TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + TCP_SKB_CB(tail)->has_rxtstamp = true; + tail->tstamp = skb->tstamp; + skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; + } + + /* Not as strict as GRO. We only need to carry mss max value */ + skb_shinfo(tail)->gso_size = max(shinfo->gso_size, + skb_shinfo(tail)->gso_size); + + gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; + skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + + sk->sk_backlog.len += delta; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPBACKLOGCOALESCE); + kfree_skb_partial(skb, fragstolen); + return false; + } + __skb_push(skb, hdrlen); + +no_coalesce: + /* Only socket owner can try to collapse/prune rx queues + * to reduce memory overhead, so add a little headroom here. + * Few sockets backlog are possibly concurrently non empty. + */ + limit += 64*1024; + if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); @@ -2573,8 +2659,8 @@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; - /* Default TSQ limit of four TSO segments */ - net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* Default TSQ limit of 16 TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d1676d8a6ed7..c31badfee806 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -233,16 +233,14 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - (*rcv_wscale) = 0; + *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); - while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { - space >>= 1; - (*rcv_wscale)++; - } + *rcv_wscale = clamp_t(int, ilog2(space) - 15, + 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); @@ -596,7 +594,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + if (static_key_false(&tcp_md5_needed) && + rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; @@ -732,7 +731,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + if (static_key_false(&tcp_md5_needed) && + rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; @@ -1909,18 +1909,22 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); - u32 age, send_win, cong_win, limit, in_flight; + u32 send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; + s64 delta; if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer - * only if the last write was recent. + * only if the last write was recent (1 ms). + * Note that tp->tcp_wstamp_ns can be in the future if we have + * packets waiting in a qdisc or device for EDT delivery. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) + delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; + if (delta > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); @@ -1943,6 +1947,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; + /* If this packet won't get more data, do not wait. */ + if (TCP_SKB_CB(skb)->eor) + goto send_now; + win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); @@ -1967,9 +1975,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_rtx_queue_head(sk); if (!head) goto send_now; - age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head)); + delta = tp->tcp_clock_cache - head->tstamp; /* If next ACK is likely to come too late (half srtt), do not defer */ - if (age < (tp->srtt_us >> 4)) + if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) goto send_now; /* Ok, it looks like it is advisable to defer. @@ -2228,8 +2236,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit = max_t(unsigned long, 2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); - limit = min_t(unsigned long, limit, - sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); + if (sk->sk_pacing_status == SK_PACING_NONE) + limit = min_t(unsigned long, limit, + sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index c0630013c1ae..33bf8e9c8663 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -149,34 +149,40 @@ drop: } #endif -static void tunnel4_err(struct sk_buff *skb, u32 info) +static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) -static void tunnel64_err(struct sk_buff *skb, u32 info) +static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) -static void tunnelmpls4_err(struct sk_buff *skb, u32 info) +static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1976fddb9e00..aff2a8e99e01 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -105,6 +105,7 @@ #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> +#include <net/ip_tunnels.h> #include <net/route.h> #include <net/checksum.h> #include <net/xfrm.h> @@ -115,6 +116,7 @@ #include "udp_impl.h" #include <net/sock_reuseport.h> #include <net/addrconf.h> +#include <net/udp_tunnel.h> struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -371,6 +373,7 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || @@ -398,15 +401,11 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if (!dev_match) + return -1; + score += 4; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -585,6 +584,89 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, return true; } +DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +void udp_encap_enable(void) +{ + static_branch_inc(&udp_encap_needed_key); +} +EXPORT_SYMBOL(udp_encap_enable); + +/* Handler for tunnels with arbitrary destination ports: no socket lookup, go + * through error handlers in encapsulations looking for a match. + */ +static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) +{ + int i; + + for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { + int (*handler)(struct sk_buff *skb, u32 info); + + if (!iptun_encaps[i]) + continue; + handler = rcu_dereference(iptun_encaps[i]->err_handler); + if (handler && !handler(skb, info)) + return 0; + } + + return -ENOENT; +} + +/* Try to match ICMP errors to UDP tunnels by looking up a socket without + * reversing source and destination port: this will match tunnels that force the + * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that + * lwtunnels might actually break this assumption by being configured with + * different destination ports on endpoints, in this case we won't be able to + * trace ICMP messages back to them. + * + * If this doesn't match any socket, probe tunnels with arbitrary destination + * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port + * we've sent packets to won't necessarily match the local destination port. + * + * Then ask the tunnel implementation to match the error against a valid + * association. + * + * Return an error if we can't find a match, the socket if we need further + * processing, zero otherwise. + */ +static struct sock *__udp4_lib_err_encap(struct net *net, + const struct iphdr *iph, + struct udphdr *uh, + struct udp_table *udptable, + struct sk_buff *skb, u32 info) +{ + int network_offset, transport_offset; + struct sock *sk; + + network_offset = skb_network_offset(skb); + transport_offset = skb_transport_offset(skb); + + /* Network header needs to point to the outer IPv4 header inside ICMP */ + skb_reset_network_header(skb); + + /* Transport header needs to point to the UDP header */ + skb_set_transport_header(skb, iph->ihl << 2); + + sk = __udp4_lib_lookup(net, iph->daddr, uh->source, + iph->saddr, uh->dest, skb->dev->ifindex, 0, + udptable, NULL); + if (sk) { + int (*lookup)(struct sock *sk, struct sk_buff *skb); + struct udp_sock *up = udp_sk(sk); + + lookup = READ_ONCE(up->encap_err_lookup); + if (!lookup || lookup(sk, skb)) + sk = NULL; + } + + if (!sk) + sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); + + skb_set_transport_header(skb, transport_offset); + skb_set_network_header(skb, network_offset); + + return sk; +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -596,13 +678,14 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, * to find the appropriate port. */ -void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) +int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; const struct iphdr *iph = (const struct iphdr *)skb->data; struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + bool tunnel = false; struct sock *sk; int harderr; int err; @@ -612,8 +695,21 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); if (!sk) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; /* No socket for error */ + /* No socket for error: try tunnels before discarding */ + sk = ERR_PTR(-ENOENT); + if (static_branch_unlikely(&udp_encap_needed_key)) { + sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb, + info); + if (!sk) + return 0; + } + + if (IS_ERR(sk)) { + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); + return PTR_ERR(sk); + } + + tunnel = true; } err = 0; @@ -656,6 +752,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ + if (tunnel) { + /* ...not for tunnels though: we don't have a sending socket */ + goto out; + } if (!inet->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; @@ -665,12 +765,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk->sk_err = err; sk->sk_error_report(sk); out: - return; + return 0; } -void udp_err(struct sk_buff *skb, u32 info) +int udp_err(struct sk_buff *skb, u32 info) { - __udp4_lib_err(skb, info, &udp_table); + return __udp4_lib_err(skb, info, &udp_table); } /* @@ -1713,6 +1813,10 @@ try_again: memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } + + if (udp_sk(sk)->gro_enabled) + udp_cmsg_recv(msg, sk, skb); + if (inet->cmsg_flags) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); @@ -1889,13 +1993,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); -void udp_encap_enable(void) -{ - static_branch_enable(&udp_encap_needed_key); -} -EXPORT_SYMBOL(udp_encap_enable); - /* returns: * -1: error * 0: success @@ -1904,7 +2001,7 @@ EXPORT_SYMBOL(udp_encap_enable); * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); @@ -2007,6 +2104,27 @@ drop: return -1; } +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *next, *segs; + int ret; + + if (likely(!udp_unexpected_gso(sk, skb))) + return udp_queue_rcv_one_skb(sk, skb); + + BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); + __skb_push(skb, -skb_mac_offset(skb)); + segs = udp_rcv_segment(sk, skb, true); + for (skb = segs; skb; skb = next) { + next = skb->next; + __skb_pull(skb, skb_transport_offset(skb)); + ret = udp_queue_rcv_one_skb(sk, skb); + if (ret > 0) + ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret); + } + return 0; +} + /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ @@ -2398,11 +2516,15 @@ void udp_destroy_sock(struct sock *sk) bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); - if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { - void (*encap_destroy)(struct sock *sk); - encap_destroy = READ_ONCE(up->encap_destroy); - if (encap_destroy) - encap_destroy(sk); + if (static_branch_unlikely(&udp_encap_needed_key)) { + if (up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = READ_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + if (up->encap_enabled) + static_branch_dec(&udp_encap_needed_key); } } @@ -2447,7 +2569,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; - udp_encap_enable(); + lock_sock(sk); + udp_tunnel_encap_enable(sk->sk_socket); + release_sock(sk); break; default: err = -ENOPROTOOPT; @@ -2469,6 +2593,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, up->gso_size = val; break; + case UDP_GRO: + lock_sock(sk); + if (valbool) + udp_tunnel_encap_enable(sk->sk_socket); + up->gro_enabled = valbool; + release_sock(sk); + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index e7d18b140287..322672655419 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -7,7 +7,7 @@ #include <net/inet_common.h> int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); -void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 802f2bc00d69..0646d61f4fa8 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,6 +343,54 @@ out: return segs; } +#define UDP_GRO_CNT_MAX 64 +static struct sk_buff *udp_gro_receive_segment(struct list_head *head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + struct sk_buff *pp = NULL; + struct udphdr *uh2; + struct sk_buff *p; + + /* requires non zero csum, for symmetry with GSO */ + if (!uh->check) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + /* pull encapsulating udp header */ + skb_gro_pull(skb, sizeof(struct udphdr)); + skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + + list_for_each_entry(p, head, list) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + uh2 = udp_hdr(p); + + /* Match ports only, as csum is always non zero */ + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + /* Terminate the flow on len mismatch or if it grow "too much". + * Under small packet flood GRO count could elsewhere grow a lot + * leading to execessive truesize values + */ + if (!skb_gro_receive(p, skb) && + NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) + pp = p; + else if (uh->len != uh2->len) + pp = p; + + return pp; + } + + /* mismatch, but we never need to flush */ + return NULL; +} + struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, udp_lookup_t lookup) { @@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, int flush = 1; struct sock *sk; + rcu_read_lock(); + sk = (*lookup)(skb, uh->source, uh->dest); + if (!sk) + goto out_unlock; + + if (udp_sk(sk)->gro_enabled) { + pp = call_gro_receive(udp_gro_receive_segment, head, skb); + rcu_read_unlock(); + return pp; + } + if (NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && - !NAPI_GRO_CB(skb)->csum_valid)) - goto out; + !NAPI_GRO_CB(skb)->csum_valid) || + !udp_sk(sk)->gro_receive) + goto out_unlock; /* mark that this skb passed once through the tunnel gro layer */ NAPI_GRO_CB(skb)->encap_mark = 1; - rcu_read_lock(); - sk = (*lookup)(skb, uh->source, uh->dest); - - if (sk && udp_sk(sk)->gro_receive) - goto unflush; - goto out_unlock; - -unflush: flush = 0; list_for_each_entry(p, head, list) { @@ -394,7 +446,6 @@ unflush: out_unlock: rcu_read_unlock(); -out: skb_gro_flush_final(skb, pp, flush); return pp; } @@ -427,6 +478,19 @@ flush: return NULL; } +static int udp_gro_complete_segment(struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + + skb->csum_start = (unsigned char *)uh - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4; + return 0; +} + int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup) { @@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, uh->len = newlen; - /* Set encapsulation before calling into inner gro_complete() functions - * to make them set up the inner offsets. - */ - skb->encapsulation = 1; - rcu_read_lock(); sk = (*lookup)(skb, uh->source, uh->dest); - if (sk && udp_sk(sk)->gro_complete) + if (sk && udp_sk(sk)->gro_enabled) { + err = udp_gro_complete_segment(skb); + } else if (sk && udp_sk(sk)->gro_complete) { + skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM + : SKB_GSO_UDP_TUNNEL; + + /* Set encapsulation before calling into inner gro_complete() + * functions to make them set up the inner offsets. + */ + skb->encapsulation = 1; err = udp_sk(sk)->gro_complete(sk, skb, nhoff + sizeof(struct udphdr)); + } rcu_read_unlock(); if (skb->remcsum_offload) @@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (uh->check) { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + if (uh->check) uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, iph->daddr, 0); - } else { - skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; - } return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb); } diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 6539ff15e9a3..be8b5b2157d8 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, if (err < 0) goto error; + if (cfg->bind_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, cfg->bind_ifindex); + if (!dev) { + err = -ENODEV; + goto error; + } + + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, + dev->name, strlen(dev->name) + 1); + dev_put(dev); + + if (err < 0) + goto error; + } + udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; @@ -68,6 +85,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 8545457752fb..39c7f17d916f 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb) return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } -static void udplite_err(struct sk_buff *skb, u32 info) +static int udplite_err(struct sk_buff *skb, u32 info) { - __udp4_lib_err(skb, info, &udplite_table); + return __udp4_lib_err(skb, info, &udplite_table); } static const struct net_protocol udplite_protocol = { diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 8dd0e6ab8606..35c54865dc42 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +static int xfrm4_esp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(esp4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static int xfrm4_ah_rcv(struct sk_buff *skb) @@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +static int xfrm4_ah_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ah4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static int xfrm4_ipcomp_rcv(struct sk_buff *skb) @@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ipcomp4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static const struct net_protocol esp4_protocol = { |