summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c4
-rw-r--r--net/ipv4/devinet.c2
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/fou.c68
-rw-r--r--net/ipv4/gre_demux.c9
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/inet_connection_sock.c14
-rw-r--r--net/ipv4/inet_hashtables.c34
-rw-r--r--net/ipv4/ip_forward.c7
-rw-r--r--net/ipv4/ip_gre.c50
-rw-r--r--net/ipv4/ip_input.c73
-rw-r--r--net/ipv4/ip_output.c38
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ipconfig.c6
-rw-r--r--net/ipv4/ipip.c14
-rw-r--r--net/ipv4/ipmr.c6
-rw-r--r--net/ipv4/metrics.c26
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/protocol.c1
-rw-r--r--net/ipv4/raw.c31
-rw-r--r--net/ipv4/route.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp.c17
-rw-r--r--net/ipv4/tcp_bbr.c15
-rw-r--r--net/ipv4/tcp_bpf.c17
-rw-r--r--net/ipv4/tcp_input.c78
-rw-r--r--net/ipv4/tcp_ipv4.c132
-rw-r--r--net/ipv4/tcp_output.c37
-rw-r--r--net/ipv4/tunnel4.c18
-rw-r--r--net/ipv4/udp.c190
-rw-r--r--net/ipv4/udp_impl.h2
-rw-r--r--net/ipv4/udp_offload.c109
-rw-r--r--net/ipv4/udp_tunnel.c18
-rw-r--r--net/ipv4/udplite.c4
-rw-r--r--net/ipv4/xfrm4_protocol.c18
35 files changed, 786 insertions, 276 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1fbe2f815474..326c422c22f8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -208,6 +208,7 @@ int inet_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
+ sk->sk_max_ack_backlog = backlog;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -231,7 +232,6 @@ int inet_listen(struct socket *sock, int backlog)
goto out;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
- sk->sk_max_ack_backlog = backlog;
err = 0;
out:
@@ -1964,6 +1964,8 @@ static int __init inet_init(void)
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
+ raw_init();
+
ping_init();
/*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a34602ae27de..5b9b6d497f71 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1100,7 +1100,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
inet_del_ifa(in_dev, ifap, 1);
break;
}
- ret = dev_change_flags(dev, ifr->ifr_flags);
+ ret = dev_change_flags(dev, ifr->ifr_flags, NULL);
break;
case SIOCSIFADDR: /* Set interface address (and family) */
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b5c3937ca6ec..5022bc63863a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
if (!fi)
goto failure;
fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
- cfg->fc_mx_len);
+ cfg->fc_mx_len, extack);
if (unlikely(IS_ERR(fi->fib_metrics))) {
err = PTR_ERR(fi->fib_metrics);
kfree(fi);
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 500a59906b87..0d0ad19ecb87 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -3,6 +3,7 @@
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
+#include <linux/icmp.h>
#include <linux/udp.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -1003,15 +1004,82 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
return 0;
}
+static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info)
+{
+ const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]);
+
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue_err(struct sk_buff *skb, u32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t optlen;
+ int ret;
+
+ if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encasulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info);
+ goto out;
+#if IS_ENABLED(CONFIG_IPV6)
+ case 6:
+ ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info);
+ goto out;
+#endif
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+ ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
static const struct ip_tunnel_encap_ops fou_iptun_ops = {
.encap_hlen = fou_encap_hlen,
.build_header = fou_build_header,
+ .err_handler = gue_err,
};
static const struct ip_tunnel_encap_ops gue_iptun_ops = {
.encap_hlen = gue_encap_hlen,
.build_header = gue_build_header,
+ .err_handler = gue_err,
};
static int ip_tunnel_encap_add_fou_ops(void)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 7efe740c06eb..a4bf22ee3aed 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -151,20 +151,25 @@ drop:
return NET_RX_DROP;
}
-static void gre_err(struct sk_buff *skb, u32 info)
+static int gre_err(struct sk_buff *skb, u32 info)
{
const struct gre_protocol *proto;
const struct iphdr *iph = (const struct iphdr *)skb->data;
u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+ int err = 0;
if (ver >= GREPROTO_MAX)
- return;
+ return -EINVAL;
rcu_read_lock();
proto = rcu_dereference(gre_proto[ver]);
if (proto && proto->err_handler)
proto->err_handler(skb, info);
+ else
+ err = -EPROTONOSUPPORT;
rcu_read_unlock();
+
+ return err;
}
static const struct net_protocol net_gre_protocol = {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d832beed6e3a..065997f414e6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1079,7 +1079,7 @@ error:
goto drop;
}
-void icmp_err(struct sk_buff *skb, u32 info)
+int icmp_err(struct sk_buff *skb, u32 info)
{
struct iphdr *iph = (struct iphdr *)skb->data;
int offset = iph->ihl<<2;
@@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info)
*/
if (icmph->type != ICMP_ECHOREPLY) {
ping_err(skb, offset, info);
- return;
+ return 0;
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
else if (type == ICMP_REDIRECT)
ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
+
+ return 0;
}
/*
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 15e7f7915a21..6ea523d71947 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -183,7 +183,9 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
u32 remaining, offset;
+ int l3mdev;
+ l3mdev = inet_sk_bound_l3mdev(sk);
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -219,7 +221,8 @@ other_parity_scan:
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port) {
if (!inet_csk_bind_conflict(sk, tb, false, false))
goto success;
goto next_port;
@@ -293,6 +296,9 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
struct net *net = sock_net(sk);
struct inet_bind_bucket *tb = NULL;
kuid_t uid = sock_i_uid(sk);
+ int l3mdev;
+
+ l3mdev = inet_sk_bound_l3mdev(sk);
if (!port) {
head = inet_csk_find_open_port(sk, &tb, &port);
@@ -306,11 +312,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port)
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port)
goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
+ net, head, port, l3mdev);
if (!tb)
goto fail_unlock;
tb_found:
@@ -874,7 +881,6 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 411dd7a90046..13890d5bfc34 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -65,12 +65,14 @@ static u32 sk_ehashfn(const struct sock *sk)
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
- const unsigned short snum)
+ const unsigned short snum,
+ int l3mdev)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb) {
write_pnet(&tb->ib_net, net);
+ tb->l3mdev = l3mdev;
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
@@ -135,6 +137,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
table->bhash_size);
struct inet_bind_hashbucket *head = &table->bhash[bhash];
struct inet_bind_bucket *tb;
+ int l3mdev;
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
@@ -143,6 +146,8 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
return -ENOENT;
}
if (tb->port != port) {
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
* that the listener socket's icsk_bind_hash is the same
@@ -150,12 +155,13 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
* create a new bind bucket for the child here. */
inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), sock_net(sk)) &&
- tb->port == port)
+ tb->l3mdev == l3mdev && tb->port == port)
break;
}
if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
- sock_net(sk), head, port);
+ sock_net(sk), head, port,
+ l3mdev);
if (!tb) {
spin_unlock(&head->lock);
return -ENOMEM;
@@ -229,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
{
int score = -1;
struct inet_sock *inet = inet_sk(sk);
+ bool dev_match;
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
@@ -239,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+ dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif);
+ if (!dev_match)
+ return -1;
+ score += 4;
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
@@ -675,6 +679,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
u32 remaining, offset;
int ret, i, low, high;
static u32 hint;
+ int l3mdev;
if (port) {
head = &hinfo->bhash[inet_bhashfn(net, port,
@@ -693,6 +698,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return ret;
}
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
inet_get_local_port_range(net, &low, &high);
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
@@ -719,7 +726,8 @@ other_parity_scan:
* the established check is already unique enough.
*/
inet_bind_bucket_for_each(tb, &head->chain) {
- if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port) {
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
goto next_port;
@@ -732,7 +740,7 @@ other_parity_scan:
}
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
+ net, head, port, l3mdev);
if (!tb) {
spin_unlock_bh(&head->lock);
return -ENOMEM;
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 32662e9e5d21..06ee4696703c 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -69,6 +69,13 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
+#ifdef CONFIG_NET_SWITCHDEV
+ if (skb->offload_l3_fwd_mark) {
+ consume_skb(skb);
+ return 0;
+ }
+#endif
+
if (unlikely(opt->optlen))
ip_forward_options(skb);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 38befe829caf..76a9a5f7a40e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;
-static void ipgre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
@@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
unsigned int data_len = 0;
struct ip_tunnel *t;
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+ tpi->proto == htons(ETH_P_ERSPAN2))
+ itn = net_generic(net, erspan_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
+
+ if (!t)
+ return -ENOENT;
+
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return;
+ return 0;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return;
+ return 0;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return;
+ return 0;
data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
@@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
break;
}
- if (tpi->proto == htons(ETH_P_TEB))
- itn = net_generic(net, gre_tap_net_id);
- else if (tpi->proto == htons(ETH_P_ERSPAN) ||
- tpi->proto == htons(ETH_P_ERSPAN2))
- itn = net_generic(net, erspan_net_id);
- else
- itn = net_generic(net, ipgre_net_id);
-
- iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
- iph->daddr, iph->saddr, tpi->key);
-
- if (!t)
- return;
-
#if IS_ENABLED(CONFIG_IPV6)
if (tpi->proto == htons(ETH_P_IPV6) &&
!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
type, data_len))
- return;
+ return 0;
#endif
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- return;
+ return 0;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return;
+ return 0;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
+
+ return 0;
}
static void gre_err(struct sk_buff *skb, u32 info)
@@ -1601,7 +1603,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
memset(&tb, 0, sizeof(tb));
dev = rtnl_create_link(net, name, name_assign_type,
- &ipgre_tap_ops, tb);
+ &ipgre_tap_ops, tb, NULL);
if (IS_ERR(dev))
return dev;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e609b08c9df4..26921f6b3b92 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb)
return false;
}
-static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
- __skb_pull(skb, skb_network_header_len(skb));
-
- rcu_read_lock();
- {
- int protocol = ip_hdr(skb)->protocol;
- const struct net_protocol *ipprot;
- int raw;
+ const struct net_protocol *ipprot;
+ int raw, ret;
- resubmit:
- raw = raw_local_deliver(skb, protocol);
+resubmit:
+ raw = raw_local_deliver(skb, protocol);
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot) {
- int ret;
-
- if (!ipprot->no_policy) {
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
- goto out;
- }
- nf_reset(skb);
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot) {
+ if (!ipprot->no_policy) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ return;
}
- ret = ipprot->handler(skb);
- if (ret < 0) {
- protocol = -ret;
- goto resubmit;
+ nf_reset(skb);
+ }
+ ret = ipprot->handler(skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
}
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ kfree_skb(skb);
} else {
- if (!raw) {
- if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_PROT_UNREACH, 0);
- }
- kfree_skb(skb);
- } else {
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
- consume_skb(skb);
- }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
}
}
- out:
+}
+
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ __skb_pull(skb, skb_network_header_len(skb));
+
+ rcu_read_lock();
+ ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
rcu_read_unlock();
return 0;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 5dbec21856f4..ab6618036afe 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct ip_options *opt = cork->opt;
@@ -880,8 +881,8 @@ static int __ip_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
+ bool paged, extra_uref;
u32 tskey = 0;
- bool paged;
skb = skb_peek_tail(queue);
@@ -916,6 +917,20 @@ static int __ip_append_data(struct sock *sk,
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL;
+ if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+ extra_uref = true;
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ } else {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+ }
+ }
+
cork->length += length;
/* So, what's going on in the loop below?
@@ -1001,12 +1016,6 @@ alloc_new_skb:
skb->csum = 0;
skb_reserve(skb, hh_len);
- /* only the initial fragment is time stamped */
- skb_shinfo(skb)->tx_flags = cork->tx_flags;
- cork->tx_flags = 0;
- skb_shinfo(skb)->tskey = tskey;
- tskey = 0;
-
/*
* Find where to start putting bytes.
*/
@@ -1039,6 +1048,13 @@ alloc_new_skb:
exthdrlen = 0;
csummode = CHECKSUM_NONE;
+ /* only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);
@@ -1068,7 +1084,7 @@ alloc_new_skb:
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1098,6 +1114,10 @@ alloc_new_skb:
skb->data_len += copy;
skb->truesize += copy;
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err < 0)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1110,6 +1130,8 @@ alloc_new_skb:
error_efault:
err = -EFAULT;
error:
+ if (uarg)
+ sock_zerocopy_put_abort(uarg, extra_uref);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index c248e0dccbe1..c857ec6b9784 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -120,7 +120,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
}
skb_clear_hash_if_not_l4(skb);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
skb_set_queue_mapping(skb, 0);
skb_scrub_packet(skb, xnet);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 88212615bf4c..55757764c381 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -220,7 +220,7 @@ static int __init ic_open_devs(void)
for_each_netdev(&init_net, dev) {
if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))
continue;
- if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
+ if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0)
pr_err("IP-Config: Failed to open %s\n", dev->name);
}
@@ -238,7 +238,7 @@ static int __init ic_open_devs(void)
if (ic_proto_enabled && !able)
continue;
oflags = dev->flags;
- if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+ if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) {
pr_err("IP-Config: Failed to open %s\n",
dev->name);
continue;
@@ -315,7 +315,7 @@ static void __init ic_close_devs(void)
dev = d->dev;
if (d != ic_dev && !netdev_uses_dsa(dev)) {
pr_debug("IP-Config: Downing %s\n", dev->name);
- dev_change_flags(dev, d->flags);
+ dev_change_flags(dev, d->flags, NULL);
}
kfree(d);
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e65287c27e3d..57c5dd283a2c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
struct ip_tunnel *t;
int err = 0;
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (!t) {
+ err = -ENOENT;
+ goto out;
+ }
+
switch (type) {
case ICMP_DEST_UNREACH:
switch (code) {
@@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
goto out;
}
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->daddr, iph->saddr, 0);
- if (!t) {
- err = -ENOENT;
- goto out;
- }
-
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
goto out;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a6defbec4f1b..ea04e38f56e9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -506,7 +506,7 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
dev->flags |= IFF_MULTICAST;
if (!ipmr_init_vif_indev(dev))
goto failure;
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
dev_hold(dev);
}
@@ -589,7 +589,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
if (!ipmr_init_vif_indev(dev))
goto failure;
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
dev_hold(dev);
@@ -1802,7 +1802,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
struct vif_device *out_vif = &mrt->vif_table[out_vifi];
struct vif_device *in_vif = &mrt->vif_table[in_vifi];
- if (!skb->offload_mr_fwd_mark)
+ if (!skb->offload_l3_fwd_mark)
return false;
if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
return false;
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 6d218f5a2e71..ca9a5fefdefa 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -6,7 +6,8 @@
#include <net/tcp.h>
static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
- int fc_mx_len, u32 *metrics)
+ int fc_mx_len, u32 *metrics,
+ struct netlink_ext_ack *extack)
{
bool ecn_ca = false;
struct nlattr *nla;
@@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
if (!type)
continue;
- if (type > RTAX_MAX)
+ if (type > RTAX_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid metric type");
return -EINVAL;
+ }
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
- if (val == TCP_CA_UNSPEC)
+ if (val == TCP_CA_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
return -EINVAL;
+ }
} else {
- if (nla_len(nla) != sizeof(u32))
+ if (nla_len(nla) != sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Invalid attribute in metrics");
return -EINVAL;
+ }
val = nla_get_u32(nla);
}
if (type == RTAX_ADVMSS && val > 65535 - 40)
@@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
val = 65535 - 15;
if (type == RTAX_HOPLIMIT && val > 255)
val = 255;
- if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) {
+ NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");
return -EINVAL;
+ }
metrics[type - 1] = val;
}
@@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
}
struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
- int fc_mx_len)
+ int fc_mx_len,
+ struct netlink_ext_ack *extack)
{
struct dst_metrics *fib_metrics;
int err;
@@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
if (unlikely(!fib_metrics))
return ERR_PTR(-ENOMEM);
- err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
+ err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics,
+ extack);
if (!err) {
refcount_set(&fib_metrics->refcnt, 1);
} else {
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 70289682a670..c3610b37bb4c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+ SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 32a691b7ce2c..92d249e053be 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,6 +29,7 @@
#include <net/protocol.h>
struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_protos);
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
EXPORT_SYMBOL(inet_offloads);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8ca3eb06ba04..076f51646d26 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
!(inet->inet_daddr && inet->inet_daddr != raddr) &&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
+ raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
goto found; /* gotcha */
}
sk = NULL;
@@ -805,7 +804,7 @@ out:
return copied;
}
-static int raw_init(struct sock *sk)
+static int raw_sk_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
@@ -970,7 +969,7 @@ struct proto raw_prot = {
.connect = ip4_datagram_connect,
.disconnect = __udp_disconnect,
.ioctl = raw_ioctl,
- .init = raw_init,
+ .init = raw_sk_init,
.setsockopt = raw_setsockopt,
.getsockopt = raw_getsockopt,
.sendmsg = raw_sendmsg,
@@ -1134,3 +1133,27 @@ void __init raw_proc_exit(void)
unregister_pernet_subsys(&raw_net_ops);
}
#endif /* CONFIG_PROC_FS */
+
+static void raw_sysctl_init_net(struct net *net)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ net->ipv4.sysctl_raw_l3mdev_accept = 1;
+#endif
+}
+
+static int __net_init raw_sysctl_init(struct net *net)
+{
+ raw_sysctl_init_net(net);
+ return 0;
+}
+
+static struct pernet_operations __net_initdata raw_sysctl_ops = {
+ .init = raw_sysctl_init,
+};
+
+void __init raw_init(void)
+{
+ raw_sysctl_init_net(&init_net);
+ if (register_pernet_subsys(&raw_sysctl_ops))
+ panic("RAW: failed to init sysctl parameters.\n");
+}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c0a9d26c06ce..c4ddbc5f01fc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1677,7 +1677,7 @@ static void ip_handle_martian_source(struct net_device *dev,
print_hex_dump(KERN_WARNING, "ll header: ",
DUMP_PREFIX_OFFSET, 16, 1,
skb_mac_header(skb),
- dev->hard_header_len, true);
+ dev->hard_header_len, false);
}
}
#endif
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 891ed2f91467..ba0fc4b18465 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -602,6 +602,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = ipv4_ping_group_range,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "raw_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_raw_l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{
.procname = "tcp_ecn",
.data = &init_net.ipv4.sysctl_tcp_ecn,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9e6bc4d6daa7..27e2f6837062 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1423,7 +1423,7 @@ do_error:
if (copied + copied_syn)
goto out;
out_err:
- sock_zerocopy_put_abort(uarg);
+ sock_zerocopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
@@ -2088,7 +2088,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
}
continue;
- found_ok_skb:
+found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
@@ -2147,7 +2147,7 @@ skip_copy:
sk_eat_skb(sk, skb);
continue;
- found_fin_ok:
+found_fin_ok:
/* Process the FIN. */
++*seq;
if (!(flags & MSG_PEEK))
@@ -2241,10 +2241,6 @@ void tcp_set_state(struct sock *sk, int state)
* socket sitting in hash tables.
*/
inet_sk_state_store(sk, state);
-
-#ifdef STATE_TRACE
- SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
-#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);
@@ -3246,6 +3242,7 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
0;
}
@@ -3299,6 +3296,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
TCP_NLA_PAD);
nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+ nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
return stats;
}
@@ -3658,8 +3656,11 @@ bool tcp_alloc_md5sig_pool(void)
if (unlikely(!tcp_md5sig_pool_populated)) {
mutex_lock(&tcp_md5sig_mutex);
- if (!tcp_md5sig_pool_populated)
+ if (!tcp_md5sig_pool_populated) {
__tcp_alloc_md5sig_pool();
+ if (tcp_md5sig_pool_populated)
+ static_key_slow_inc(&tcp_md5_needed);
+ }
mutex_unlock(&tcp_md5sig_mutex);
}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 9277abdd822a..0f497fc49c3f 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -128,7 +128,12 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;
-/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
static const int bbr_pacing_margin_percent = 1;
/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
@@ -247,13 +252,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
}
-/* Pace using current bw estimate and a gain factor. In order to help drive the
- * network toward lower queues while maintaining high utilization and low
- * latency, the average pacing rate aims to be slightly (~1%) lower than the
- * estimated bandwidth. This is an important aspect of the design. In this
- * implementation this slightly lower pacing rate is achieved implicitly by not
- * including link-layer headers in the packet size used for the pacing rate.
- */
+/* Pace using current bw estimate and a gain factor. */
static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 3b45fe530f91..a47c1cdf90fc 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -289,12 +289,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
{
bool cork = false, enospc = msg->sg.start == msg->sg.end;
struct sock *sk_redir;
- u32 tosend;
+ u32 tosend, delta = 0;
int ret;
more_data:
- if (psock->eval == __SK_NONE)
+ if (psock->eval == __SK_NONE) {
+ /* Track delta in msg size to add/subtract it on SK_DROP from
+ * returned to user copied size. This ensures user doesn't
+ * get a positive return code with msg_cut_data and SK_DROP
+ * verdict.
+ */
+ delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+ if (msg->sg.size < delta)
+ delta -= msg->sg.size;
+ else
+ delta = 0;
+ }
if (msg->cork_bytes &&
msg->cork_bytes > msg->sg.size && !enospc) {
@@ -350,7 +361,7 @@ more_data:
default:
sk_msg_free_partial(sk, msg, tosend);
sk_msg_apply_bytes(psock, tosend);
- *copied -= tosend;
+ *copied -= (tosend + delta);
return -EACCES;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a9d9555a973f..76858b14ebe9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1865,16 +1865,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
/* Emulate SACKs for SACKless connection: account for a new dupack. */
-static void tcp_add_reno_sack(struct sock *sk)
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 prior_sacked = tp->sacked_out;
+ if (num_dupack) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ s32 delivered;
- tp->sacked_out++;
- tcp_check_reno_reordering(sk, 0);
- if (tp->sacked_out > prior_sacked)
- tp->delivered++; /* Some out-of-order packet is delivered */
- tcp_verify_left_out(tp);
+ tp->sacked_out += num_dupack;
+ tcp_check_reno_reordering(sk, 0);
+ delivered = tp->sacked_out - prior_sacked;
+ if (delivered > 0)
+ tp->delivered += delivered;
+ tcp_verify_left_out(tp);
+ }
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
@@ -2459,8 +2463,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
- !(flag & FLAG_LOST_RETRANS)) {
+ } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) ==
+ FLAG_RETRANS_DATA_ACKED) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
@@ -2636,7 +2640,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
int *rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2655,7 +2659,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
return;
if (after(tp->snd_nxt, tp->high_seq)) {
- if (flag & FLAG_DATA_SACKED || is_dupack)
+ if (flag & FLAG_DATA_SACKED || num_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
@@ -2681,8 +2685,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
/* A Reno DUPACK means new data in F-RTO step 2.b above are
* delivered. Lower inflight to clock out (re)tranmissions.
*/
- if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
- tcp_add_reno_sack(sk);
+ if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
+ tcp_add_reno_sack(sk, num_dupack);
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
@@ -2759,13 +2763,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk)
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
- bool is_dupack, int *ack_flag, int *rexmit)
+ int num_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int fast_rexmit = 0, flag = *ack_flag;
- bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
- tcp_force_fast_retransmit(sk));
+ bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
+ tcp_force_fast_retransmit(sk));
if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
@@ -2812,8 +2816,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
- if (tcp_is_reno(tp) && is_dupack)
- tcp_add_reno_sack(sk);
+ if (tcp_is_reno(tp))
+ tcp_add_reno_sack(sk, num_dupack);
} else {
if (tcp_try_undo_partial(sk, prior_snd_una))
return;
@@ -2828,7 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_identify_packet_loss(sk, ack_flag);
break;
case TCP_CA_Loss:
- tcp_process_loss(sk, flag, is_dupack, rexmit);
+ tcp_process_loss(sk, flag, num_dupack, rexmit);
tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
@@ -2839,8 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
- if (is_dupack)
- tcp_add_reno_sack(sk);
+ tcp_add_reno_sack(sk, num_dupack);
}
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
@@ -3562,7 +3565,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
bool is_sack_reneg = tp->is_sack_reneg;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
- bool is_dupack = false;
+ int num_dupack = 0;
int prior_packets = tp->packets_out;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
@@ -3614,7 +3617,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
+ FLAG_SND_UNA_ADVANCED) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
@@ -3672,8 +3676,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_set_xmit_timer(sk);
if (tcp_ack_is_dubious(sk, flag)) {
- is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) {
+ num_dupack = 1;
+ /* Consider if pure acks were aggregated in tcp_add_backlog() */
+ if (!(flag & FLAG_DATA))
+ num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ }
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
}
@@ -3691,7 +3700,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
}
@@ -3716,7 +3725,7 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
tcp_xmit_recovery(sk, rexmit);
@@ -4606,13 +4615,12 @@ end:
}
}
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
- bool *fragstolen)
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
+ bool *fragstolen)
{
int eaten;
struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
- __skb_pull(skb, hdrlen);
eaten = (tail &&
tcp_try_coalesce(sk, tail,
skb, fragstolen)) ? 1 : 0;
@@ -4663,7 +4671,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
- if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
+ if (tcp_queue_rcv(sk, skb, &fragstolen)) {
WARN_ON_ONCE(fragstolen); /* should not happen */
__kfree_skb(skb);
}
@@ -4723,7 +4731,7 @@ queue_and_out:
goto drop;
}
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -5599,8 +5607,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
- &fragstolen);
+ __skb_pull(skb, tcp_header_len);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index de47038afdf0..efc6fef692ff 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err);
*
*/
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
@@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
inet_iif(icmp_skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = ntohl(th->seq);
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq,
- type == ICMP_PARAMETERPROB ||
- type == ICMP_TIME_EXCEEDED ||
- (type == ICMP_DEST_UNREACH &&
- (code == ICMP_NET_UNREACH ||
- code == ICMP_HOST_UNREACH)));
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
+ return 0;
+ }
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -541,7 +542,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
skb = tcp_rtx_queue_head(sk);
- BUG_ON(!skb);
tcp_mstamp_refresh(tp);
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
@@ -613,6 +613,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
@@ -969,10 +970,13 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
* We need to maintain these in the sk structure.
*/
+struct static_key tcp_md5_needed __read_mostly;
+EXPORT_SYMBOL(tcp_md5_needed);
+
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
- const union tcp_md5_addr *addr,
- int family)
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -1010,7 +1014,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
}
return best_match;
}
-EXPORT_SYMBOL(tcp_md5_do_lookup);
+EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
@@ -1618,12 +1622,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
- /* Only socket owner can try to collapse/prune rx queues
- * to reduce memory overhead, so add a little headroom here.
- * Few sockets backlog are possibly concurrently non empty.
- */
- limit += 64*1024;
+ struct skb_shared_info *shinfo;
+ const struct tcphdr *th;
+ struct tcphdr *thtail;
+ struct sk_buff *tail;
+ unsigned int hdrlen;
+ bool fragstolen;
+ u32 gso_segs;
+ int delta;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
* we can fix skb->truesize to its real value to avoid future drops.
@@ -1633,6 +1639,86 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
*/
skb_condense(skb);
+ skb_dst_drop(skb);
+
+ if (unlikely(tcp_checksum_complete(skb))) {
+ bh_unlock_sock(sk);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ return true;
+ }
+
+ /* Attempt coalescing to last skb in backlog, even if we are
+ * above the limits.
+ * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+ */
+ th = (const struct tcphdr *)skb->data;
+ hdrlen = th->doff * 4;
+ shinfo = skb_shinfo(skb);
+
+ if (!shinfo->gso_size)
+ shinfo->gso_size = skb->len - hdrlen;
+
+ if (!shinfo->gso_segs)
+ shinfo->gso_segs = 1;
+
+ tail = sk->sk_backlog.tail;
+ if (!tail)
+ goto no_coalesce;
+ thtail = (struct tcphdr *)tail->data;
+
+ if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+ TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+ ((TCP_SKB_CB(tail)->tcp_flags |
+ TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+ ((TCP_SKB_CB(tail)->tcp_flags ^
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+ tail->decrypted != skb->decrypted ||
+#endif
+ thtail->doff != th->doff ||
+ memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+ goto no_coalesce;
+
+ __skb_pull(skb, hdrlen);
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ thtail->window = th->window;
+
+ TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+ TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+ TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ TCP_SKB_CB(tail)->has_rxtstamp = true;
+ tail->tstamp = skb->tstamp;
+ skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+ }
+
+ /* Not as strict as GRO. We only need to carry mss max value */
+ skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+ skb_shinfo(tail)->gso_size);
+
+ gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+ skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+ sk->sk_backlog.len += delta;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPBACKLOGCOALESCE);
+ kfree_skb_partial(skb, fragstolen);
+ return false;
+ }
+ __skb_push(skb, hdrlen);
+
+no_coalesce:
+ /* Only socket owner can try to collapse/prune rx queues
+ * to reduce memory overhead, so add a little headroom here.
+ * Few sockets backlog are possibly concurrently non empty.
+ */
+ limit += 64*1024;
+
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
@@ -2573,8 +2659,8 @@ static int __net_init tcp_sk_init(struct net *net)
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
- /* Default TSQ limit of four TSO segments */
- net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
+ /* Default TSQ limit of 16 TSO segments */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
/* rfc5961 challenge ack rate limiting */
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
net->ipv4.sysctl_tcp_min_tso_segs = 2;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d1676d8a6ed7..c31badfee806 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -233,16 +233,14 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
if (init_rcv_wnd)
*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
- (*rcv_wscale) = 0;
+ *rcv_wscale = 0;
if (wscale_ok) {
/* Set window scaling on max possible window */
space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
- while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
- space >>= 1;
- (*rcv_wscale)++;
- }
+ *rcv_wscale = clamp_t(int, ilog2(space) - 15,
+ 0, TCP_MAX_WSCALE);
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
@@ -596,7 +594,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ if (static_key_false(&tcp_md5_needed) &&
+ rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
@@ -732,7 +731,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ if (static_key_false(&tcp_md5_needed) &&
+ rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
@@ -1909,18 +1909,22 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
u32 max_segs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 age, send_win, cong_win, limit, in_flight;
+ u32 send_win, cong_win, limit, in_flight;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *head;
int win_divisor;
+ s64 delta;
if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
/* Avoid bursty behavior by allowing defer
- * only if the last write was recent.
+ * only if the last write was recent (1 ms).
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
+ * packets waiting in a qdisc or device for EDT delivery.
*/
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+ if (delta > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
@@ -1943,6 +1947,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
goto send_now;
+ /* If this packet won't get more data, do not wait. */
+ if (TCP_SKB_CB(skb)->eor)
+ goto send_now;
+
win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
if (win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -1967,9 +1975,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
- age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
+ delta = tp->tcp_clock_cache - head->tstamp;
/* If next ACK is likely to come too late (half srtt), do not defer */
- if (age < (tp->srtt_us >> 4))
+ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
goto send_now;
/* Ok, it looks like it is advisable to defer.
@@ -2228,8 +2236,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
sk->sk_pacing_rate >> sk->sk_pacing_shift);
- limit = min_t(unsigned long, limit,
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ limit = min_t(unsigned long, limit,
+ sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index c0630013c1ae..33bf8e9c8663 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -149,34 +149,40 @@ drop:
}
#endif
-static void tunnel4_err(struct sk_buff *skb, u32 info)
+static int tunnel4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#if IS_ENABLED(CONFIG_IPV6)
-static void tunnel64_err(struct sk_buff *skb, u32 info)
+static int tunnel64_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel64_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
#if IS_ENABLED(CONFIG_MPLS)
-static void tunnelmpls4_err(struct sk_buff *skb, u32 info)
+static int tunnelmpls4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1976fddb9e00..aff2a8e99e01 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -105,6 +105,7 @@
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
+#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -115,6 +116,7 @@
#include "udp_impl.h"
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
+#include <net/udp_tunnel.h>
struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);
@@ -371,6 +373,7 @@ static int compute_score(struct sock *sk, struct net *net,
{
int score;
struct inet_sock *inet;
+ bool dev_match;
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
@@ -398,15 +401,11 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
+ dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif);
+ if (!dev_match)
+ return -1;
+ score += 4;
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
@@ -585,6 +584,89 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
return true;
}
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+void udp_encap_enable(void)
+{
+ static_branch_inc(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, u32 info);
+
+ if (!iptun_encaps[i])
+ continue;
+ handler = rcu_dereference(iptun_encaps[i]->err_handler);
+ if (handler && !handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+ const struct iphdr *iph,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sk_buff *skb, u32 info)
+{
+ int network_offset, transport_offset;
+ struct sock *sk;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv4 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, iph->ihl << 2);
+
+ sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+ iph->saddr, uh->dest, skb->dev->ifindex, 0,
+ udptable, NULL);
+ if (sk) {
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ struct udp_sock *up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+ if (!sk)
+ sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
+}
+
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -596,13 +678,14 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
* to find the appropriate port.
*/
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
{
struct inet_sock *inet;
const struct iphdr *iph = (const struct iphdr *)skb->data;
struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ bool tunnel = false;
struct sock *sk;
int harderr;
int err;
@@ -612,8 +695,21 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
iph->saddr, uh->source, skb->dev->ifindex,
inet_sdif(skb), udptable, NULL);
if (!sk) {
- __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return; /* No socket for error */
+ /* No socket for error: try tunnels before discarding */
+ sk = ERR_PTR(-ENOENT);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb,
+ info);
+ if (!sk)
+ return 0;
+ }
+
+ if (IS_ERR(sk)) {
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
+
+ tunnel = true;
}
err = 0;
@@ -656,6 +752,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
* RFC1122: OK. Passes ICMP errors back to application, as per
* 4.1.3.3.
*/
+ if (tunnel) {
+ /* ...not for tunnels though: we don't have a sending socket */
+ goto out;
+ }
if (!inet->recverr) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
@@ -665,12 +765,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- return;
+ return 0;
}
-void udp_err(struct sk_buff *skb, u32 info)
+int udp_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udp_table);
+ return __udp4_lib_err(skb, info, &udp_table);
}
/*
@@ -1713,6 +1813,10 @@ try_again:
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
}
+
+ if (udp_sk(sk)->gro_enabled)
+ udp_cmsg_recv(msg, sk, skb);
+
if (inet->cmsg_flags)
ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
@@ -1889,13 +1993,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
-void udp_encap_enable(void)
-{
- static_branch_enable(&udp_encap_needed_key);
-}
-EXPORT_SYMBOL(udp_encap_enable);
-
/* returns:
* -1: error
* 0: success
@@ -1904,7 +2001,7 @@ EXPORT_SYMBOL(udp_encap_enable);
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
-static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
@@ -2007,6 +2104,27 @@ drop:
return -1;
}
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udp_queue_rcv_one_skb(sk, skb);
+
+ BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, true);
+ for (skb = segs; skb; skb = next) {
+ next = skb->next;
+ __skb_pull(skb, skb_transport_offset(skb));
+ ret = udp_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret);
+ }
+ return 0;
+}
+
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
@@ -2398,11 +2516,15 @@ void udp_destroy_sock(struct sock *sk)
bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
- if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
- void (*encap_destroy)(struct sock *sk);
- encap_destroy = READ_ONCE(up->encap_destroy);
- if (encap_destroy)
- encap_destroy(sk);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (up->encap_enabled)
+ static_branch_dec(&udp_encap_needed_key);
}
}
@@ -2447,7 +2569,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
- udp_encap_enable();
+ lock_sock(sk);
+ udp_tunnel_encap_enable(sk->sk_socket);
+ release_sock(sk);
break;
default:
err = -ENOPROTOOPT;
@@ -2469,6 +2593,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->gso_size = val;
break;
+ case UDP_GRO:
+ lock_sock(sk);
+ if (valbool)
+ udp_tunnel_encap_enable(sk->sk_socket);
+ up->gro_enabled = valbool;
+ release_sock(sk);
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index e7d18b140287..322672655419 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -7,7 +7,7 @@
#include <net/inet_common.h>
int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
int udp_v4_get_port(struct sock *sk, unsigned short snum);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 802f2bc00d69..0646d61f4fa8 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,6 +343,54 @@ out:
return segs;
}
+#define UDP_GRO_CNT_MAX 64
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+ struct sk_buff *pp = NULL;
+ struct udphdr *uh2;
+ struct sk_buff *p;
+
+ /* requires non zero csum, for symmetry with GSO */
+ if (!uh->check) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ /* pull encapsulating udp header */
+ skb_gro_pull(skb, sizeof(struct udphdr));
+ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = udp_hdr(p);
+
+ /* Match ports only, as csum is always non zero */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* Terminate the flow on len mismatch or if it grow "too much".
+ * Under small packet flood GRO count could elsewhere grow a lot
+ * leading to execessive truesize values
+ */
+ if (!skb_gro_receive(p, skb) &&
+ NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+ pp = p;
+ else if (uh->len != uh2->len)
+ pp = p;
+
+ return pp;
+ }
+
+ /* mismatch, but we never need to flush */
+ return NULL;
+}
+
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct udphdr *uh, udp_lookup_t lookup)
{
@@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
int flush = 1;
struct sock *sk;
+ rcu_read_lock();
+ sk = (*lookup)(skb, uh->source, uh->dest);
+ if (!sk)
+ goto out_unlock;
+
+ if (udp_sk(sk)->gro_enabled) {
+ pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+ rcu_read_unlock();
+ return pp;
+ }
+
if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
- !NAPI_GRO_CB(skb)->csum_valid))
- goto out;
+ !NAPI_GRO_CB(skb)->csum_valid) ||
+ !udp_sk(sk)->gro_receive)
+ goto out_unlock;
/* mark that this skb passed once through the tunnel gro layer */
NAPI_GRO_CB(skb)->encap_mark = 1;
- rcu_read_lock();
- sk = (*lookup)(skb, uh->source, uh->dest);
-
- if (sk && udp_sk(sk)->gro_receive)
- goto unflush;
- goto out_unlock;
-
-unflush:
flush = 0;
list_for_each_entry(p, head, list) {
@@ -394,7 +446,6 @@ unflush:
out_unlock:
rcu_read_unlock();
-out:
skb_gro_flush_final(skb, pp, flush);
return pp;
}
@@ -427,6 +478,19 @@ flush:
return NULL;
}
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ skb->csum_start = (unsigned char *)uh - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+ return 0;
+}
+
int udp_gro_complete(struct sk_buff *skb, int nhoff,
udp_lookup_t lookup)
{
@@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
uh->len = newlen;
- /* Set encapsulation before calling into inner gro_complete() functions
- * to make them set up the inner offsets.
- */
- skb->encapsulation = 1;
-
rcu_read_lock();
sk = (*lookup)(skb, uh->source, uh->dest);
- if (sk && udp_sk(sk)->gro_complete)
+ if (sk && udp_sk(sk)->gro_enabled) {
+ err = udp_gro_complete_segment(skb);
+ } else if (sk && udp_sk(sk)->gro_complete) {
+ skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+ : SKB_GSO_UDP_TUNNEL;
+
+ /* Set encapsulation before calling into inner gro_complete()
+ * functions to make them set up the inner offsets.
+ */
+ skb->encapsulation = 1;
err = udp_sk(sk)->gro_complete(sk, skb,
nhoff + sizeof(struct udphdr));
+ }
rcu_read_unlock();
if (skb->remcsum_offload)
@@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (uh->check) {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ if (uh->check)
uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
iph->daddr, 0);
- } else {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
- }
return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
}
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..be8b5b2157d8 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
if (err < 0)
goto error;
+ if (cfg->bind_ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, cfg->bind_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto error;
+ }
+
+ err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
+ dev->name, strlen(dev->name) + 1);
+ dev_put(dev);
+
+ if (err < 0)
+ goto error;
+ }
+
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
@@ -68,6 +85,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->encap_type = cfg->encap_type;
udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
udp_sk(sk)->encap_destroy = cfg->encap_destroy;
udp_sk(sk)->gro_receive = cfg->gro_receive;
udp_sk(sk)->gro_complete = cfg->gro_complete;
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 8545457752fb..39c7f17d916f 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb)
return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}
-static void udplite_err(struct sk_buff *skb, u32 info)
+static int udplite_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udplite_table);
+ return __udp4_lib_err(skb, info, &udplite_table);
}
static const struct net_protocol udplite_protocol = {
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8dd0e6ab8606..35c54865dc42 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(esp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ah_rcv(struct sk_buff *skb)
@@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ah4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
@@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ipcomp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct net_protocol esp4_protocol = {