summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c14
-rw-r--r--net/ipv4/datagram.c11
-rw-r--r--net/ipv4/esp4.c3
-rw-r--r--net/ipv4/fib_rules.c6
-rw-r--r--net/ipv4/fib_trie.c4
-rw-r--r--net/ipv4/icmp.c9
-rw-r--r--net/ipv4/igmp.c66
-rw-r--r--net/ipv4/inet_connection_sock.c11
-rw-r--r--net/ipv4/inetpeer.c49
-rw-r--r--net/ipv4/ip_fragment.c15
-rw-r--r--net/ipv4/ip_gre.c17
-rw-r--r--net/ipv4/ip_input.c11
-rw-r--r--net/ipv4/ip_output.c33
-rw-r--r--net/ipv4/ip_sockglue.c14
-rw-r--r--net/ipv4/ipmr.c28
-rw-r--r--net/ipv4/ipmr_base.c6
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c27
-rw-r--r--net/ipv4/sysctl_net_ipv4.c10
-rw-r--r--net/ipv4/tcp_cubic.c8
-rw-r--r--net/ipv4/tcp_input.c84
-rw-r--r--net/ipv4/tcp_ipv4.c7
-rw-r--r--net/ipv4/tcp_minisocks.c7
-rw-r--r--net/ipv4/udp.c102
25 files changed, 346 insertions, 199 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8095e82de808..21f46ee7b6e9 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1309,8 +1309,6 @@ int inet_sk_rebuild_header(struct sock *sk)
{
struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0));
struct inet_sock *inet = inet_sk(sk);
- __be32 daddr;
- struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
int err;
@@ -1319,17 +1317,9 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
- rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
- rcu_read_unlock();
fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
- inet->inet_dport, inet->inet_sport,
- sk->sk_protocol, ip_sock_rt_tos(sk),
- sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (!IS_ERR(rt)) {
err = 0;
sk_setup_caps(sk, &rt->dst);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 4aca1f05edd3..4b5bc6eb52e7 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -102,8 +102,6 @@ EXPORT_SYMBOL(ip4_datagram_connect);
void ip4_datagram_release_cb(struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct ip_options_rcu *inet_opt;
- __be32 daddr = inet->inet_daddr;
struct dst_entry *dst;
struct flowi4 fl4;
struct rtable *rt;
@@ -115,14 +113,9 @@ void ip4_datagram_release_cb(struct sock *sk)
rcu_read_unlock();
return;
}
- inet_opt = rcu_dereference(inet->inet_opt);
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
- rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
- inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_protocol,
- ip_sock_rt_tos(sk), sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, &fl4);
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
dst = !IS_ERR(rt) ? &rt->dst : NULL;
sk_dst_set(sk, dst);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index f3281312eb5e..b0fbf804bbba 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -816,7 +816,8 @@ int esp_input_done2(struct sk_buff *skb, int err)
}
skb_pull_rcsum(skb, hlen);
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ if (x->props.mode == XFRM_MODE_TUNNEL ||
+ x->props.mode == XFRM_MODE_IPTFS)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -ihl);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 8325224ef072..9517b8667e00 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -249,6 +249,12 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+ if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) {
+ NL_SET_ERR_MSG(extack,
+ "Flow label cannot be specified for IPv4 FIB rules");
+ goto errout;
+ }
+
if (!inet_validate_dscp(frh->tos)) {
NL_SET_ERR_MSG(extack,
"Invalid dsfield (tos): ECN bits must be 0");
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 161f5526b86c..d6411ac81096 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2999,7 +2999,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
+ "%u\t%08X\t%d\t%u\t%u",
nhc->nhc_dev ? nhc->nhc_dev->name : "*",
prefix, gw, flags, 0, 0,
fi->fib_priority,
@@ -3011,7 +3011,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
} else {
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
+ "%u\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
mask, 0, 0, 0);
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 963a89ae9c26..094084b61bff 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -312,7 +312,6 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
struct dst_entry *dst = &rt->dst;
struct inet_peer *peer;
bool rc = true;
- int vif;
if (!apply_ratelimit)
return true;
@@ -321,12 +320,12 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
goto out;
- vif = l3mdev_master_ifindex(dst->dev);
- peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+ rcu_read_lock();
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
+ l3mdev_master_ifindex_rcu(dst->dev));
rc = inet_peer_xrlim_allow(peer,
READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
- if (peer)
- inet_putpeer(peer);
+ rcu_read_unlock();
out:
if (!rc)
__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6a238398acc9..3da126cea884 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -88,6 +88,8 @@
#include <linux/byteorder/generic.h>
#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/addrconf.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -1430,6 +1432,65 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
*mc_hash = im->next_hash;
}
+static int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev,
+ const struct ip_mc_list *im, int event)
+{
+ struct ifa_cacheinfo ci;
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct ifaddrmsg), 0);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = 32;
+ ifm->ifa_flags = IFA_F_PERMANENT;
+ ifm->ifa_scope = RT_SCOPE_UNIVERSE;
+ ifm->ifa_index = dev->ifindex;
+
+ ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ;
+ ci.tstamp = ci.cstamp;
+ ci.ifa_prefered = INFINITY_LIFE_TIME;
+ ci.ifa_valid = INFINITY_LIFE_TIME;
+
+ if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 ||
+ nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static void inet_ifmcaddr_notify(struct net_device *dev,
+ const struct ip_mc_list *im, int event)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(__be32)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet_fill_ifmcaddr(skb, dev, im, event);
+ if (err < 0) {
+ WARN_ON_ONCE(err == -EMSGSIZE);
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err);
+}
/*
* A socket has joined a multicast group on device dev.
@@ -1473,6 +1534,8 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
im->interface = in_dev;
in_dev_hold(in_dev);
im->multiaddr = addr;
+ im->mca_cstamp = jiffies;
+ im->mca_tstamp = im->mca_cstamp;
/* initial mode is (EX, empty) */
im->sfmode = mode;
im->sfcount[mode] = 1;
@@ -1492,6 +1555,7 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
igmpv3_del_delrec(in_dev, im);
#endif
igmp_group_added(im);
+ inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
out:
@@ -1705,6 +1769,8 @@ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
*ip = i->next_rcu;
in_dev->mc_count--;
__igmp_group_dropped(i, gfp);
+ inet_ifmcaddr_notify(in_dev->dev, i,
+ RTM_DELMULTICAST);
ip_mc_clear_src(i);
if (!in_dev->dead)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6872b5aff73e..e4decfb270fa 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1561,20 +1561,13 @@ EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct ip_options_rcu *inet_opt;
- __be32 daddr = inet->inet_daddr;
struct flowi4 *fl4;
struct rtable *rt;
rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
fl4 = &fl->u.ip4;
- rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
- inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_protocol,
- ip_sock_rt_tos(sk), sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (IS_ERR(rt))
rt = NULL;
if (rt)
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 5ab56f4cb529..b8b23a77ceb4 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -95,6 +95,7 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
{
struct rb_node **pp, *parent, *next;
struct inet_peer *p;
+ u32 now;
pp = &base->rb_root.rb_node;
parent = NULL;
@@ -108,8 +109,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
p = rb_entry(parent, struct inet_peer, rb_node);
cmp = inetpeer_addr_cmp(daddr, &p->daddr);
if (cmp == 0) {
- if (!refcount_inc_not_zero(&p->refcnt))
- break;
+ now = jiffies;
+ if (READ_ONCE(p->dtime) != now)
+ WRITE_ONCE(p->dtime, now);
return p;
}
if (gc_stack) {
@@ -150,9 +152,6 @@ static void inet_peer_gc(struct inet_peer_base *base,
for (i = 0; i < gc_cnt; i++) {
p = gc_stack[i];
- /* The READ_ONCE() pairs with the WRITE_ONCE()
- * in inet_putpeer()
- */
delta = (__u32)jiffies - READ_ONCE(p->dtime);
if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
@@ -168,31 +167,23 @@ static void inet_peer_gc(struct inet_peer_base *base,
}
}
+/* Must be called under RCU : No refcount change is done here. */
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
- const struct inetpeer_addr *daddr,
- int create)
+ const struct inetpeer_addr *daddr)
{
struct inet_peer *p, *gc_stack[PEER_MAX_GC];
struct rb_node **pp, *parent;
unsigned int gc_cnt, seq;
- int invalidated;
/* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
- rcu_read_lock();
seq = read_seqbegin(&base->lock);
p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
- invalidated = read_seqretry(&base->lock, seq);
- rcu_read_unlock();
if (p)
return p;
- /* If no writer did a change during our lookup, we can return early. */
- if (!create && !invalidated)
- return NULL;
-
/* retry an exact lookup, taking the lock before.
* At least, nodes should be hot in our cache.
*/
@@ -201,12 +192,12 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
gc_cnt = 0;
p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
- if (!p && create) {
+ if (!p) {
p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
if (p) {
p->daddr = *daddr;
p->dtime = (__u32)jiffies;
- refcount_set(&p->refcnt, 2);
+ refcount_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
@@ -231,15 +222,9 @@ EXPORT_SYMBOL_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
- /* The WRITE_ONCE() pairs with itself (we run lockless)
- * and the READ_ONCE() in inet_peer_gc()
- */
- WRITE_ONCE(p->dtime, (__u32)jiffies);
-
if (refcount_dec_and_test(&p->refcnt))
kfree_rcu(p, rcu);
}
-EXPORT_SYMBOL_GPL(inet_putpeer);
/*
* Check transmit rate limitation for given message.
@@ -261,23 +246,27 @@ EXPORT_SYMBOL_GPL(inet_putpeer);
#define XRLIM_BURST_FACTOR 6
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
{
- unsigned long now, token;
+ unsigned long now, token, otoken, delta;
bool rc = false;
if (!peer)
return true;
- token = peer->rate_tokens;
+ token = otoken = READ_ONCE(peer->rate_tokens);
now = jiffies;
- token += now - peer->rate_last;
- peer->rate_last = now;
- if (token > XRLIM_BURST_FACTOR * timeout)
- token = XRLIM_BURST_FACTOR * timeout;
+ delta = now - READ_ONCE(peer->rate_last);
+ if (delta) {
+ WRITE_ONCE(peer->rate_last, now);
+ token += delta;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ }
if (token >= timeout) {
token -= timeout;
rc = true;
}
- peer->rate_tokens = token;
+ if (token != otoken)
+ WRITE_ONCE(peer->rate_tokens, token);
return rc;
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 07036a2943c1..7a435746a22d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -82,15 +82,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
- struct net *net = q->fqdir->net;
-
const struct frag_v4_compare_key *key = a;
+ struct net *net = q->fqdir->net;
+ struct inet_peer *p = NULL;
q->key.v4 = *key;
qp->ecn = 0;
- qp->peer = q->fqdir->max_dist ?
- inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
- NULL;
+ if (q->fqdir->max_dist) {
+ rcu_read_lock();
+ p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif);
+ if (p && !refcount_inc_not_zero(&p->refcnt))
+ p = NULL;
+ rcu_read_unlock();
+ }
+ qp->peer = p;
}
static void ip4_frag_free(struct inet_frag_queue *q)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f1f31ebfc793..ed1b6b44faf8 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -924,15 +924,18 @@ static int ipgre_open(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
if (ipv4_is_multicast(t->parms.iph.daddr)) {
- struct flowi4 fl4;
+ struct flowi4 fl4 = {
+ .flowi4_oif = t->parms.link,
+ .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)),
+ .flowi4_scope = RT_SCOPE_UNIVERSE,
+ .flowi4_proto = IPPROTO_GRE,
+ .saddr = t->parms.iph.saddr,
+ .daddr = t->parms.iph.daddr,
+ .fl4_gre_key = t->parms.o_key,
+ };
struct rtable *rt;
- rt = ip_route_output_gre(t->net, &fl4,
- t->parms.iph.daddr,
- t->parms.iph.saddr,
- t->parms.o_key,
- t->parms.iph.tos & INET_DSCP_MASK,
- t->parms.link);
+ rt = ip_route_output_key(t->net, &fl4);
if (IS_ERR(rt))
return -EADDRNOTAVAIL;
dev = rt->dst.dev;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f0a4dda246ab..30a5e9460d00 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -314,7 +314,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_early_demux(struct sk_buff *skb);
int udp_v4_early_demux(struct sk_buff *skb);
-static int ip_rcv_finish_core(struct net *net, struct sock *sk,
+static int ip_rcv_finish_core(struct net *net,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
{
@@ -442,7 +442,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
+ ret = ip_rcv_finish_core(net, skb, dev, NULL);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -589,8 +589,7 @@ static struct sk_buff *ip_extract_route_hint(const struct net *net,
return skb;
}
-static void ip_list_rcv_finish(struct net *net, struct sock *sk,
- struct list_head *head)
+static void ip_list_rcv_finish(struct net *net, struct list_head *head)
{
struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
@@ -607,7 +606,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP)
continue;
dst = skb_dst(skb);
@@ -633,7 +632,7 @@ static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
{
NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
head, dev, NULL, ip_rcv_finish);
- ip_list_rcv_finish(net, NULL, head);
+ ip_list_rcv_finish(net, head);
}
/* Receive a list of IP packets */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0065b1996c94..ea7a260bec8a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -478,24 +478,16 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
/* Make sure we can route this packet. */
rt = dst_rtable(__sk_dst_check(sk, 0));
if (!rt) {
- __be32 daddr;
+ inet_sk_init_flowi4(inet, fl4);
- /* Use correct destination address if we have options. */
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
+ /* sctp_v4_xmit() uses its own DSCP value */
+ fl4->flowi4_tos = tos & INET_DSCP_MASK;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(net, fl4, sk,
- daddr, inet->inet_saddr,
- inet->inet_dport,
- inet->inet_sport,
- sk->sk_protocol,
- tos & INET_DSCP_MASK,
- sk->sk_bound_dev_if);
+ rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &rt->dst);
@@ -1169,7 +1161,10 @@ alloc_new_skb:
/* [!] NOTE: copy will be negative if pagedlen>0
* because then the equation reduces to -fraggap.
*/
- if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ if (copy > 0 &&
+ INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
@@ -1213,8 +1208,9 @@ alloc_new_skb:
unsigned int off;
off = skb->len;
- if (getfrag(from, skb_put(skb, copy),
- offset, copy, off, skb) < 0) {
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
@@ -1252,7 +1248,8 @@ alloc_new_skb:
get_page(pfrag->page);
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- if (getfrag(from,
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from,
page_address(pfrag->page) + pfrag->offset,
offset, copy, skb->len, skb) < 0)
goto error_efault;
@@ -1328,7 +1325,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
cork->mark = ipc->sockc.mark;
- cork->priority = ipc->priority;
+ cork->priority = ipc->sockc.priority;
cork->transmit_time = ipc->sockc.transmit_time;
cork->tx_flags = 0;
sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags);
@@ -1465,7 +1462,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
ip_options_build(skb, opt, cork->addr, rt);
}
- skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
+ skb->priority = cork->priority;
skb->mark = cork->mark;
if (sk_is_tcp(sk))
skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cf377377b52d..6d9c5c20b1c4 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -128,20 +128,20 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
{
- char *secdata;
- u32 seclen, secid;
+ struct lsm_context ctx;
+ u32 secid;
int err;
err = security_socket_getpeersec_dgram(NULL, skb, &secid);
if (err)
return;
- err = security_secid_to_secctx(secid, &secdata, &seclen);
- if (err)
+ err = security_secid_to_secctx(secid, &ctx);
+ if (err < 0)
return;
- put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
- security_release_secctx(secdata, seclen);
+ put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context);
+ security_release_secctx(&ctx);
}
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
@@ -315,7 +315,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
if (val < 0 || val > 255)
return -EINVAL;
ipc->tos = val;
- ipc->priority = rt_tos2priority(ipc->tos);
+ ipc->sockc.priority = rt_tos2priority(ipc->tos);
break;
case IP_PROTOCOL:
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 99d8faa508e5..21ae7594a852 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -831,7 +831,7 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
cache->mfc_un.res.maxvif = vifi + 1;
}
}
- cache->mfc_un.res.lastuse = jiffies;
+ WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies);
}
static int vif_add(struct net *net, struct mr_table *mrt,
@@ -1681,9 +1681,9 @@ int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
rcu_read_lock();
c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr);
if (c) {
- sr->pktcnt = c->_c.mfc_un.res.pkt;
- sr->bytecnt = c->_c.mfc_un.res.bytes;
- sr->wrong_if = c->_c.mfc_un.res.wrong_if;
+ sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
rcu_read_unlock();
return 0;
}
@@ -1753,9 +1753,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
rcu_read_lock();
c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
if (c) {
- sr.pktcnt = c->_c.mfc_un.res.pkt;
- sr.bytecnt = c->_c.mfc_un.res.bytes;
- sr.wrong_if = c->_c.mfc_un.res.wrong_if;
+ sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
@@ -1988,9 +1988,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
int vif, ct;
vif = c->_c.mfc_parent;
- c->_c.mfc_un.res.pkt++;
- c->_c.mfc_un.res.bytes += skb->len;
- c->_c.mfc_un.res.lastuse = jiffies;
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
struct mfc_cache *cache_proxy;
@@ -2021,7 +2021,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
goto dont_forward;
}
- c->_c.mfc_un.res.wrong_if++;
+ atomic_long_inc(&c->_c.mfc_un.res.wrong_if);
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
@@ -3029,9 +3029,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
if (it->cache != &mrt->mfc_unres_queue) {
seq_printf(seq, " %8lu %8lu %8lu",
- mfc->_c.mfc_un.res.pkt,
- mfc->_c.mfc_un.res.bytes,
- mfc->_c.mfc_un.res.wrong_if);
+ atomic_long_read(&mfc->_c.mfc_un.res.pkt),
+ atomic_long_read(&mfc->_c.mfc_un.res.bytes),
+ atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));
for (n = mfc->_c.mfc_un.res.minvif;
n < mfc->_c.mfc_un.res.maxvif; n++) {
if (VIF_EXISTS(mrt, n) &&
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index f0af12a2f70b..03b6eee407a2 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -263,9 +263,9 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
lastuse = READ_ONCE(c->mfc_un.res.lastuse);
lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
- mfcs.mfcs_packets = c->mfc_un.res.pkt;
- mfcs.mfcs_bytes = c->mfc_un.res.bytes;
- mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+ mfcs.mfcs_packets = atomic_long_read(&c->mfc_un.res.pkt);
+ mfcs.mfcs_bytes = atomic_long_read(&c->mfc_un.res.bytes);
+ mfcs.mfcs_wrong_if = atomic_long_read(&c->mfc_un.res.wrong_if);
if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
RTA_PAD))
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 40053a02bae1..affd21a0f572 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+ SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK),
SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 0e9e01967ec9..4304a68d1db0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -358,7 +358,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb_reserve(skb, hlen);
skb->protocol = htons(ETH_P_IP);
- skb->priority = READ_ONCE(sk->sk_priority);
+ skb->priority = sockc->priority;
skb->mark = sockc->mark;
skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
skb_dst_set(skb, &rt->dst);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0fbec3509618..577b88a43293 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -870,11 +870,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
}
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
- rcu_read_unlock();
net = dev_net(rt->dst.dev);
- peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
if (!peer) {
+ rcu_read_unlock();
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
rt_nexthop(rt, ip_hdr(skb)->daddr));
return;
@@ -893,7 +893,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
*/
if (peer->n_redirects >= ip_rt_redirect_number) {
peer->rate_last = jiffies;
- goto out_put_peer;
+ goto out_unlock;
}
/* Check for load limit; set rate_last to the latest sent
@@ -914,8 +914,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
&ip_hdr(skb)->saddr, inet_iif(skb),
&ip_hdr(skb)->daddr, &gw);
}
-out_put_peer:
- inet_putpeer(peer);
+out_unlock:
+ rcu_read_unlock();
}
static int ip_error(struct sk_buff *skb)
@@ -975,9 +975,9 @@ static int ip_error(struct sk_buff *skb)
break;
}
+ rcu_read_lock();
peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
- l3mdev_master_ifindex(skb->dev), 1);
-
+ l3mdev_master_ifindex_rcu(skb->dev));
send = true;
if (peer) {
now = jiffies;
@@ -989,8 +989,9 @@ static int ip_error(struct sk_buff *skb)
peer->rate_tokens -= ip_rt_error_cost;
else
send = false;
- inet_putpeer(peer);
}
+ rcu_read_unlock();
+
if (send)
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -2445,6 +2446,7 @@ martian_destination:
net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
&daddr, &saddr, dev->name);
#endif
+ goto out;
e_nobufs:
reason = SKB_DROP_REASON_NOMEM;
@@ -3268,6 +3270,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct flowi4 fl4 = {};
__be32 dst = 0;
__be32 src = 0;
+ dscp_t dscp;
kuid_t uid;
u32 iif;
int err;
@@ -3282,6 +3285,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
dst = nla_get_in_addr_default(tb[RTA_DST], 0);
iif = nla_get_u32_default(tb[RTA_IIF], 0);
mark = nla_get_u32_default(tb[RTA_MARK], 0);
+ dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
if (tb[RTA_UID])
uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
else
@@ -3306,7 +3310,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK;
+ fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
@@ -3330,9 +3334,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_iif = iif; /* for rt_fill_info */
skb->dev = dev;
skb->mark = mark;
- err = ip_route_input_rcu(skb, dst, src,
- inet_dsfield_to_dscp(rtm->rtm_tos),
- dev, &res) ? -EINVAL : 0;
+ err = ip_route_input_rcu(skb, dst, src, dscp, dev,
+ &res) ? -EINVAL : 0;
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a79b2a52ce01..42cb5dc9cb24 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
+static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -1066,6 +1067,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = SYSCTL_TWO,
},
{
+ .procname = "tcp_tw_reuse_delay",
+ .data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &tcp_tw_reuse_delay_max,
+ },
+ {
.procname = "tcp_max_syn_backlog",
.data = &init_net.ipv4.sysctl_max_syn_backlog,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 5dbed91c6178..76c23675ae50 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -392,6 +392,10 @@ static void hystart_update(struct sock *sk, u32 delay)
if (after(tp->snd_una, ca->end_seq))
bictcp_hystart_reset(sk);
+ /* hystart triggers when cwnd is larger than some threshold */
+ if (tcp_snd_cwnd(tp) < hystart_low_window)
+ return;
+
if (hystart_detect & HYSTART_ACK_TRAIN) {
u32 now = bictcp_clock_us(sk);
@@ -467,9 +471,7 @@ __bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample
if (ca->delay_min == 0 || ca->delay_min > delay)
ca->delay_min = delay;
- /* hystart triggers when cwnd is larger than some threshold */
- if (!ca->found && tcp_in_slow_start(tp) && hystart &&
- tcp_snd_cwnd(tp) >= hystart_low_window)
+ if (!ca->found && tcp_in_slow_start(tp) && hystart)
hystart_update(sk, delay);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4811727b8a02..eb82e01da911 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4450,34 +4450,40 @@ static u32 tcp_tsval_replay(const struct sock *sk)
return inet_csk(sk)->icsk_rto * 1200 / HZ;
}
-static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
+ const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcphdr *th = tcp_hdr(skb);
- u32 seq = TCP_SKB_CB(skb)->seq;
+ SKB_DR_INIT(reason, TCP_RFC7323_PAWS);
u32 ack = TCP_SKB_CB(skb)->ack_seq;
+ u32 seq = TCP_SKB_CB(skb)->seq;
- return /* 1. Pure ACK with correct sequence number. */
- (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+ /* 1. Is this not a pure ACK ? */
+ if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq)
+ return reason;
- /* 2. ... and duplicate ACK. */
- ack == tp->snd_una &&
+ /* 2. Is its sequence not the expected one ? */
+ if (seq != tp->rcv_nxt)
+ return before(seq, tp->rcv_nxt) ?
+ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK :
+ reason;
- /* 3. ... and does not update window. */
- !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+ /* 3. Is this not a duplicate ACK ? */
+ if (ack != tp->snd_una)
+ return reason;
- /* 4. ... and sits in replay window. */
- (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <=
- tcp_tsval_replay(sk);
-}
+ /* 4. Is this updating the window ? */
+ if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) <<
+ tp->rx_opt.snd_wscale))
+ return reason;
-static inline bool tcp_paws_discard(const struct sock *sk,
- const struct sk_buff *skb)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
+ /* 5. Is this not in the replay window ? */
+ if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) >
+ tcp_tsval_replay(sk))
+ return reason;
- return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
- !tcp_disordered_ack(sk, skb);
+ return 0;
}
/* Check segment sequence number for validity.
@@ -5949,23 +5955,35 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
SKB_DR(reason);
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
- tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(sk, skb)) {
- if (!th->rst) {
- if (unlikely(th->syn))
- goto syn_challenge;
- NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
- if (!tcp_oow_rate_limited(sock_net(sk), skb,
- LINUX_MIB_TCPACKSKIPPEDPAWS,
- &tp->last_oow_ack_time))
- tcp_send_dupack(sk, skb);
- SKB_DR_SET(reason, TCP_RFC7323_PAWS);
- goto discard;
- }
- /* Reset is accepted even if it did not pass PAWS. */
+ if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) ||
+ !tp->rx_opt.saw_tstamp ||
+ tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW))
+ goto step1;
+
+ reason = tcp_disordered_ack_check(sk, skb);
+ if (!reason)
+ goto step1;
+ /* Reset is accepted even if it did not pass PAWS. */
+ if (th->rst)
+ goto step1;
+ if (unlikely(th->syn))
+ goto syn_challenge;
+
+ /* Old ACK are common, increment PAWS_OLD_ACK
+ * and do not send a dupack.
+ */
+ if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK);
+ goto discard;
}
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDPAWS,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ goto discard;
+step1:
/* Step 1: check sequence number */
reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
if (reason) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c26f6c4b7bb4..cc2b5194a18d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -120,6 +120,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
int ts_recent_stamp;
+ u32 reuse_thresh;
if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
reuse = 0;
@@ -162,9 +163,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+ reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
if (ts_recent_stamp &&
- (!twp || (reuse && time_after32(ktime_get_seconds(),
- ts_recent_stamp)))) {
+ (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
* and releasing the bucket lock.
*/
@@ -3457,6 +3459,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 2;
+ net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7121d8573928..b089b08e9617 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -157,8 +157,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
rcv_nxt);
if (tmp_opt.saw_tstamp) {
+ u64 ts = tcp_clock_ms();
+
+ WRITE_ONCE(tw->tw_entry_stamp, ts);
WRITE_ONCE(tcptw->tw_ts_recent_stamp,
- ktime_get_seconds());
+ div_u64(ts, MSEC_PER_SEC));
WRITE_ONCE(tcptw->tw_ts_recent,
tmp_opt.rcv_tsval);
}
@@ -316,6 +319,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_mark = sk->sk_mark;
tw->tw_priority = READ_ONCE(sk->sk_priority);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ /* refreshed when we enter true TIME-WAIT state */
+ tw->tw_entry_stamp = tcp_time_stamp_ms(tp);
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e8953e88efef..c472c9a57cf6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -420,6 +420,49 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
}
EXPORT_SYMBOL(udp_ehashfn);
+/**
+ * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port)
+ * @net: Network namespace
+ * @saddr: Source address, network order
+ * @sport: Source port, network order
+ * @daddr: Destination address, network order
+ * @hnum: Destination port, host order
+ * @dif: Destination interface index
+ * @sdif: Destination bridge port index, if relevant
+ * @udptable: Set of UDP hash tables
+ *
+ * Simplified lookup to be used as fallback if no sockets are found due to a
+ * potential race between (receive) address change, and lookup happening before
+ * the rehash operation. This function ignores SO_REUSEPORT groups while scoring
+ * result sockets, because if we have one, we don't need the fallback at all.
+ *
+ * Called under rcu_read_lock().
+ *
+ * Return: socket with highest matching score if any, NULL if none
+ */
+static struct sock *udp4_lib_lookup1(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ const struct udp_table *udptable)
+{
+ unsigned int slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot = &udptable->hash[slot];
+ struct sock *sk, *result = NULL;
+ int score, badness = 0;
+
+ sk_for_each_rcu(sk, &hslot->head) {
+ score = compute_score(sk, net,
+ saddr, sport, daddr, hnum, dif, sdif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ }
+ }
+
+ return result;
+}
+
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(const struct net *net,
__be32 saddr, __be16 sport,
@@ -533,7 +576,7 @@ begin:
return NULL;
}
-/* In hash4, rehash can happen in connect(), where hash4_cnt keeps unchanged. */
+/* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */
static void udp_rehash4(struct udp_table *udptable, struct sock *sk,
u16 newhash4)
{
@@ -582,15 +625,13 @@ void udp_lib_hash4(struct sock *sk, u16 hash)
struct net *net = sock_net(sk);
struct udp_table *udptable;
- /* Connected udp socket can re-connect to another remote address,
- * so rehash4 is needed.
+ /* Connected udp socket can re-connect to another remote address, which
+ * will be handled by rehash. Thus no need to redo hash4 here.
*/
- udptable = net->ipv4.udp_table;
- if (udp_hashed4(sk)) {
- udp_rehash4(udptable, sk, hash);
+ if (udp_hashed4(sk))
return;
- }
+ udptable = net->ipv4.udp_table;
hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
hslot4 = udp_hashslot4(udptable, hash);
@@ -683,6 +724,19 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
htonl(INADDR_ANY), hnum, dif, sdif,
hslot2, skb);
+ if (!IS_ERR_OR_NULL(result))
+ goto done;
+
+ /* Primary hash (destination port) lookup as fallback for this race:
+ * 1. __ip4_datagram_connect() sets sk_rcv_saddr
+ * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet
+ * 3. rehash operation updating _secondary and four-tuple_ hashes
+ * The primary hash doesn't need an update after 1., so, thanks to this
+ * further step, 1. and 3. don't need to be atomic against the lookup.
+ */
+ result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif,
+ udptable);
+
done:
if (IS_ERR(result))
return NULL;
@@ -2173,14 +2227,14 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4)
struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2, *nhslot2;
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
nhslot2 = udp_hashslot2(udptable, newhash);
udp_sk(sk)->udp_portaddr_hash = newhash;
if (hslot2 != nhslot2 ||
rcu_access_pointer(sk->sk_reuseport_cb)) {
- hslot = udp_hashslot(udptable, sock_net(sk),
- udp_sk(sk)->udp_port_hash);
/* we must lock primary chain too */
spin_lock_bh(&hslot->lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
@@ -2199,19 +2253,29 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4)
spin_unlock(&nhslot2->lock);
}
- if (udp_hashed4(sk)) {
- udp_rehash4(udptable, sk, newhash4);
+ spin_unlock_bh(&hslot->lock);
+ }
+
+ /* Now process hash4 if necessary:
+ * (1) update hslot4;
+ * (2) update hslot2->hash4_cnt.
+ * Note that hslot2/hslot4 should be checked separately, as
+ * either of them may change with the other unchanged.
+ */
+ if (udp_hashed4(sk)) {
+ spin_lock_bh(&hslot->lock);
- if (hslot2 != nhslot2) {
- spin_lock(&hslot2->lock);
- udp_hash4_dec(hslot2);
- spin_unlock(&hslot2->lock);
+ udp_rehash4(udptable, sk, newhash4);
+ if (hslot2 != nhslot2) {
+ spin_lock(&hslot2->lock);
+ udp_hash4_dec(hslot2);
+ spin_unlock(&hslot2->lock);
- spin_lock(&nhslot2->lock);
- udp_hash4_inc(nhslot2);
- spin_unlock(&nhslot2->lock);
- }
+ spin_lock(&nhslot2->lock);
+ udp_hash4_inc(nhslot2);
+ spin_unlock(&nhslot2->lock);
}
+
spin_unlock_bh(&hslot->lock);
}
}