Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c                 5
-rw-r--r--  net/ipv4/inet_connection_sock.c   48
-rw-r--r--  net/ipv4/inet_diag.c              86
-rw-r--r--  net/ipv4/inet_hashtables.c        27
-rw-r--r--  net/ipv4/ip_gre.c                 11
-rw-r--r--  net/ipv4/ip_sockglue.c            39
-rw-r--r--  net/ipv4/ipmr.c                   13
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c        18
-rw-r--r--  net/ipv4/tcp.c                   122
-rw-r--r--  net/ipv4/tcp_ao.c                 17
-rw-r--r--  net/ipv4/tcp_input.c              62
-rw-r--r--  net/ipv4/tcp_ipv4.c                4
-rw-r--r--  net/ipv4/tcp_minisocks.c           2
-rw-r--r--  net/ipv4/tcp_output.c             21
14 files changed, 353 insertions(+), 122 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index fb81de10d332..835f4f9d98d2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1633,6 +1633,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
#endif
return -EINVAL;
}
+EXPORT_SYMBOL(inet_recv_error);
int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
@@ -1847,9 +1848,7 @@ static __net_init int inet_init_net(struct net *net)
/*
* Set defaults for local port range
*/
- seqlock_init(&net->ipv4.ip_local_ports.lock);
- net->ipv4.ip_local_ports.range[0] = 32768;
- net->ipv4.ip_local_ports.range[1] = 60999;
+ net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u;
seqlock_init(&net->ipv4.ping_group_range.lock);
/*
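The af_inet.c hunk above replaces the seqlock-protected two-element array with a single u32 that packs the high port into the upper 16 bits and the low port into the lower 16 bits, so both bounds can be read with one READ_ONCE(). A minimal standalone sketch of that packing, with hypothetical helper names that are not part of the patch:

#include <stdint.h>
#include <stdio.h>

static uint32_t pack_port_range(uint16_t low, uint16_t high)
{
	return (uint32_t)high << 16 | low;
}

static void unpack_port_range(uint32_t range, int *low, int *high)
{
	*low = range & 0xffff;
	*high = range >> 16;
}

int main(void)
{
	uint32_t range = pack_port_range(32768, 60999);
	int lo, hi;

	unpack_port_range(range, &lo, &hi);
	printf("%d-%d\n", lo, hi);	/* 32768-60999, the default range */
	return 0;
}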
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 394a498c2823..bd325b029dd1 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -117,37 +117,39 @@ bool inet_rcv_saddr_any(const struct sock *sk)
return !sk->sk_rcv_saddr;
}
-void inet_get_local_port_range(const struct net *net, int *low, int *high)
-{
- unsigned int seq;
-
- do {
- seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
-
- *low = net->ipv4.ip_local_ports.range[0];
- *high = net->ipv4.ip_local_ports.range[1];
- } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
-}
-EXPORT_SYMBOL(inet_get_local_port_range);
-
-void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
+/**
+ * inet_sk_get_local_port_range - fetch ephemeral ports range
+ * @sk: socket
+ * @low: pointer to low port
+ * @high: pointer to high port
+ *
+ * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range)
+ * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option.
+ * Returns true if IP_LOCAL_PORT_RANGE was set on this socket.
+ */
+bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
{
- const struct inet_sock *inet = inet_sk(sk);
- const struct net *net = sock_net(sk);
int lo, hi, sk_lo, sk_hi;
+ bool local_range = false;
+ u32 sk_range;
- inet_get_local_port_range(net, &lo, &hi);
+ inet_get_local_port_range(sock_net(sk), &lo, &hi);
- sk_lo = inet->local_port_range.lo;
- sk_hi = inet->local_port_range.hi;
+ sk_range = READ_ONCE(inet_sk(sk)->local_port_range);
+ if (unlikely(sk_range)) {
+ sk_lo = sk_range & 0xffff;
+ sk_hi = sk_range >> 16;
- if (unlikely(lo <= sk_lo && sk_lo <= hi))
- lo = sk_lo;
- if (unlikely(lo <= sk_hi && sk_hi <= hi))
- hi = sk_hi;
+ if (lo <= sk_lo && sk_lo <= hi)
+ lo = sk_lo;
+ if (lo <= sk_hi && sk_hi <= hi)
+ hi = sk_hi;
+ local_range = true;
+ }
*low = lo;
*high = hi;
+ return local_range;
}
EXPORT_SYMBOL(inet_sk_get_local_port_range);
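The rewritten inet_sk_get_local_port_range() only lets the per-socket range narrow the netns range: an endpoint set via IP_LOCAL_PORT_RANGE takes effect only if it lies inside the netns range, and an endpoint of 0 is ignored because it fails the bounds check. A small standalone sketch of that rule with assumed sample values (not from the patch):

static void clamp_range(int *lo, int *hi, int sk_lo, int sk_hi)
{
	if (*lo <= sk_lo && sk_lo <= *hi)
		*lo = sk_lo;
	if (*lo <= sk_hi && sk_hi <= *hi)
		*hi = sk_hi;
}

/*
 * netns 32768..60999, socket 40000..50000 -> effective 40000..50000
 * netns 32768..60999, socket     0..45000 -> effective 32768..45000
 *   (sk_lo == 0 fails the first bounds check, so only the high end moves)
 */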
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7d0e7aaa71e0..46b13962ad02 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1077,10 +1077,94 @@ skip_listen_ht:
s_i = num = s_num = 0;
}
+/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
+ * with bh disabled.
+ */
+#define SKARR_SZ 16
+
+ /* Dump bound but inactive (not listening, connecting, etc.) sockets */
+ if (cb->args[0] == 1) {
+ if (!(idiag_states & TCPF_BOUND_INACTIVE))
+ goto skip_bind_ht;
+
+ for (i = s_i; i < hashinfo->bhash_size; i++) {
+ struct inet_bind_hashbucket *ibb;
+ struct inet_bind2_bucket *tb2;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+resume_bind_walk:
+ num = 0;
+ accum = 0;
+ ibb = &hashinfo->bhash2[i];
+
+ spin_lock_bh(&ibb->lock);
+ inet_bind_bucket_for_each(tb2, &ibb->chain) {
+ if (!net_eq(ib2_net(tb2), net))
+ continue;
+
+ sk_for_each_bound_bhash2(sk, &tb2->owners) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_bind;
+
+ if (sk->sk_state != TCP_CLOSE ||
+ !inet->inet_num)
+ goto next_bind;
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ r->sdiag_family != sk->sk_family)
+ goto next_bind;
+
+ if (!inet_diag_bc_sk(bc, sk))
+ goto next_bind;
+
+ sock_hold(sk);
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ goto pause_bind_walk;
+next_bind:
+ num++;
+ }
+ }
+pause_bind_walk:
+ spin_unlock_bh(&ibb->lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = inet_sk_diag_fill(sk_arr[idx],
+ NULL, skb, cb,
+ r, NLM_F_MULTI,
+ net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ goto done;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto resume_bind_walk;
+ }
+
+ s_num = 0;
+ }
+skip_bind_ht:
+ cb->args[0] = 2;
+ s_i = num = s_num = 0;
+ }
+
if (!(idiag_states & ~TCPF_LISTEN))
goto out;
-#define SKARR_SZ 16
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
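The new bound-inactive dump reuses the SKARR_SZ batching already used for the ehash walk: take up to 16 socket references while holding the bucket lock with BHs disabled, drop the lock, then build the netlink records and release the references. A rough userspace analogue of that shape (illustrative only, no kernel APIs):

#include <pthread.h>

#define BATCH 16

struct item { struct item *next; int id; };

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *bucket_head;

static void walk_bucket(void (*work)(struct item *))
{
	int done = 0;	/* resume position, like cb->args[]/s_num above */

	for (;;) {
		struct item *snap[BATCH], *it;
		int n = 0, pos = 0, i;

		pthread_mutex_lock(&bucket_lock);
		for (it = bucket_head; it && n < BATCH; it = it->next) {
			if (pos++ < done)
				continue;
			snap[n++] = it;	/* the kernel walk takes sock_hold() here */
		}
		pthread_mutex_unlock(&bucket_lock);

		for (i = 0; i < n; i++)
			work(snap[i]);	/* expensive work runs without the lock */

		done += n;
		if (n < BATCH)
			break;
	}
}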
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index a532f749e477..9ff201bc4e6d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -1012,7 +1012,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
bool tb_created = false;
u32 remaining, offset;
int ret, i, low, high;
- int l3mdev;
+ bool local_ports;
+ int step, l3mdev;
u32 index;
if (port) {
@@ -1024,10 +1025,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
l3mdev = inet_sk_bound_l3mdev(sk);
- inet_sk_get_local_port_range(sk, &low, &high);
+ local_ports = inet_sk_get_local_port_range(sk, &low, &high);
+ step = local_ports ? 1 : 2;
+
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
- if (likely(remaining > 1))
+ if (!local_ports && remaining > 1)
remaining &= ~1U;
get_random_sleepable_once(table_perturb,
@@ -1040,10 +1043,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
/* In first pass we try ports of @low parity.
* inet_csk_get_port() does the opposite choice.
*/
- offset &= ~1U;
+ if (!local_ports)
+ offset &= ~1U;
other_parity_scan:
port = low + offset;
- for (i = 0; i < remaining; i += 2, port += 2) {
+ for (i = 0; i < remaining; i += step, port += step) {
if (unlikely(port >= high))
port -= remaining;
if (inet_is_local_reserved_port(net, port))
@@ -1083,10 +1087,11 @@ next_port:
cond_resched();
}
- offset++;
- if ((offset & 1) && remaining > 1)
- goto other_parity_scan;
-
+ if (!local_ports) {
+ offset++;
+ if ((offset & 1) && remaining > 1)
+ goto other_parity_scan;
+ }
return -EADDRNOTAVAIL;
ok:
@@ -1109,8 +1114,8 @@ ok:
* on low contention the randomness is maximal and on high contention
* it may be inexistent.
*/
- i = max_t(int, i, get_random_u32_below(8) * 2);
- WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
+ i = max_t(int, i, get_random_u32_below(8) * step);
+ WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);
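In __inet_hash_connect() the default walk uses step 2 because connect() probes only ports of one parity and leaves the other parity to inet_csk_get_port(); once the socket carries its own IP_LOCAL_PORT_RANGE the walk switches to step 1 so a narrow user-supplied range is not cut in half. A tiny sketch of what the step means for the number of candidate ports, using assumed values:

static unsigned int candidate_ports(unsigned int low, unsigned int high, unsigned int step)
{
	unsigned int remaining = high + 1 - low;	/* ports in [low, high] */
	unsigned int i, n = 0;

	for (i = 0; i < remaining; i += step)
		n++;
	return n;
}

/* candidate_ports(40000, 40009, 2) == 5    (default: one parity is left to bind())
 * candidate_ports(40000, 40009, 1) == 10   (socket has its own IP_LOCAL_PORT_RANGE)
 */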
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 22a26d1d29a0..5169c3c72cff 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -635,15 +635,18 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
}
if (dev->header_ops) {
+ int pull_len = tunnel->hlen + sizeof(struct iphdr);
+
if (skb_cow_head(skb, 0))
goto free_skb;
tnl_params = (const struct iphdr *)skb->data;
- /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
- * to gre header.
- */
- skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+ if (!pskb_network_may_pull(skb, pull_len))
+ goto free_skb;
+
+ /* ip_tunnel_xmit() needs skb->data pointing to gre header. */
+ skb_pull(skb, pull_len);
skb_reset_mac_header(skb);
if (skb->ip_summed == CHECKSUM_PARTIAL &&
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 2efc53526a38..66247e8b429e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -775,7 +775,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen)
if (optlen < GROUP_FILTER_SIZE(0))
return -EINVAL;
- if (optlen > READ_ONCE(sysctl_optmem_max))
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
return -ENOBUFS;
gsf = memdup_sockptr(optval, optlen);
@@ -811,7 +811,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
if (optlen < size0)
return -EINVAL;
- if (optlen > READ_ONCE(sysctl_optmem_max) - 4)
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4)
return -ENOBUFS;
p = kmalloc(optlen + 4, GFP_KERNEL);
@@ -1055,6 +1055,19 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
case IP_TOS: /* This sets both TOS and Precedence */
ip_sock_set_tos(sk, val);
return 0;
+ case IP_LOCAL_PORT_RANGE:
+ {
+ u16 lo = val;
+ u16 hi = val >> 16;
+
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+ if (lo != 0 && hi != 0 && lo > hi)
+ return -EINVAL;
+
+ WRITE_ONCE(inet->local_port_range, val);
+ return 0;
+ }
}
err = 0;
@@ -1241,7 +1254,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
if (optlen < IP_MSFILTER_SIZE(0))
goto e_inval;
- if (optlen > READ_ONCE(sysctl_optmem_max)) {
+ if (optlen > READ_ONCE(net->core.sysctl_optmem_max)) {
err = -ENOBUFS;
break;
}
@@ -1332,20 +1345,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
- case IP_LOCAL_PORT_RANGE:
- {
- const __u16 lo = val;
- const __u16 hi = val >> 16;
-
- if (optlen != sizeof(__u32))
- goto e_inval;
- if (lo != 0 && hi != 0 && lo > hi)
- goto e_inval;
-
- inet->local_port_range.lo = lo;
- inet->local_port_range.hi = hi;
- break;
- }
default:
err = -ENOPROTOOPT;
break;
@@ -1692,6 +1691,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
+ case IP_LOCAL_PORT_RANGE:
+ val = READ_ONCE(inet->local_port_range);
+ goto copyval;
}
if (needs_rtnl)
@@ -1721,9 +1723,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
else
err = ip_get_mcast_msfilter(sk, optval, optlen, len);
goto out;
- case IP_LOCAL_PORT_RANGE:
- val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
- break;
case IP_PROTOCOL:
val = inet_sk(sk)->inet_num;
break;
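For reference, the IP_LOCAL_PORT_RANGE option handled above takes a single u32 from userspace, low port in bits 0-15 and high port in bits 16-31, with 0 meaning "leave that bound alone". A hedged userspace sketch; it assumes headers that define IP_LOCAL_PORT_RANGE, and the fallback value 51 is an assumption for older headers:

#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IP_LOCAL_PORT_RANGE
#define IP_LOCAL_PORT_RANGE 51	/* assumed value for old headers */
#endif

int main(void)
{
	uint32_t range = (uint32_t)50000 << 16 | 40000;	/* 40000..50000 */
	socklen_t len = sizeof(range);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range)) < 0) {
		perror("setsockopt(IP_LOCAL_PORT_RANGE)");
		return 1;
	}
	range = 0;
	getsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &range, &len);
	printf("low=%u high=%u\n", range & 0xffff, range >> 16);
	close(fd);
	return 0;
}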
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9e222a57bc2b..0063a237253b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1025,6 +1025,10 @@ static int ipmr_cache_report(const struct mr_table *mrt,
struct sk_buff *skb;
int ret;
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (!mroute_sk)
+ return -EINVAL;
+
if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
@@ -1069,7 +1073,8 @@ static int ipmr_cache_report(const struct mr_table *mrt,
msg = (struct igmpmsg *)skb_network_header(skb);
msg->im_vif = vifi;
msg->im_vif_hi = vifi >> 8;
- skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+ ipv4_pktinfo_prepare(mroute_sk, pkt);
+ memcpy(skb->cb, pkt->cb, sizeof(skb->cb));
/* Add our header */
igmp = skb_put(skb, sizeof(struct igmphdr));
igmp->type = assert;
@@ -1079,12 +1084,6 @@ static int ipmr_cache_report(const struct mr_table *mrt,
skb->transport_header = skb->network_header;
}
- mroute_sk = rcu_dereference(mrt->mroute_sk);
- if (!mroute_sk) {
- kfree_skb(skb);
- return -EINVAL;
- }
-
igmpmsg_netlink_event(mrt, skb);
/* Deliver to mrouted */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index f63a545a7374..7e4f16a7dcc1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -50,26 +50,22 @@ static int tcp_plb_max_cong_thresh = 256;
static int sysctl_tcp_low_latency __read_mostly;
/* Update system visible IP port range */
-static void set_local_port_range(struct net *net, int range[2])
+static void set_local_port_range(struct net *net, unsigned int low, unsigned int high)
{
- bool same_parity = !((range[0] ^ range[1]) & 1);
+ bool same_parity = !((low ^ high) & 1);
- write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
if (same_parity && !net->ipv4.ip_local_ports.warned) {
net->ipv4.ip_local_ports.warned = true;
pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
}
- net->ipv4.ip_local_ports.range[0] = range[0];
- net->ipv4.ip_local_ports.range[1] = range[1];
- write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
+ WRITE_ONCE(net->ipv4.ip_local_ports.range, high << 16 | low);
}
/* Validate changes from /proc interface. */
static int ipv4_local_port_range(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- struct net *net =
- container_of(table->data, struct net, ipv4.ip_local_ports.range);
+ struct net *net = table->data;
int ret;
int range[2];
struct ctl_table tmp = {
@@ -93,7 +89,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
(range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock)))
ret = -EINVAL;
else
- set_local_port_range(net, range);
+ set_local_port_range(net, range[0], range[1]);
}
return ret;
@@ -733,8 +729,8 @@ static struct ctl_table ipv4_net_table[] = {
},
{
.procname = "ip_local_port_range",
- .maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
- .data = &init_net.ipv4.ip_local_ports.range,
+ .maxlen = 0,
+ .data = &init_net,
.mode = 0644,
.proc_handler = ipv4_local_port_range,
},
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53bcc17c91e4..1d6b80145efb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2605,6 +2605,7 @@ void tcp_set_state(struct sock *sk, int state)
BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+ BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE);
BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
/* bpf uapi header bpf.h defines an anonymous enum with values
@@ -3368,9 +3369,25 @@ int tcp_set_window_clamp(struct sock *sk, int val)
return -EINVAL;
tp->window_clamp = 0;
} else {
- tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
- SOCK_MIN_RCVBUF / 2 : val;
- tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
+ u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp;
+ u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
+ SOCK_MIN_RCVBUF / 2 : val;
+
+ if (new_window_clamp == old_window_clamp)
+ return 0;
+
+ tp->window_clamp = new_window_clamp;
+ if (new_window_clamp < old_window_clamp) {
+ /* need to apply the reserved mem provisioning only
+ * when shrinking the window clamp
+ */
+ __tcp_adjust_rcv_ssthresh(sk, tp->window_clamp);
+
+ } else {
+ new_rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
+ tp->rcv_ssthresh = max(new_rcv_ssthresh,
+ tp->rcv_ssthresh);
+ }
}
return 0;
}
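With the reworked tcp_set_window_clamp() above, shrinking the clamp immediately caps rcv_ssthresh via __tcp_adjust_rcv_ssthresh() (which also accounts for reserved receive memory), while growing it only lets rcv_ssthresh ratchet up to min(rcv_wnd, window_clamp). For example, with rcv_wnd = 30000, raising the clamp from 20000 to 100000 lifts rcv_ssthresh to at most 30000, not 100000.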
@@ -3594,6 +3611,10 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
break;
case TCP_AO_REPAIR:
+ if (!tcp_can_repair_sock(sk)) {
+ err = -EPERM;
+ break;
+ }
err = tcp_ao_set_repair(sk, optval, optlen);
break;
#ifdef CONFIG_TCP_AO
@@ -4293,6 +4314,8 @@ zerocopy_rcv_out:
}
#endif
case TCP_AO_REPAIR:
+ if (!tcp_can_repair_sock(sk))
+ return -EPERM;
return tcp_ao_get_repair(sk, optval, optlen);
case TCP_AO_GET_KEYS:
case TCP_AO_INFO: {
@@ -4563,6 +4586,97 @@ static void __init tcp_init_mem(void)
sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
}
+static void __init tcp_struct_check(void)
+{
+ /* TX read-mostly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40);
+
+ /* TXRX read-mostly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 31);
+
+ /* RX read-mostly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69);
+
+ /* TX read-write hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 113);
+
+ /* TXRX read-write hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76);
+
+ /* RX read-write hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99);
+}
+
void __init tcp_init(void)
{
int max_rshare, max_wshare, cnt;
@@ -4573,6 +4687,8 @@ void __init tcp_init(void)
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
sizeof_field(struct sk_buff, cb));
+ tcp_struct_check();
+
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index c4cd1e09eb6b..87db432c6bb4 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -862,7 +862,7 @@ void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
#endif
treq->af_specific = &tcp_request_sock_ipv4_ops;
- treq->maclen = 0;
+ treq->used_tcp_ao = false;
if (tcp_parse_auth_options(th, NULL, &aoh) || !aoh)
return;
@@ -875,7 +875,7 @@ void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
treq->ao_rcv_next = aoh->keyid;
treq->ao_keyid = aoh->rnext_keyid;
- treq->maclen = tcp_ao_maclen(key);
+ treq->used_tcp_ao = true;
}
static enum skb_drop_reason
@@ -1112,7 +1112,7 @@ void tcp_ao_connect_init(struct sock *sk)
ao_info->current_key = key;
if (!ao_info->rnext_key)
ao_info->rnext_key = key;
- tp->tcp_header_len += tcp_ao_len(key);
+ tp->tcp_header_len += tcp_ao_len_aligned(key);
ao_info->lisn = htonl(tp->write_seq);
ao_info->snd_sne = 0;
@@ -1358,7 +1358,7 @@ static int tcp_ao_parse_crypto(struct tcp_ao_add *cmd, struct tcp_ao_key *key)
syn_tcp_option_space -= TCPOLEN_MSS_ALIGNED;
syn_tcp_option_space -= TCPOLEN_TSTAMP_ALIGNED;
syn_tcp_option_space -= TCPOLEN_WSCALE_ALIGNED;
- if (tcp_ao_len(key) > syn_tcp_option_space) {
+ if (tcp_ao_len_aligned(key) > syn_tcp_option_space) {
err = -EMSGSIZE;
goto err_kfree;
}
@@ -1620,6 +1620,15 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
if (!dev || !l3index)
return -EINVAL;
+ if (!bound_dev_if || bound_dev_if != cmd.ifindex) {
+ /* tcp_ao_established_key() doesn't expect having
+ * non peer-matching key on an established TCP-AO
+ * connection.
+ */
+ if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)))
+ return -EINVAL;
+ }
+
/* It's still possible to bind after adding keys or even
* re-bind to a different dev (with CAP_NET_RAW).
* So, no reason to return error here, rather try to be
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bcb55d98004c..df7b13f0e5e0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -202,23 +202,17 @@ static void bpf_skops_established(struct sock *sk, int bpf_op,
}
#endif
-static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
- unsigned int len)
+static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb,
+ unsigned int len)
{
- static bool __once __read_mostly;
+ struct net_device *dev;
- if (!__once) {
- struct net_device *dev;
-
- __once = true;
-
- rcu_read_lock();
- dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
- if (!dev || len >= dev->mtu)
- pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
- dev ? dev->name : "Unknown driver");
- rcu_read_unlock();
- }
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
+ if (!dev || len >= READ_ONCE(dev->mtu))
+ pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+ dev ? dev->name : "Unknown driver");
+ rcu_read_unlock();
}
/* Adapt the MSS value used to make delayed ack decision to the
@@ -250,9 +244,8 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
/* Account for possibly-removed options */
- if (unlikely(len > icsk->icsk_ack.rcv_mss +
- MAX_TCP_OPTION_SPACE))
- tcp_gro_dev_warn(sk, skb, len);
+ DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE,
+ tcp_gro_dev_warn, sk, skb, len);
/* If the skb has a len of exactly 1*MSS and has the PSH bit
* set then it is likely the end of an application write. So
* more data may not be arriving soon, and yet the data sender
@@ -3871,8 +3864,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* then we can probably ignore it.
*/
if (before(ack, prior_snd_una)) {
+ u32 max_window;
+
+ /* do not accept ACK for bytes we never sent. */
+ max_window = min_t(u64, tp->max_window, tp->bytes_acked);
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
- if (before(ack, prior_snd_una - tp->max_window)) {
+ if (before(ack, prior_snd_una - max_window)) {
if (!(flag & FLAG_NO_CHALLENGE_ACK))
tcp_send_challenge_ack(sk);
return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
@@ -4364,6 +4361,23 @@ EXPORT_SYMBOL(tcp_do_parse_auth_options);
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
+/* Estimates max number of increments of remote peer TSval in
+ * a replay window (based on our current RTO estimation).
+ */
+static u32 tcp_tsval_replay(const struct sock *sk)
+{
+ /* If we use usec TS resolution,
+ * then expect the remote peer to use the same resolution.
+ */
+ if (tcp_sk(sk)->tcp_usec_ts)
+ return inet_csk(sk)->icsk_rto * (USEC_PER_SEC / HZ);
+
+ /* RFC 7323 recommends a TSval clock between 1ms and 1sec.
+ * We know that some OS (including old linux) can use 1200 Hz.
+ */
+ return inet_csk(sk)->icsk_rto * 1200 / HZ;
+}
+
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -4371,7 +4385,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
- return (/* 1. Pure ACK with correct sequence number. */
+ return /* 1. Pure ACK with correct sequence number. */
(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
/* 2. ... and duplicate ACK. */
@@ -4381,7 +4395,8 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
/* 4. ... and sits in replay window. */
- (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
+ (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <=
+ tcp_tsval_replay(sk);
}
static inline bool tcp_paws_discard(const struct sock *sk,
@@ -7182,11 +7197,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
goto drop_and_release; /* Invalid TCP options */
if (aoh) {
- tcp_rsk(req)->maclen = aoh->length - sizeof(struct tcp_ao_hdr);
+ tcp_rsk(req)->used_tcp_ao = true;
tcp_rsk(req)->ao_rcv_next = aoh->keyid;
tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;
+
} else {
- tcp_rsk(req)->maclen = 0;
+ tcp_rsk(req)->used_tcp_ao = false;
}
#endif
tcp_rsk(req)->snt_isn = isn;
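The tcp_tsval_replay() helper added above scales the replay window to the peer's presumed TSval clock. As a worked example, assuming HZ = 1000 and icsk_rto = 200 ms (200 jiffies): with usec timestamps the window is 200 * (1000000 / 1000) = 200000 TSval ticks, and otherwise 200 * 1200 / 1000 = 240 ticks, i.e. the check now tolerates peers clocking TSval at up to 1200 Hz instead of the previous hard-coded icsk_rto * 1024 / HZ scaling.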
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 86cc6d36f818..4bac6e319aca 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -696,7 +696,7 @@ static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
(aoh->rnext_keyid << 8) | keyid);
- arg->iov[0].iov_len += round_up(tcp_ao_len(key), 4);
+ arg->iov[0].iov_len += tcp_ao_len_aligned(key);
reply->doff = arg->iov[0].iov_len / 4;
if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
@@ -984,7 +984,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
(tcp_ao_len(key->ao_key) << 16) |
(key->ao_key->sndid << 8) |
key->rcv_next);
- arg.iov[0].iov_len += round_up(tcp_ao_len(key->ao_key), 4);
+ arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
rep.th.doff = arg.iov[0].iov_len / 4;
tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a9807eeb311c..9e85f2a0bddd 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -615,7 +615,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
ao_key = treq->af_specific->ao_lookup(sk, req,
tcp_rsk(req)->ao_keyid, -1);
if (ao_key)
- newtp->tcp_header_len += tcp_ao_len(ao_key);
+ newtp->tcp_header_len += tcp_ao_len_aligned(ao_key);
#endif
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index eb13a55d660c..e3167ad96567 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -825,7 +825,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
if (tcp_key_is_ao(key)) {
opts->options |= OPTION_AO;
- remaining -= tcp_ao_len(key->ao_key);
+ remaining -= tcp_ao_len_aligned(key->ao_key);
}
}
@@ -915,7 +915,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
ireq->tstamp_ok &= !ireq->sack_ok;
} else if (tcp_key_is_ao(key)) {
opts->options |= OPTION_AO;
- remaining -= tcp_ao_len(key->ao_key);
+ remaining -= tcp_ao_len_aligned(key->ao_key);
ireq->tstamp_ok &= !ireq->sack_ok;
}
@@ -982,7 +982,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
size += TCPOLEN_MD5SIG_ALIGNED;
} else if (tcp_key_is_ao(key)) {
opts->options |= OPTION_AO;
- size += tcp_ao_len(key->ao_key);
+ size += tcp_ao_len_aligned(key->ao_key);
}
if (likely(tp->rx_opt.tstamp_ok)) {
@@ -3293,7 +3293,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (skb_still_in_host_queue(sk, skb))
return -EBUSY;
+start:
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+ TCP_SKB_CB(skb)->seq++;
+ goto start;
+ }
if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
WARN_ON_ONCE(1);
return -EINVAL;
@@ -3720,7 +3726,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
if (tcp_rsk_used_ao(req)) {
#ifdef CONFIG_TCP_AO
struct tcp_ao_key *ao_key = NULL;
- u8 maclen = tcp_rsk(req)->maclen;
u8 keyid = tcp_rsk(req)->ao_keyid;
ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
@@ -3730,13 +3735,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
* for another peer-matching key, but the peer has requested
* ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
*/
- if (unlikely(!ao_key || tcp_ao_maclen(ao_key) != maclen)) {
- u8 key_maclen = ao_key ? tcp_ao_maclen(ao_key) : 0;
-
+ if (unlikely(!ao_key)) {
rcu_read_unlock();
kfree_skb(skb);
- net_warn_ratelimited("TCP-AO: the keyid %u with maclen %u|%u from SYN packet is not present - not sending SYNACK\n",
- keyid, maclen, key_maclen);
+ net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n",
+ keyid);
return NULL;
}
key.ao_key = ao_key;