summary refs log tree commit diff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/af_inet.c 30
-rw-r--r-- net/ipv4/datagram.c 1
-rw-r--r-- net/ipv4/fib_notifier.c 1
-rw-r--r-- net/ipv4/fib_semantics.c 16
-rw-r--r-- net/ipv4/icmp.c 23
-rw-r--r-- net/ipv4/inet_connection_sock.c 4
-rw-r--r-- net/ipv4/inet_diag.c 2
-rw-r--r-- net/ipv4/inet_hashtables.c 6
-rw-r--r-- net/ipv4/ip_gre.c 2
-rw-r--r-- net/ipv4/ip_sockglue.c 11
-rw-r--r-- net/ipv4/ip_tunnel.c 2
-rw-r--r-- net/ipv4/ip_vti.c 2
-rw-r--r-- net/ipv4/ipconfig.c 12
-rw-r--r-- net/ipv4/ipip.c 2
-rw-r--r-- net/ipv4/netfilter/arp_tables.c 7
-rw-r--r-- net/ipv4/netfilter/arptable_filter.c 10
-rw-r--r-- net/ipv4/netfilter/ip_tables.c 7
-rw-r--r-- net/ipv4/netfilter/iptable_filter.c 9
-rw-r--r-- net/ipv4/netfilter/iptable_mangle.c 8
-rw-r--r-- net/ipv4/netfilter/iptable_nat.c 15
-rw-r--r-- net/ipv4/netfilter/iptable_raw.c 12
-rw-r--r-- net/ipv4/netfilter/iptable_security.c 9
-rw-r--r-- net/ipv4/netfilter/nf_defrag_ipv4.c 30
-rw-r--r-- net/ipv4/nexthop.c 21
-rw-r--r-- net/ipv4/proc.c 2
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c 12
-rw-r--r-- net/ipv4/tcp.c 131
-rw-r--r-- net/ipv4/tcp_bpf.c 27
-rw-r--r-- net/ipv4/tcp_input.c 37
-rw-r--r-- net/ipv4/tcp_ipv4.c 76
-rw-r--r-- net/ipv4/tcp_nv.c 1
-rw-r--r-- net/ipv4/tcp_output.c 37
-rw-r--r-- net/ipv4/tcp_rate.c 6
-rw-r--r-- net/ipv4/udp.c 16
-rw-r--r-- net/ipv4/udp_bpf.c 1
35 files changed, 279 insertions, 309 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1d816a5fd3eb..0189e3cd4a7d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -133,13 +133,9 @@ void inet_sock_destruct(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
__skb_queue_purge(&sk->sk_receive_queue);
- if (sk->sk_rx_skb_cache) {
- __kfree_skb(sk->sk_rx_skb_cache);
- sk->sk_rx_skb_cache = NULL;
- }
__skb_queue_purge(&sk->sk_error_queue);
- sk_mem_reclaim(sk);
+ sk_mem_reclaim_final(sk);
if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
pr_err("Attempt to release TCP socket in state %d %p\n",
@@ -154,7 +150,7 @@ void inet_sock_destruct(struct sock *sk)
WARN_ON(atomic_read(&sk->sk_rmem_alloc));
WARN_ON(refcount_read(&sk->sk_wmem_alloc));
WARN_ON(sk->sk_wmem_queued);
- WARN_ON(sk->sk_forward_alloc);
+ WARN_ON(sk_forward_alloc_get(sk));
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
@@ -773,26 +769,28 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
sin->sin_family = AF_INET;
+ lock_sock(sk);
if (peer) {
if (!inet->inet_dport ||
(((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
- peer == 1))
+ peer == 1)) {
+ release_sock(sk);
return -ENOTCONN;
+ }
sin->sin_port = inet->inet_dport;
sin->sin_addr.s_addr = inet->inet_daddr;
- BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
- CGROUP_INET4_GETPEERNAME,
- NULL);
+ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+ CGROUP_INET4_GETPEERNAME);
} else {
__be32 addr = inet->inet_rcv_saddr;
if (!addr)
addr = inet->inet_saddr;
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
- BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
- CGROUP_INET4_GETSOCKNAME,
- NULL);
+ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+ CGROUP_INET4_GETSOCKNAME);
}
+ release_sock(sk);
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
return sizeof(*sin);
}
@@ -1666,12 +1664,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
-u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt)
-{
- return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt);
-}
-EXPORT_SYMBOL_GPL(snmp_get_cpu_field);
-
unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
unsigned long res = 0;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 4a8550c49202..48f337ccf949 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -9,7 +9,6 @@
#include <linux/types.h>
#include <linux/module.h>
-#include <linux/ip.h>
#include <linux/in.h>
#include <net/ip.h>
#include <net/sock.h>
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
index 0c28bd469a68..0e23ade74493 100644
--- a/net/ipv4/fib_notifier.c
+++ b/net/ipv4/fib_notifier.c
@@ -6,7 +6,6 @@
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/fib_notifier.h>
-#include <net/netns/ipv4.h>
#include <net/ip_fib.h>
int call_fib4_notifier(struct notifier_block *nb,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b42c429cebbe..3364cb9c67e0 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1661,7 +1661,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info);
#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
- int nh_weight, u8 rt_family)
+ int nh_weight, u8 rt_family, u32 nh_tclassid)
{
const struct net_device *dev = nhc->nhc_dev;
struct rtnexthop *rtnh;
@@ -1679,6 +1679,9 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
rtnh->rtnh_flags = flags;
+ if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid))
+ goto nla_put_failure;
+
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
@@ -1706,14 +1709,13 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
}
for_nexthops(fi) {
- if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
- AF_INET) < 0)
- goto nla_put_failure;
+ u32 nh_tclassid = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (nh->nh_tclassid &&
- nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
- goto nla_put_failure;
+ nh_tclassid = nh->nh_tclassid;
#endif
+ if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
+ AF_INET, nh_tclassid) < 0)
+ goto nla_put_failure;
} endfor_nexthops(fi);
mp_end:
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 8b30cadff708..b7e277d8a84d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1054,14 +1054,19 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio);
if (!ext_hdr || !iio)
goto send_mal_query;
- if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr))
+ if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) ||
+ ntohs(iio->extobj_hdr.length) > sizeof(_iio))
goto send_mal_query;
ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr);
+ iio = skb_header_pointer(skb, sizeof(_ext_hdr),
+ sizeof(iio->extobj_hdr) + ident_len, &_iio);
+ if (!iio)
+ goto send_mal_query;
+
status = 0;
dev = NULL;
switch (iio->extobj_hdr.class_type) {
case ICMP_EXT_ECHO_CTYPE_NAME:
- iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio);
if (ident_len >= IFNAMSIZ)
goto send_mal_query;
memset(buff, 0, sizeof(buff));
@@ -1069,30 +1074,24 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
dev = dev_get_by_name(net, buff);
break;
case ICMP_EXT_ECHO_CTYPE_INDEX:
- iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) +
- sizeof(iio->ident.ifindex), &_iio);
if (ident_len != sizeof(iio->ident.ifindex))
goto send_mal_query;
dev = dev_get_by_index(net, ntohl(iio->ident.ifindex));
break;
case ICMP_EXT_ECHO_CTYPE_ADDR:
- if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) +
+ if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) ||
+ ident_len != sizeof(iio->ident.addr.ctype3_hdr) +
iio->ident.addr.ctype3_hdr.addrlen)
goto send_mal_query;
switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) {
case ICMP_AFI_IP:
- iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) +
- sizeof(struct in_addr), &_iio);
- if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) +
- sizeof(struct in_addr))
+ if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr))
goto send_mal_query;
dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr);
break;
#if IS_ENABLED(CONFIG_IPV6)
case ICMP_AFI_IP6:
- iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio);
- if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) +
- sizeof(struct in6_addr))
+ if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr))
goto send_mal_query;
dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev);
dev_hold(dev);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f25d02ad4a8a..f7fea3a7c5e6 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1015,7 +1015,7 @@ void inet_csk_destroy_sock(struct sock *sk)
sk_refcnt_debug_release(sk);
- percpu_counter_dec(sk->sk_prot->orphan_count);
+ this_cpu_dec(*sk->sk_prot->orphan_count);
sock_put(sk);
}
@@ -1074,7 +1074,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
sock_orphan(child);
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ this_cpu_inc(*sk->sk_prot->orphan_count);
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ef7897226f08..c8fa6e7f7d12 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -271,7 +271,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct inet_diag_meminfo minfo = {
.idiag_rmem = sk_rmem_alloc_get(sk),
.idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
- .idiag_fmem = sk->sk_forward_alloc,
+ .idiag_fmem = sk_forward_alloc_get(sk),
.idiag_tmem = sk_wmem_alloc_get(sk),
};
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 80aeaf9e6e16..75737267746f 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -242,8 +242,10 @@ static inline int compute_score(struct sock *sk, struct net *net,
if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
return -1;
+ score = sk->sk_bound_dev_if ? 2 : 1;
- score = sk->sk_family == PF_INET ? 2 : 1;
+ if (sk->sk_family == PF_INET)
+ score++;
if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
}
@@ -596,7 +598,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
if (ok) {
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
} else {
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ this_cpu_inc(*sk->sk_prot->orphan_count);
inet_sk_set_state(sk, TCP_CLOSE);
sock_set_flag(sk, SOCK_DEAD);
inet_csk_destroy_sock(sk);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0fe6c936dc54..2ac2b95c5694 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -986,7 +986,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
__gre_tunnel_init(dev);
- memcpy(dev->dev_addr, &iph->saddr, 4);
+ __dev_addr_set(dev, &iph->saddr, 4);
memcpy(dev->broadcast, &iph->daddr, 4);
dev->flags = IFF_NOARP;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b297bb28556e..38d29b175ca6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -886,6 +886,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
return ip_mc_leave_group(sk, &mreq);
}
+DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);
+
static int do_ip_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -1352,7 +1354,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
goto e_inval;
if (val < 0 || val > 255)
goto e_inval;
- inet->min_ttl = val;
+
+ if (val)
+ static_branch_enable(&ip4_min_ttl);
+
+ /* tcp_v4_err() and tcp_v4_rcv() might read min_ttl
+ * while we are changing it.
+ */
+ WRITE_ONCE(inet->min_ttl, val);
break;
default:
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index fe9101d3d69e..5a473319d3a5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -834,7 +834,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
t->parms.i_key = p->i_key;
t->parms.o_key = p->o_key;
if (dev->type != ARPHRD_ETHER) {
- memcpy(dev->dev_addr, &p->iph.saddr, 4);
+ __dev_addr_set(dev, &p->iph.saddr, 4);
memcpy(dev->broadcast, &p->iph.daddr, 4);
}
ip_tunnel_add(itn, t);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index efe25a0172e6..8c2bd1d9ddce 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -425,7 +425,7 @@ static int vti_tunnel_init(struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
- memcpy(dev->dev_addr, &iph->saddr, 4);
+ __dev_addr_set(dev, &iph->saddr, 4);
memcpy(dev->broadcast, &iph->daddr, 4);
dev->flags = IFF_NOARP;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 816d8aad5a68..9d41d5d5cd1e 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -262,6 +262,11 @@ static int __init ic_open_devs(void)
dev->name, able, d->xid);
}
}
+ /* Devices with a complex topology like SFP ethernet interfaces need
+ * the rtnl_lock at init. The carrier wait-loop must therefore run
+ * without holding it.
+ */
+ rtnl_unlock();
/* no point in waiting if we could not bring up at least one device */
if (!ic_first_dev)
@@ -274,9 +279,13 @@ static int __init ic_open_devs(void)
msecs_to_jiffies(carrier_timeout * 1000))) {
int wait, elapsed;
+ rtnl_lock();
for_each_netdev(&init_net, dev)
- if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
+ if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) {
+ rtnl_unlock();
goto have_carrier;
+ }
+ rtnl_unlock();
msleep(1);
@@ -289,7 +298,6 @@ static int __init ic_open_devs(void)
next_msg = jiffies + msecs_to_jiffies(20000);
}
have_carrier:
- rtnl_unlock();
*last = NULL;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 3aa78ccbec3e..123ea63a04cb 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -380,7 +380,7 @@ static int ipip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+ __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4);
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
tunnel->tun_hlen = 0;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c53f14b94356..ffc0cab7cf18 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -179,10 +179,11 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
return (void *)entry + entry->next_offset;
}
-unsigned int arpt_do_table(struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct xt_table *table)
+unsigned int arpt_do_table(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ const struct xt_table *table = priv;
unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
unsigned int verdict = NF_DROP;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 3de78416ec76..78cd5ee24448 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -26,14 +26,6 @@ static const struct xt_table packet_filter = {
.priority = NF_IP_PRI_FILTER,
};
-/* The work comes in here from netfilter.c */
-static unsigned int
-arptable_filter_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return arpt_do_table(skb, state, priv);
-}
-
static struct nf_hook_ops *arpfilter_ops __read_mostly;
static int arptable_filter_table_init(struct net *net)
@@ -72,7 +64,7 @@ static int __init arptable_filter_init(void)
if (ret < 0)
return ret;
- arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook);
+ arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table);
if (IS_ERR(arpfilter_ops)) {
xt_unregister_template(&packet_filter);
return PTR_ERR(arpfilter_ops);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 13acb687c19a..2ed7c58b471a 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -222,10 +222,11 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
-ipt_do_table(struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct xt_table *table)
+ipt_do_table(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ const struct xt_table *table = priv;
unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 0eb0e2ab9bfc..b9062f4552ac 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -28,13 +28,6 @@ static const struct xt_table packet_filter = {
.priority = NF_IP_PRI_FILTER,
};
-static unsigned int
-iptable_filter_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ipt_do_table(skb, state, priv);
-}
-
static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
@@ -90,7 +83,7 @@ static int __init iptable_filter_init(void)
if (ret < 0)
return ret;
- filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
+ filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table);
if (IS_ERR(filter_ops)) {
xt_unregister_template(&packet_filter);
return PTR_ERR(filter_ops);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 40417a3f930b..3abb430af9e6 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -34,7 +34,7 @@ static const struct xt_table packet_mangler = {
};
static unsigned int
-ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv)
+ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
unsigned int ret;
const struct iphdr *iph;
@@ -50,7 +50,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *pri
daddr = iph->daddr;
tos = iph->tos;
- ret = ipt_do_table(skb, state, priv);
+ ret = ipt_do_table(priv, skb, state);
/* Reroute for ANY change. */
if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
@@ -75,8 +75,8 @@ iptable_mangle_hook(void *priv,
const struct nf_hook_state *state)
{
if (state->hook == NF_INET_LOCAL_OUT)
- return ipt_mangle_out(skb, state, priv);
- return ipt_do_table(skb, state, priv);
+ return ipt_mangle_out(priv, skb, state);
+ return ipt_do_table(priv, skb, state);
}
static struct nf_hook_ops *mangle_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 45d7e072e6a5..56f6ecc43451 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -29,34 +29,27 @@ static const struct xt_table nf_nat_ipv4_table = {
.af = NFPROTO_IPV4,
};
-static unsigned int iptable_nat_do_chain(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ipt_do_table(skb, state, priv);
-}
-
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
{
- .hook = iptable_nat_do_chain,
+ .hook = ipt_do_table,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
{
- .hook = iptable_nat_do_chain,
+ .hook = ipt_do_table,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
{
- .hook = iptable_nat_do_chain,
+ .hook = ipt_do_table,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
{
- .hook = iptable_nat_do_chain,
+ .hook = ipt_do_table,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index b88e0f36cd05..ca5e5b21587c 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -32,17 +32,9 @@ static const struct xt_table packet_raw_before_defrag = {
.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-iptable_raw_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ipt_do_table(skb, state, priv);
-}
-
static struct nf_hook_ops *rawtable_ops __read_mostly;
-static int __net_init iptable_raw_table_init(struct net *net)
+static int iptable_raw_table_init(struct net *net)
{
struct ipt_replace *repl;
const struct xt_table *table = &packet_raw;
@@ -90,7 +82,7 @@ static int __init iptable_raw_init(void)
if (ret < 0)
return ret;
- rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook);
+ rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table);
if (IS_ERR(rawtable_ops)) {
xt_unregister_template(table);
return PTR_ERR(rawtable_ops);
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index f519162a2fa5..d885443cb267 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -33,13 +33,6 @@ static const struct xt_table security_table = {
.priority = NF_IP_PRI_SECURITY,
};
-static unsigned int
-iptable_security_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ipt_do_table(skb, state, priv);
-}
-
static struct nf_hook_ops *sectbl_ops __read_mostly;
static int iptable_security_table_init(struct net *net)
@@ -78,7 +71,7 @@ static int __init iptable_security_init(void)
if (ret < 0)
return ret;
- sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook);
+ sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table);
if (IS_ERR(sectbl_ops)) {
xt_unregister_template(&security_table);
return PTR_ERR(sectbl_ops);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 613432a36f0a..e61ea428ea18 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -20,13 +20,8 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
-static unsigned int defrag4_pernet_id __read_mostly;
static DEFINE_MUTEX(defrag4_mutex);
-struct defrag4_pernet {
- unsigned int users;
-};
-
static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
u_int32_t user)
{
@@ -111,19 +106,15 @@ static const struct nf_hook_ops ipv4_defrag_ops[] = {
static void __net_exit defrag4_net_exit(struct net *net)
{
- struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id);
-
- if (nf_defrag->users) {
+ if (net->nf.defrag_ipv4_users) {
nf_unregister_net_hooks(net, ipv4_defrag_ops,
ARRAY_SIZE(ipv4_defrag_ops));
- nf_defrag->users = 0;
+ net->nf.defrag_ipv4_users = 0;
}
}
static struct pernet_operations defrag4_net_ops = {
.exit = defrag4_net_exit,
- .id = &defrag4_pernet_id,
- .size = sizeof(struct defrag4_pernet),
};
static int __init nf_defrag_init(void)
@@ -138,24 +129,23 @@ static void __exit nf_defrag_fini(void)
int nf_defrag_ipv4_enable(struct net *net)
{
- struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id);
int err = 0;
mutex_lock(&defrag4_mutex);
- if (nf_defrag->users == UINT_MAX) {
+ if (net->nf.defrag_ipv4_users == UINT_MAX) {
err = -EOVERFLOW;
goto out_unlock;
}
- if (nf_defrag->users) {
- nf_defrag->users++;
+ if (net->nf.defrag_ipv4_users) {
+ net->nf.defrag_ipv4_users++;
goto out_unlock;
}
err = nf_register_net_hooks(net, ipv4_defrag_ops,
ARRAY_SIZE(ipv4_defrag_ops));
if (err == 0)
- nf_defrag->users = 1;
+ net->nf.defrag_ipv4_users = 1;
out_unlock:
mutex_unlock(&defrag4_mutex);
@@ -165,12 +155,10 @@ EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
void nf_defrag_ipv4_disable(struct net *net)
{
- struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id);
-
mutex_lock(&defrag4_mutex);
- if (nf_defrag->users) {
- nf_defrag->users--;
- if (nf_defrag->users == 0)
+ if (net->nf.defrag_ipv4_users) {
+ net->nf.defrag_ipv4_users--;
+ if (net->nf.defrag_ipv4_users == 0)
nf_unregister_net_hooks(net, ipv4_defrag_ops,
ARRAY_SIZE(ipv4_defrag_ops));
}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 75ca4b6e484f..9e8100728d46 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1982,6 +1982,8 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old,
rcu_assign_pointer(old->nh_grp, newg);
if (newg->resilient) {
+ /* Make sure concurrent readers are not using 'oldg' anymore. */
+ synchronize_net();
rcu_assign_pointer(oldg->res_table, tmp_table);
rcu_assign_pointer(oldg->spare->res_table, tmp_table);
}
@@ -3565,6 +3567,7 @@ static struct notifier_block nh_netdev_notifier = {
};
static int nexthops_dump(struct net *net, struct notifier_block *nb,
+ enum nexthop_event_type event_type,
struct netlink_ext_ack *extack)
{
struct rb_root *root = &net->nexthop.rb_root;
@@ -3575,8 +3578,7 @@ static int nexthops_dump(struct net *net, struct notifier_block *nb,
struct nexthop *nh;
nh = rb_entry(node, struct nexthop, rb_node);
- err = call_nexthop_notifier(nb, net, NEXTHOP_EVENT_REPLACE, nh,
- extack);
+ err = call_nexthop_notifier(nb, net, event_type, nh, extack);
if (err)
break;
}
@@ -3590,7 +3592,7 @@ int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
int err;
rtnl_lock();
- err = nexthops_dump(net, nb, extack);
+ err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
if (err)
goto unlock;
err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
@@ -3603,8 +3605,17 @@ EXPORT_SYMBOL(register_nexthop_notifier);
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
{
- return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
- nb);
+ int err;
+
+ rtnl_lock();
+ err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
+ nb);
+ if (err)
+ goto unlock;
+ nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);
+unlock:
+ rtnl_unlock();
+ return err;
}
EXPORT_SYMBOL(unregister_nexthop_notifier);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b0d3a09dc84e..f30273afb539 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -53,7 +53,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
struct net *net = seq->private;
int orphans, sockets;
- orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+ orphans = tcp_orphan_count_sum();
sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
socket_seq_show(seq);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4680268f2e59..97eb54774924 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -585,18 +585,6 @@ static struct ctl_table ipv4_table[] = {
.extra1 = &sysctl_fib_sync_mem_min,
.extra2 = &sysctl_fib_sync_mem_max,
},
- {
- .procname = "tcp_rx_skb_cache",
- .data = &tcp_rx_skb_cache_key.key,
- .mode = 0644,
- .proc_handler = proc_do_static_key,
- },
- {
- .procname = "tcp_tx_skb_cache",
- .data = &tcp_tx_skb_cache_key.key,
- .mode = 0644,
- .proc_handler = proc_do_static_key,
- },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..a7b1138d619c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -287,8 +287,8 @@ enum {
TCP_CMSG_TS = 2
};
-struct percpu_counter tcp_orphan_count;
-EXPORT_SYMBOL_GPL(tcp_orphan_count);
+DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);
@@ -325,11 +325,6 @@ struct tcp_splice_state {
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);
-DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
-EXPORT_SYMBOL(tcp_rx_skb_cache_key);
-
-DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
-
void tcp_enter_memory_pressure(struct sock *sk)
{
unsigned long val;
@@ -486,10 +481,7 @@ static bool tcp_stream_is_readable(struct sock *sk, int target)
{
if (tcp_epollin_ready(sk, target))
return true;
-
- if (sk->sk_prot->stream_memory_read)
- return sk->sk_prot->stream_memory_read(sk);
- return false;
+ return sk_is_readable(sk);
}
/*
@@ -647,7 +639,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
EXPORT_SYMBOL(tcp_ioctl);
-static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
tp->pushed_seq = tp->write_seq;
@@ -658,15 +650,13 @@ static inline bool forced_push(const struct tcp_sock *tp)
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
-static void skb_entail(struct sock *sk, struct sk_buff *skb)
+void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- skb->csum = 0;
tcb->seq = tcb->end_seq = tp->write_seq;
tcb->tcp_flags = TCPHDR_ACK;
- tcb->sacked = 0;
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
sk_wmem_queued_add(sk, skb->truesize);
@@ -861,30 +851,15 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_SYMBOL(tcp_splice_read);
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
- bool force_schedule)
+struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+ bool force_schedule)
{
struct sk_buff *skb;
- if (likely(!size)) {
- skb = sk->sk_tx_skb_cache;
- if (skb) {
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
- sk->sk_tx_skb_cache = NULL;
- pskb_trim(skb, 0);
- INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
- skb_shinfo(skb)->tx_flags = 0;
- memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
- return skb;
- }
- }
- /* The TCP header must be at least 32-bit aligned. */
- size = ALIGN(size, 4);
-
if (unlikely(tcp_under_memory_pressure(sk)))
sk_mem_reclaim_partial(sk);
- skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+ skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
if (likely(skb)) {
bool mem_scheduled;
@@ -895,12 +870,8 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
}
if (likely(mem_scheduled)) {
- skb_reserve(skb, sk->sk_prot->max_header);
- /*
- * Make sure that we have exactly size bytes
- * available to the caller, no more, no less.
- */
- skb->reserved_tailroom = skb->end - skb->tail - size;
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->ip_summed = CHECKSUM_PARTIAL;
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
return skb;
}
@@ -953,9 +924,11 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
* importantly be able to generate EPOLLOUT for Edge Trigger epoll()
* users.
*/
-void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_remove_empty_skb(struct sock *sk)
{
- if (skb && !skb->len) {
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+
+ if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
tcp_unlink_write_queue(skb, sk);
if (tcp_write_queue_empty(sk))
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
@@ -963,8 +936,8 @@ void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
}
}
-struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
- struct page *page, int offset, size_t *size)
+static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+ struct page *page, int offset, size_t *size)
{
struct sk_buff *skb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -977,15 +950,15 @@ new_segment:
if (!sk_stream_memory_free(sk))
return NULL;
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- tcp_rtx_and_write_queues_empty(sk));
+ skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ tcp_rtx_and_write_queues_empty(sk));
if (!skb)
return NULL;
#ifdef CONFIG_TLS_DEVICE
skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
#endif
- skb_entail(sk, skb);
+ tcp_skb_entail(sk, skb);
copy = size_goal;
}
@@ -1016,7 +989,6 @@ new_segment:
skb->truesize += copy;
sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
- skb->ip_summed = CHECKSUM_PARTIAL;
WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0);
@@ -1107,7 +1079,7 @@ out:
return copied;
do_error:
- tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
+ tcp_remove_empty_skb(sk);
if (copied)
goto out;
out_err:
@@ -1306,15 +1278,14 @@ new_segment:
goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- first_skb);
+ skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ first_skb);
if (!skb)
goto wait_for_space;
process_backlog++;
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb_entail(sk, skb);
+ tcp_skb_entail(sk, skb);
copy = size_goal;
/* All packets are restored as if they have
@@ -1329,14 +1300,7 @@ new_segment:
if (copy > msg_data_left(msg))
copy = msg_data_left(msg);
- /* Where to copy to? */
- if (skb_availroom(skb) > 0 && !zc) {
- /* We have some space in skb head. Superb! */
- copy = min_t(int, copy, skb_availroom(skb));
- err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
- if (err)
- goto do_fault;
- } else if (!zc) {
+ if (!zc) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
@@ -1435,9 +1399,7 @@ out_nopush:
return copied + copied_syn;
do_error:
- skb = tcp_write_queue_tail(sk);
-do_fault:
- tcp_remove_empty_skb(sk, skb);
+ tcp_remove_empty_skb(sk);
if (copied + copied_syn)
goto out;
@@ -2690,11 +2652,36 @@ void tcp_shutdown(struct sock *sk, int how)
}
EXPORT_SYMBOL(tcp_shutdown);
+int tcp_orphan_count_sum(void)
+{
+ int i, total = 0;
+
+ for_each_possible_cpu(i)
+ total += per_cpu(tcp_orphan_count, i);
+
+ return max(total, 0);
+}
+
+static int tcp_orphan_cache;
+static struct timer_list tcp_orphan_timer;
+#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
+
+static void tcp_orphan_update(struct timer_list *unused)
+{
+ WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
+ mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+}
+
+static bool tcp_too_many_orphans(int shift)
+{
+ return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
+}
+
bool tcp_check_oom(struct sock *sk, int shift)
{
bool too_many_orphans, out_of_socket_memory;
- too_many_orphans = tcp_too_many_orphans(sk, shift);
+ too_many_orphans = tcp_too_many_orphans(shift);
out_of_socket_memory = tcp_out_of_memory(sk);
if (too_many_orphans)
@@ -2803,7 +2790,7 @@ adjudge_to_death:
/* remove backlog if any, without releasing ownership. */
__release_sock(sk);
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ this_cpu_inc(tcp_orphan_count);
/* Have we already been destroyed by a softirq or backlog? */
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -2920,11 +2907,6 @@ void tcp_write_queue_purge(struct sock *sk)
sk_wmem_free_skb(sk, skb);
}
tcp_rtx_queue_purge(sk);
- skb = sk->sk_tx_skb_cache;
- if (skb) {
- __kfree_skb(skb);
- sk->sk_tx_skb_cache = NULL;
- }
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2961,10 +2943,6 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
- if (sk->sk_rx_skb_cache) {
- __kfree_skb(sk->sk_rx_skb_cache);
- sk->sk_rx_skb_cache = NULL;
- }
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->urg_data = 0;
tcp_write_queue_purge(sk);
@@ -4505,7 +4483,10 @@ void __init tcp_init(void)
sizeof_field(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
- percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+
+ timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
+ mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+
inet_hashinfo_init(&tcp_hashinfo);
inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
thash_entries, 21, /* one slot per 2 MB*/
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index d3e9386b493e..5f4d6f45d87f 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -150,19 +150,6 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
#ifdef CONFIG_BPF_SYSCALL
-static bool tcp_bpf_stream_read(const struct sock *sk)
-{
- struct sk_psock *psock;
- bool empty = true;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (likely(psock))
- empty = list_empty(&psock->ingress_msg);
- rcu_read_unlock();
- return !empty;
-}
-
static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
long timeo)
{
@@ -232,6 +219,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
bool cork = false, enospc = sk_msg_full(msg);
struct sock *sk_redir;
u32 tosend, delta = 0;
+ u32 eval = __SK_NONE;
int ret;
more_data:
@@ -275,13 +263,24 @@ more_data:
case __SK_REDIRECT:
sk_redir = psock->sk_redir;
sk_msg_apply_bytes(psock, tosend);
+ if (!psock->apply_bytes) {
+ /* Clean up before releasing the sock lock. */
+ eval = psock->eval;
+ psock->eval = __SK_NONE;
+ psock->sk_redir = NULL;
+ }
if (psock->cork) {
cork = true;
psock->cork = NULL;
}
sk_msg_return(sk, msg, tosend);
release_sock(sk);
+
ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+
+ if (eval == __SK_REDIRECT)
+ sock_put(sk_redir);
+
lock_sock(sk);
if (unlikely(ret < 0)) {
int free = sk_msg_free_nocharge(sk, msg);
@@ -479,7 +478,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
prot[TCP_BPF_BASE].unhash = sock_map_unhash;
prot[TCP_BPF_BASE].close = sock_map_close;
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
- prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
+ prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 141e85e6422b..246ab7b5e857 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -500,8 +500,11 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
+ if (room <= 0)
+ return;
+
/* Check #1 */
- if (room > 0 && !tcp_under_memory_pressure(sk)) {
+ if (!tcp_under_memory_pressure(sk)) {
unsigned int truesize = truesize_adjust(adjust, skb);
int incr;
@@ -518,6 +521,11 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
tp->rcv_ssthresh += min(room, incr);
inet_csk(sk)->icsk_ack.quick |= 1;
}
+ } else {
+ /* Under pressure:
+ * Adjust rcv_ssthresh according to reserved mem
+ */
+ tcp_adjust_rcv_ssthresh(sk);
}
}
@@ -3221,7 +3229,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
long seq_rtt_us = -1L;
long ca_rtt_us = -1L;
u32 pkts_acked = 0;
- u32 last_in_flight = 0;
bool rtt_update;
int flag = 0;
@@ -3257,7 +3264,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (!first_ackt)
first_ackt = last_ackt;
- last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
if (before(start_seq, reord))
reord = start_seq;
if (!after(scb->end_seq, tp->high_seq))
@@ -3323,8 +3329,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
- if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
- last_in_flight && !prior_sacked && fully_acked &&
+ if (pkts_acked == 1 && fully_acked && !prior_sacked &&
+ (tp->snd_una - prior_snd_una) < tp->mss_cache &&
sack->rate->prior_delivered + 1 == tp->delivered &&
!(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
/* Conservatively mark a delayed ACK. It's typically
@@ -3381,9 +3387,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (icsk->icsk_ca_ops->pkts_acked) {
struct ack_sample sample = { .pkts_acked = pkts_acked,
- .rtt_us = sack->rate->rtt_us,
- .in_flight = last_in_flight };
+ .rtt_us = sack->rate->rtt_us };
+ sample.in_flight = tp->mss_cache *
+ (tp->delivered - sack->rate->prior_delivered);
icsk->icsk_ca_ops->pkts_acked(sk, &sample);
}
@@ -5346,7 +5353,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
else if (tcp_under_memory_pressure(sk))
- tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+ tcp_adjust_rcv_ssthresh(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
@@ -5381,7 +5388,7 @@ static int tcp_prune_queue(struct sock *sk)
return -1;
}
-static bool tcp_should_expand_sndbuf(const struct sock *sk)
+static bool tcp_should_expand_sndbuf(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -5392,8 +5399,18 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return false;
/* If we are under global TCP memory pressure, do not expand. */
- if (tcp_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk)) {
+ int unused_mem = sk_unused_reserved_mem(sk);
+
+ /* Adjust sndbuf according to reserved mem. But make sure
+ * it never goes below SOCK_MIN_SNDBUF.
+ * See sk_stream_moderate_sndbuf() for more details.
+ */
+ if (unused_mem > SOCK_MIN_SNDBUF)
+ WRITE_ONCE(sk->sk_sndbuf, unused_mem);
+
return false;
+ }
/* If we are under soft global TCP memory pressure, do not expand. */
if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2e62e0d6373a..13d868c43284 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -508,9 +508,12 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
if (sk->sk_state == TCP_CLOSE)
goto out;
- if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto out;
+ if (static_branch_unlikely(&ip4_min_ttl)) {
+ /* min_ttl can be changed concurrently from do_ip_setsockopt() */
+ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
}
tp = tcp_sk(sk);
@@ -1037,6 +1040,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
+static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
+{
+ if (!old)
+ return true;
+
+ /* l3index always overrides non-l3index */
+ if (old->l3index && new->l3index == 0)
+ return false;
+ if (old->l3index == 0 && new->l3index)
+ return true;
+
+ return old->prefixlen < new->prefixlen;
+}
+
/* Find the Key structure for an address. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
const union tcp_md5_addr *addr,
@@ -1059,7 +1076,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
- if (key->l3index && key->l3index != l3index)
+ if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
continue;
if (family == AF_INET) {
mask = inet_make_mask(key->prefixlen);
@@ -1074,8 +1091,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
match = false;
}
- if (match && (!best_match ||
- key->prefixlen > best_match->prefixlen))
+ if (match && better_md5_match(best_match, key))
best_match = key;
}
return best_match;
@@ -1085,7 +1101,7 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
int family, u8 prefixlen,
- int l3index)
+ int l3index, u8 flags)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -1105,7 +1121,9 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
- if (key->l3index && key->l3index != l3index)
+ if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
+ continue;
+ if (key->l3index != l3index)
continue;
if (!memcmp(&key->addr, addr, size) &&
key->prefixlen == prefixlen)
@@ -1129,7 +1147,7 @@ EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
- int family, u8 prefixlen, int l3index,
+ int family, u8 prefixlen, int l3index, u8 flags,
const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
/* Add Key to the list */
@@ -1137,7 +1155,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *md5sig;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
if (key) {
/* Pre-existing entry - just update that one.
* Note that the key might be used concurrently.
@@ -1182,6 +1200,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
key->family = family;
key->prefixlen = prefixlen;
key->l3index = l3index;
+ key->flags = flags;
memcpy(&key->addr, addr,
(family == AF_INET6) ? sizeof(struct in6_addr) :
sizeof(struct in_addr));
@@ -1191,11 +1210,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
- u8 prefixlen, int l3index)
+ u8 prefixlen, int l3index, u8 flags)
{
struct tcp_md5sig_key *key;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
if (!key)
return -ENOENT;
hlist_del_rcu(&key->node);
@@ -1229,6 +1248,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
const union tcp_md5_addr *addr;
u8 prefixlen = 32;
int l3index = 0;
+ u8 flags;
if (optlen < sizeof(cmd))
return -EINVAL;
@@ -1239,6 +1259,8 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
if (sin->sin_family != AF_INET)
return -EINVAL;
+ flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+
if (optname == TCP_MD5SIG_EXT &&
cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
prefixlen = cmd.tcpm_prefixlen;
@@ -1246,7 +1268,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
return -EINVAL;
}
- if (optname == TCP_MD5SIG_EXT &&
+ if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
struct net_device *dev;
@@ -1267,12 +1289,12 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
if (!cmd.tcpm_keylen)
- return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
+ return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
- return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
+ return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
@@ -1596,7 +1618,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
* memory, then we end up not copying the key
* across. Shucks.
*/
- tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
+ tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
key->key, key->keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
@@ -1684,7 +1706,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
- if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+ if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
!INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
dst, 0)) {
dst_release(dst);
@@ -1769,7 +1791,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst &&
- inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
@@ -1941,7 +1963,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
- struct sk_buff *skb_to_free;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
const struct iphdr *iph;
@@ -2050,9 +2071,13 @@ process:
return 0;
}
}
- if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto discard_and_relse;
+
+ if (static_branch_unlikely(&ip4_min_ttl)) {
+ /* min_ttl can be changed concurrently from do_ip_setsockopt() */
+ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+ }
}
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
@@ -2082,17 +2107,12 @@ process:
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
- skb_to_free = sk->sk_rx_skb_cache;
- sk->sk_rx_skb_cache = NULL;
ret = tcp_v4_do_rcv(sk, skb);
} else {
if (tcp_add_backlog(sk, skb))
goto discard_and_relse;
- skb_to_free = NULL;
}
bh_unlock_sock(sk);
- if (skb_to_free)
- __kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
@@ -2182,7 +2202,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ sk->sk_rx_dst_ifindex = skb->skb_iif;
}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 95db7a11ba2a..ab552356bdba 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -25,7 +25,6 @@
* 1) Add mechanism to deal with reverse congestion.
*/
-#include <linux/mm.h>
#include <linux/module.h>
#include <linux/math64.h>
#include <net/tcp.h>
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6d72f3ea48c4..6867e5db3e35 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -394,7 +394,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
skb->ip_summed = CHECKSUM_PARTIAL;
TCP_SKB_CB(skb)->tcp_flags = flags;
- TCP_SKB_CB(skb)->sacked = 0;
tcp_skb_pcount_set(skb, 1);
@@ -1256,8 +1255,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (clone_it) {
- TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- - tp->snd_una;
oskb = skb;
tcp_skb_tsorted_save(oskb) {
@@ -1566,7 +1563,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
return -ENOMEM;
/* Get a new skb... force flag on. */
- buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
+ buff = tcp_stream_alloc_skb(sk, nsize, gfp, true);
if (!buff)
return -ENOMEM; /* We'll just try again later. */
skb_copy_decrypted(buff, skb);
@@ -1592,8 +1589,6 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
skb_split(skb, buff, len);
- buff->ip_summed = CHECKSUM_PARTIAL;
-
buff->tstamp = skb->tstamp;
tcp_fragment_tstamp(skb, buff);
@@ -1678,7 +1673,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
delta_truesize = __pskb_trim_head(skb, len);
TCP_SKB_CB(skb)->seq += len;
- skb->ip_summed = CHECKSUM_PARTIAL;
if (delta_truesize) {
skb->truesize -= delta_truesize;
@@ -2123,7 +2117,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
skb, len, mss_now, gfp);
- buff = sk_stream_alloc_skb(sk, 0, gfp, true);
+ buff = tcp_stream_alloc_skb(sk, 0, gfp, true);
if (unlikely(!buff))
return -ENOMEM;
skb_copy_decrypted(buff, skb);
@@ -2144,12 +2138,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
- /* This packet was never sent out yet, so no SACK bits. */
- TCP_SKB_CB(buff)->sacked = 0;
-
tcp_skb_fragment_eor(skb, buff);
- buff->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
tcp_fragment_tstamp(skb, buff);
@@ -2390,7 +2380,7 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
/* We're allowed to probe. Build it now. */
- nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
+ nskb = tcp_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
if (!nskb)
return -1;
sk_wmem_queued_add(sk, nskb->truesize);
@@ -2403,9 +2393,6 @@ static int tcp_mtu_probe(struct sock *sk)
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
- TCP_SKB_CB(nskb)->sacked = 0;
- nskb->csum = 0;
- nskb->ip_summed = CHECKSUM_PARTIAL;
tcp_insert_write_queue_before(nskb, skb, sk);
tcp_highest_sack_replace(sk, skb, nskb);
@@ -2969,8 +2956,7 @@ u32 __tcp_select_window(struct sock *sk)
icsk->icsk_ack.quick = 0;
if (tcp_under_memory_pressure(sk))
- tp->rcv_ssthresh = min(tp->rcv_ssthresh,
- 4U * tp->advmss);
+ tcp_adjust_rcv_ssthresh(sk);
/* free_space might become our new window, make sure we don't
* increase it due to wscale.
@@ -3048,13 +3034,9 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
- if (next_skb_size) {
- if (next_skb_size <= skb_availroom(skb))
- skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
- next_skb_size);
- else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
- return false;
- }
+ if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size))
+ return false;
+
tcp_highest_sack_replace(sk, next_skb, skb);
/* Update sequence range on original skb. */
@@ -3757,10 +3739,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
/* limit to order-0 allocations */
space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
- syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
+ syn_data = tcp_stream_alloc_skb(sk, space, sk->sk_allocation, false);
if (!syn_data)
goto fallback;
- syn_data->ip_summed = CHECKSUM_PARTIAL;
memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
if (space) {
int copied = copy_from_iter(skb_put(syn_data, space), space,
@@ -3838,7 +3819,7 @@ int tcp_connect(struct sock *sk)
return 0;
}
- buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
+ buff = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
if (unlikely(!buff))
return -ENOBUFS;
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index 0de693565963..fbab921670cc 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -65,6 +65,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
}
@@ -86,6 +87,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
if (!rs->prior_delivered ||
after(scb->tx.delivered, rs->prior_delivered)) {
+ rs->prior_delivered_ce = scb->tx.delivered_ce;
rs->prior_delivered = scb->tx.delivered;
rs->prior_mstamp = scb->tx.delivered_mstamp;
rs->is_app_limited = scb->tx.is_app_limited;
@@ -138,6 +140,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
}
rs->delivered = tp->delivered - rs->prior_delivered;
+ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
+ /* delivered_ce occupies less than 32 bits in the skb control block */
+ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
+
/* Model sending data and receiving ACKs as separate pipeline phases
* for a window. Usually the ACK phase is longer, but with ACK
* compression the send phase can be longer. To be safe we use the
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8851c9463b4b..2fffcf2b54f3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -390,7 +390,8 @@ static int compute_score(struct sock *sk, struct net *net,
dif, sdif);
if (!dev_match)
return -1;
- score += 4;
+ if (sk->sk_bound_dev_if)
+ score += 4;
if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
@@ -1053,7 +1054,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
__be16 dport;
u8 tos;
int err, is_udplite = IS_UDPLITE(sk);
- int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
struct sk_buff *skb;
struct ip_options_data opt_copy;
@@ -1361,7 +1362,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
}
up->len += size;
- if (!(up->corkflag || (flags&MSG_MORE)))
+ if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE)))
ret = udp_push_pending_frames(sk);
if (!ret)
ret = size;
@@ -2662,9 +2663,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
switch (optname) {
case UDP_CORK:
if (val != 0) {
- up->corkflag = 1;
+ WRITE_ONCE(up->corkflag, 1);
} else {
- up->corkflag = 0;
+ WRITE_ONCE(up->corkflag, 0);
lock_sock(sk);
push_pending_frames(sk);
release_sock(sk);
@@ -2787,7 +2788,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
switch (optname) {
case UDP_CORK:
- val = up->corkflag;
+ val = READ_ONCE(up->corkflag);
break;
case UDP_ENCAP:
@@ -2866,6 +2867,9 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
!(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
mask &= ~(EPOLLIN | EPOLLRDNORM);
+ /* psock ingress_msg queue should not contain any bad checksum frames */
+ if (sk_is_readable(sk))
+ mask |= EPOLLIN | EPOLLRDNORM;
return mask;
}
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 7a1d5f473878..bbe6569c9ad3 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -114,6 +114,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
*prot = *base;
prot->close = sock_map_close;
prot->recvmsg = udp_bpf_recvmsg;
+ prot->sock_is_readable = sk_msg_is_readable;
}
static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)