Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Makefile                            |   2
-rw-r--r--  net/ipv4/af_inet.c                           |   8
-rw-r--r--  net/ipv4/arp.c                               |   6
-rw-r--r--  net/ipv4/devinet.c                           | 117
-rw-r--r--  net/ipv4/esp4.c                              | 373
-rw-r--r--  net/ipv4/esp4_offload.c                      | 231
-rw-r--r--  net/ipv4/fib_frontend.c                      |  12
-rw-r--r--  net/ipv4/fib_notifier.c                      |  86
-rw-r--r--  net/ipv4/fib_rules.c                         |  55
-rw-r--r--  net/ipv4/fib_semantics.c                     |  11
-rw-r--r--  net/ipv4/fib_trie.c                          | 108
-rw-r--r--  net/ipv4/icmp.c                              |  19
-rw-r--r--  net/ipv4/ip_gre.c                            |  24
-rw-r--r--  net/ipv4/ip_input.c                          |   5
-rw-r--r--  net/ipv4/ip_sockglue.c                       |  12
-rw-r--r--  net/ipv4/ip_tunnel.c                         |  27
-rw-r--r--  net/ipv4/ip_tunnel_core.c                    |   5
-rw-r--r--  net/ipv4/ip_vti.c                            |  20
-rw-r--r--  net/ipv4/ipconfig.c                          |   1
-rw-r--r--  net/ipv4/ipip.c                              |  24
-rw-r--r--  net/ipv4/ipmr.c                              |  23
-rw-r--r--  net/ipv4/netfilter/arp_tables.c              |  23
-rw-r--r--  net/ipv4/netfilter/ip_tables.c               |  20
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c           |  19
-rw-r--r--  net/ipv4/netfilter/ipt_SYNPROXY.c            |  94
-rw-r--r--  net/ipv4/netfilter/nf_dup_ipv4.c             |   3
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c     |   8
-rw-r--r--  net/ipv4/netfilter/nf_nat_masquerade_ipv4.c  |   5
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c             |  45
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c       |  27
-rw-r--r--  net/ipv4/netfilter/nf_reject_ipv4.c          |   3
-rw-r--r--  net/ipv4/netfilter/nf_socket_ipv4.c          |   2
-rw-r--r--  net/ipv4/netfilter/nft_fib_ipv4.c            |   6
-rw-r--r--  net/ipv4/proc.c                              |   2
-rw-r--r--  net/ipv4/protocol.c                          |   2
-rw-r--r--  net/ipv4/route.c                             | 123
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                   | 107
-rw-r--r--  net/ipv4/tcp.c                               |   9
-rw-r--r--  net/ipv4/tcp_cubic.c                         |   2
-rw-r--r--  net/ipv4/tcp_fastopen.c                      | 102
-rw-r--r--  net/ipv4/tcp_input.c                         | 157
-rw-r--r--  net/ipv4/tcp_ipv4.c                          |  46
-rw-r--r--  net/ipv4/tcp_lp.c                            |   6
-rw-r--r--  net/ipv4/tcp_metrics.c                       | 147
-rw-r--r--  net/ipv4/tcp_minisocks.c                     |  26
-rw-r--r--  net/ipv4/tcp_output.c                        |  20
-rw-r--r--  net/ipv4/tcp_rate.c                          |   7
-rw-r--r--  net/ipv4/tcp_recovery.c                      |  19
-rw-r--r--  net/ipv4/tcp_timer.c                         |   7
-rw-r--r--  net/ipv4/tcp_westwood.c                      |   4
-rw-r--r--  net/ipv4/xfrm4_mode_transport.c              |  34
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c                 |  28
-rw-r--r--  net/ipv4/xfrm4_output.c                      |   3
53 files changed, 1365 insertions(+), 910 deletions(-)
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c6d4238ff94a..f83de23a30e7 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_rate.o tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
- fib_frontend.o fib_semantics.o fib_trie.o \
+ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 13a9a3297eae..f3dad1661343 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1602,8 +1602,9 @@ static const struct net_protocol igmp_protocol = {
};
#endif
-static const struct net_protocol tcp_protocol = {
+static struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
+ .early_demux_handler = tcp_v4_early_demux,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.no_policy = 1,
@@ -1611,8 +1612,9 @@ static const struct net_protocol tcp_protocol = {
.icmp_strict_tag_validation = 1,
};
-static const struct net_protocol udp_protocol = {
+static struct net_protocol udp_protocol = {
.early_demux = udp_v4_early_demux,
+ .early_demux_handler = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
@@ -1723,6 +1725,8 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
+ net->ipv4.sysctl_udp_early_demux = 1;
+ net->ipv4.sysctl_tcp_early_demux = 1;
#ifdef CONFIG_SYSCTL
net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
#endif
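
The const is dropped from tcp_protocol/udp_protocol because the new per-protocol
sysctls need to rewrite .early_demux at runtime; .early_demux_handler preserves
the permanent pointer. A hedged sketch of the toggling side (helper name and
placement are assumptions, not part of this hunk):

    /* hedged sketch, assuming a helper in sysctl_net_ipv4.c */
    static void proc_configure_early_demux(int enabled, int protocol)
    {
            struct net_protocol *ipprot;

            rcu_read_lock();
            ipprot = rcu_dereference(inet_protos[protocol]);
            if (ipprot)
                    /* ip_rcv_finish() reads this pointer with READ_ONCE() */
                    ipprot->early_demux = enabled ? ipprot->early_demux_handler
                                                  : NULL;
            rcu_read_unlock();
    }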
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 51b27ae09fbd..0937b34c27ca 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -872,7 +872,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
skb->pkt_type != PACKET_HOST)
state = NUD_STALE;
neigh_update(n, sha, state,
- override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0);
neigh_release(n);
}
@@ -1033,7 +1033,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
r->arp_ha.sa_data : NULL, state,
NEIGH_UPDATE_F_OVERRIDE |
- NEIGH_UPDATE_F_ADMIN);
+ NEIGH_UPDATE_F_ADMIN, 0);
neigh_release(neigh);
}
return err;
@@ -1084,7 +1084,7 @@ static int arp_invalidate(struct net_device *dev, __be32 ip)
if (neigh->nud_state & ~NUD_NOARP)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
- NEIGH_UPDATE_F_ADMIN);
+ NEIGH_UPDATE_F_ADMIN, 0);
neigh_release(neigh);
}
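
All three neigh_update() call sites gain a trailing 0: the function grew a
final netlink-PID parameter so neighbour notifications can carry the
requesting socket's portid. Assumed updated prototype (defined outside this
diff):

    int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
                     u32 flags, u32 nlmsg_pid);

ARP-internal updates pass 0, meaning the change was not requested over netlink.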
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index cebedd545e5e..df14815a3b8c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -571,7 +571,8 @@ static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
return ret;
}
-static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFA_MAX+1];
@@ -582,7 +583,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
ASSERT_RTNL();
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy,
+ extack);
if (err < 0)
goto errout;
@@ -752,7 +754,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
struct in_device *in_dev;
int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy,
+ NULL);
if (err < 0)
goto errout;
@@ -843,7 +846,8 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
return NULL;
}
-static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct in_ifaddr *ifa;
@@ -1192,6 +1196,18 @@ out:
return done;
}
+static __be32 in_dev_select_addr(const struct in_device *in_dev,
+ int scope)
+{
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope != RT_SCOPE_LINK &&
+ ifa->ifa_scope <= scope)
+ return ifa->ifa_local;
+ } endfor_ifa(in_dev);
+
+ return 0;
+}
+
__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
{
__be32 addr = 0;
@@ -1228,13 +1244,9 @@ no_in_dev:
if (master_idx &&
(dev = dev_get_by_index_rcu(net, master_idx)) &&
(in_dev = __in_dev_get_rcu(dev))) {
- for_primary_ifa(in_dev) {
- if (ifa->ifa_scope != RT_SCOPE_LINK &&
- ifa->ifa_scope <= scope) {
- addr = ifa->ifa_local;
- goto out_unlock;
- }
- } endfor_ifa(in_dev);
+ addr = in_dev_select_addr(in_dev, scope);
+ if (addr)
+ goto out_unlock;
}
/* Not loopback addresses on loopback should be preferred
@@ -1249,13 +1261,9 @@ no_in_dev:
if (!in_dev)
continue;
- for_primary_ifa(in_dev) {
- if (ifa->ifa_scope != RT_SCOPE_LINK &&
- ifa->ifa_scope <= scope) {
- addr = ifa->ifa_local;
- goto out_unlock;
- }
- } endfor_ifa(in_dev);
+ addr = in_dev_select_addr(in_dev, scope);
+ if (addr)
+ goto out_unlock;
}
out_unlock:
rcu_read_unlock();
@@ -1713,7 +1721,7 @@ static int inet_validate_link_af(const struct net_device *dev,
if (dev && !__in_dev_get_rtnl(dev))
return -EAFNOSUPPORT;
- err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
+ err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL);
if (err < 0)
return err;
@@ -1741,7 +1749,7 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
if (!in_dev)
return -EAFNOSUPPORT;
- if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
+ if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0)
BUG();
if (tb[IFLA_INET_CONF]) {
@@ -1798,6 +1806,9 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
goto nla_put_failure;
+ if (!devconf)
+ goto out;
+
if ((all || type == NETCONFA_FORWARDING) &&
nla_put_s32(skb, NETCONFA_FORWARDING,
IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
@@ -1819,6 +1830,7 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
goto nla_put_failure;
+out:
nlmsg_end(skb, nlh);
return 0;
@@ -1827,8 +1839,8 @@ nla_put_failure:
return -EMSGSIZE;
}
-void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
- struct ipv4_devconf *devconf)
+void inet_netconf_notify_devconf(struct net *net, int event, int type,
+ int ifindex, struct ipv4_devconf *devconf)
{
struct sk_buff *skb;
int err = -ENOBUFS;
@@ -1838,7 +1850,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
goto errout;
err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
- RTM_NEWNETCONF, 0, type);
+ event, 0, type);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
WARN_ON(err == -EMSGSIZE);
@@ -1861,7 +1873,8 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
};
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
- struct nlmsghdr *nlh)
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[NETCONFA_MAX+1];
@@ -1874,7 +1887,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
int err;
err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
- devconf_ipv4_policy);
+ devconf_ipv4_policy, extack);
if (err < 0)
goto errout;
@@ -2017,10 +2030,12 @@ static void inet_forward_change(struct net *net)
IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
- inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
- inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
NETCONFA_IFINDEX_DEFAULT,
net->ipv4.devconf_dflt);
@@ -2033,7 +2048,8 @@ static void inet_forward_change(struct net *net)
in_dev = __in_dev_get_rtnl(dev);
if (in_dev) {
IN_DEV_CONF_SET(in_dev, FORWARDING, on);
- inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
dev->ifindex, &in_dev->cnf);
}
}
@@ -2078,19 +2094,22 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
new_value != old_value) {
ifindex = devinet_conf_ifindex(net, cnf);
- inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_RP_FILTER,
ifindex, cnf);
}
if (i == IPV4_DEVCONF_PROXY_ARP - 1 &&
new_value != old_value) {
ifindex = devinet_conf_ifindex(net, cnf);
- inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_PROXY_NEIGH,
ifindex, cnf);
}
if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 &&
new_value != old_value) {
ifindex = devinet_conf_ifindex(net, cnf);
- inet_netconf_notify_devconf(net, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
ifindex, cnf);
}
}
@@ -2125,7 +2144,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
container_of(cnf, struct in_device, cnf);
if (*valp)
dev_disable_lro(idev->dev);
- inet_netconf_notify_devconf(net,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_FORWARDING,
idev->dev->ifindex,
cnf);
@@ -2133,7 +2152,8 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
rtnl_unlock();
rt_cache_flush(net);
} else
- inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORWARDING,
NETCONFA_IFINDEX_DEFAULT,
net->ipv4.devconf_dflt);
}
@@ -2255,7 +2275,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
p->sysctl = t;
- inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p);
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL,
+ ifindex, p);
return 0;
free:
@@ -2264,16 +2285,18 @@ out:
return -ENOBUFS;
}
-static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
+static void __devinet_sysctl_unregister(struct net *net,
+ struct ipv4_devconf *cnf, int ifindex)
{
struct devinet_sysctl_table *t = cnf->sysctl;
- if (!t)
- return;
+ if (t) {
+ cnf->sysctl = NULL;
+ unregister_net_sysctl_table(t->sysctl_header);
+ kfree(t);
+ }
- cnf->sysctl = NULL;
- unregister_net_sysctl_table(t->sysctl_header);
- kfree(t);
+ inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL);
}
static int devinet_sysctl_register(struct in_device *idev)
@@ -2295,7 +2318,9 @@ static int devinet_sysctl_register(struct in_device *idev)
static void devinet_sysctl_unregister(struct in_device *idev)
{
- __devinet_sysctl_unregister(&idev->cnf);
+ struct net *net = dev_net(idev->dev);
+
+ __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex);
neigh_sysctl_unregister(idev->arp_parms);
}
@@ -2370,9 +2395,9 @@ static __net_init int devinet_init_net(struct net *net)
#ifdef CONFIG_SYSCTL
err_reg_ctl:
- __devinet_sysctl_unregister(dflt);
+ __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT);
err_reg_dflt:
- __devinet_sysctl_unregister(all);
+ __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
err_reg_all:
if (tbl != ctl_forward_entry)
kfree(tbl);
@@ -2394,8 +2419,10 @@ static __net_exit void devinet_exit_net(struct net *net)
tbl = net->ipv4.forw_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.forw_hdr);
- __devinet_sysctl_unregister(net->ipv4.devconf_dflt);
- __devinet_sysctl_unregister(net->ipv4.devconf_all);
+ __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt,
+ NETCONFA_IFINDEX_DEFAULT);
+ __devinet_sysctl_unregister(net, net->ipv4.devconf_all,
+ NETCONFA_IFINDEX_ALL);
kfree(tbl);
#endif
kfree(net->ipv4.devconf_dflt);
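
inet_netconf_notify_devconf() now takes the message type explicitly, and the
sysctl teardown path uses that to announce the disappearance of a
configuration. Two representative calls after this change, condensed from the
hunks above:

    /* attribute-carrying notification, as before */
    inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING,
                                dev->ifindex, &in_dev->cnf);

    /* new: deletion notification; devconf is NULL, so
     * inet_netconf_fill_devconf() emits only NETCONFA_IFINDEX */
    inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, dev->ifindex, NULL);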
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b1e24446e297..65cc02bd82bc 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -152,21 +152,28 @@ static void esp_output_restore_header(struct sk_buff *skb)
}
static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
+ struct xfrm_state *x,
struct ip_esp_hdr *esph,
struct esp_output_extra *extra)
{
- struct xfrm_state *x = skb_dst(skb)->xfrm;
-
/* For ESN we move the header forward by 4 bytes to
* accommodate the high bits. We will move it back after
* encryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
+ __u32 seqhi;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ if (xo)
+ seqhi = xo->seq.hi;
+ else
+ seqhi = XFRM_SKB_CB(skb)->seq.output.hi;
+
extra->esphoff = (unsigned char *)esph -
skb_transport_header(skb);
esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
extra->seqhi = esph->spi;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ esph->seq_no = htonl(seqhi);
}
esph->spi = x->id.spi;
@@ -198,98 +205,56 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
tail[plen - 1] = proto;
}
-static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
- struct esp_output_extra *extra;
- int err = -ENOMEM;
- struct ip_esp_hdr *esph;
- struct crypto_aead *aead;
- struct aead_request *req;
- struct scatterlist *sg, *dsg;
- struct sk_buff *trailer;
- struct page *page;
- void *tmp;
- u8 *iv;
- u8 *tail;
- u8 *vaddr;
- int blksize;
- int clen;
- int alen;
- int plen;
- int ivlen;
- int tfclen;
- int nfrags;
- int assoclen;
- int extralen;
- int tailen;
- __be64 seqno;
- __u8 proto = *skb_mac_header(skb);
-
- /* skb is pure payload to encrypt */
-
- aead = x->data;
- alen = crypto_aead_authsize(aead);
- ivlen = crypto_aead_ivsize(aead);
-
- tfclen = 0;
- if (x->tfcpad) {
- struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
- u32 padto;
-
- padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
- if (skb->len < padto)
- tfclen = padto - skb->len;
+ int encap_type;
+ struct udphdr *uh;
+ __be32 *udpdata32;
+ __be16 sport, dport;
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct ip_esp_hdr *esph = esp->esph;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
+
+ uh = (struct udphdr *)esph;
+ uh->source = sport;
+ uh->dest = dport;
+ uh->len = htons(skb->len + esp->tailen
+ - skb_transport_offset(skb));
+ uh->check = 0;
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ esph = (struct ip_esp_hdr *)(uh + 1);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ udpdata32 = (__be32 *)(uh + 1);
+ udpdata32[0] = udpdata32[1] = 0;
+ esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+ break;
}
- blksize = ALIGN(crypto_aead_blocksize(aead), 4);
- clen = ALIGN(skb->len + 2 + tfclen, blksize);
- plen = clen - skb->len - tfclen;
- tailen = tfclen + plen + alen;
- assoclen = sizeof(*esph);
- extralen = 0;
- if (x->props.flags & XFRM_STATE_ESN) {
- extralen += sizeof(*extra);
- assoclen += sizeof(__be32);
- }
+ *skb_mac_header(skb) = IPPROTO_UDP;
+ esp->esph = esph;
+}
- *skb_mac_header(skb) = IPPROTO_ESP;
- esph = ip_esp_hdr(skb);
+int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+ u8 *tail;
+ u8 *vaddr;
+ int nfrags;
+ struct page *page;
+ struct sk_buff *trailer;
+ int tailen = esp->tailen;
/* this is non-NULL only with UDP Encapsulation */
- if (x->encap) {
- struct xfrm_encap_tmpl *encap = x->encap;
- struct udphdr *uh;
- __be32 *udpdata32;
- __be16 sport, dport;
- int encap_type;
-
- spin_lock_bh(&x->lock);
- sport = encap->encap_sport;
- dport = encap->encap_dport;
- encap_type = encap->encap_type;
- spin_unlock_bh(&x->lock);
-
- uh = (struct udphdr *)esph;
- uh->source = sport;
- uh->dest = dport;
- uh->len = htons(skb->len + tailen
- - skb_transport_offset(skb));
- uh->check = 0;
-
- switch (encap_type) {
- default:
- case UDP_ENCAP_ESPINUDP:
- esph = (struct ip_esp_hdr *)(uh + 1);
- break;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- udpdata32 = (__be32 *)(uh + 1);
- udpdata32[0] = udpdata32[1] = 0;
- esph = (struct ip_esp_hdr *)(udpdata32 + 2);
- break;
- }
-
- *skb_mac_header(skb) = IPPROTO_UDP;
- }
+ if (x->encap)
+ esp_output_udp_encap(x, skb, esp);
if (!skb_cloned(skb)) {
if (tailen <= skb_availroom(skb)) {
@@ -304,6 +269,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
struct sock *sk = skb->sk;
struct page_frag *pfrag = &x->xfrag;
+ esp->inplace = false;
+
allocsize = ALIGN(tailen, L1_CACHE_BYTES);
spin_lock_bh(&x->lock);
@@ -320,10 +287,12 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
tail = vaddr + pfrag->offset;
- esp_output_fill_trailer(tail, tfclen, plen, proto);
+ esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
kunmap_atomic(vaddr);
+ spin_unlock_bh(&x->lock);
+
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
@@ -339,107 +308,113 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
if (sk)
atomic_add(tailen, &sk->sk_wmem_alloc);
- skb_push(skb, -skb_network_offset(skb));
-
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
- esph->spi = x->id.spi;
-
- tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
- if (!tmp) {
- spin_unlock_bh(&x->lock);
- err = -ENOMEM;
- goto error;
- }
-
- extra = esp_tmp_extra(tmp);
- iv = esp_tmp_iv(aead, tmp, extralen);
- req = esp_tmp_req(aead, iv);
- sg = esp_req_sg(aead, req);
- dsg = &sg[nfrags];
-
- esph = esp_output_set_extra(skb, esph, extra);
-
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg,
- (unsigned char *)esph - skb->data,
- assoclen + ivlen + clen + alen);
-
- allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
-
- if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
- spin_unlock_bh(&x->lock);
- err = -ENOMEM;
- goto error;
- }
-
- skb_shinfo(skb)->nr_frags = 1;
-
- page = pfrag->page;
- get_page(page);
- /* replace page frags in skb with new page */
- __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
- pfrag->offset = pfrag->offset + allocsize;
-
- sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
- skb_to_sgvec(skb, dsg,
- (unsigned char *)esph - skb->data,
- assoclen + ivlen + clen + alen);
-
- spin_unlock_bh(&x->lock);
-
- goto skip_cow2;
+ goto out;
}
}
cow:
- err = skb_cow_data(skb, tailen, &trailer);
- if (err < 0)
- goto error;
- nfrags = err;
+ nfrags = skb_cow_data(skb, tailen, &trailer);
+ if (nfrags < 0)
+ goto out;
tail = skb_tail_pointer(trailer);
- esph = ip_esp_hdr(skb);
+ esp->esph = ip_esp_hdr(skb);
skip_cow:
- esp_output_fill_trailer(tail, tfclen, plen, proto);
+ esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
+ pskb_put(skb, trailer, tailen);
- pskb_put(skb, trailer, clen - skb->len + alen);
- skb_push(skb, -skb_network_offset(skb));
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
- esph->spi = x->id.spi;
+out:
+ return nfrags;
+}
+EXPORT_SYMBOL_GPL(esp_output_head);
- tmp = esp_alloc_tmp(aead, nfrags, extralen);
- if (!tmp) {
- err = -ENOMEM;
- goto error;
+int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+ u8 *iv;
+ int alen;
+ void *tmp;
+ int ivlen;
+ int assoclen;
+ int extralen;
+ struct page *page;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct aead_request *req;
+ struct scatterlist *sg, *dsg;
+ struct esp_output_extra *extra;
+ int err = -ENOMEM;
+
+ assoclen = sizeof(struct ip_esp_hdr);
+ extralen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ extralen += sizeof(*extra);
+ assoclen += sizeof(__be32);
}
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+ ivlen = crypto_aead_ivsize(aead);
+
+ tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
+ if (!tmp)
+ goto error;
+
extra = esp_tmp_extra(tmp);
iv = esp_tmp_iv(aead, tmp, extralen);
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
- dsg = sg;
- esph = esp_output_set_extra(skb, esph, extra);
+ if (esp->inplace)
+ dsg = sg;
+ else
+ dsg = &sg[esp->nfrags];
- sg_init_table(sg, nfrags);
+ esph = esp_output_set_extra(skb, x, esp->esph, extra);
+ esp->esph = esph;
+
+ sg_init_table(sg, esp->nfrags);
skb_to_sgvec(skb, sg,
(unsigned char *)esph - skb->data,
- assoclen + ivlen + clen + alen);
+ assoclen + ivlen + esp->clen + alen);
+
+ if (!esp->inplace) {
+ int allocsize;
+ struct page_frag *pfrag = &x->xfrag;
+
+ allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+ spin_lock_bh(&x->lock);
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ goto error;
+ }
+
+ skb_shinfo(skb)->nr_frags = 1;
+
+ page = pfrag->page;
+ get_page(page);
+ /* replace page frags in skb with new page */
+ __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+ pfrag->offset = pfrag->offset + allocsize;
+ spin_unlock_bh(&x->lock);
+
+ sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+ skb_to_sgvec(skb, dsg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + esp->clen + alen);
+ }
-skip_cow2:
if ((x->props.flags & XFRM_STATE_ESN))
aead_request_set_callback(req, 0, esp_output_done_esn, skb);
else
aead_request_set_callback(req, 0, esp_output_done, skb);
- aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
+ aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv);
aead_request_set_ad(req, assoclen);
- seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
- ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
-
memset(iv, 0, ivlen);
- memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8),
+ memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8),
min(ivlen, 8));
ESP_SKB_CB(skb)->tmp = tmp;
@@ -465,11 +440,63 @@ skip_cow2:
error:
return err;
}
+EXPORT_SYMBOL_GPL(esp_output_tail);
-static int esp_input_done2(struct sk_buff *skb, int err)
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int alen;
+ int blksize;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct esp_info esp;
+
+ esp.inplace = true;
+
+ esp.proto = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
+
+ /* skb is pure payload to encrypt */
+
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+
+ esp.tfclen = 0;
+ if (x->tfcpad) {
+ struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+ u32 padto;
+
+ padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+ if (skb->len < padto)
+ esp.tfclen = padto - skb->len;
+ }
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
+ esp.plen = esp.clen - skb->len - esp.tfclen;
+ esp.tailen = esp.tfclen + esp.plen + alen;
+
+ esp.esph = ip_esp_hdr(skb);
+
+ esp.nfrags = esp_output_head(x, skb, &esp);
+ if (esp.nfrags < 0)
+ return esp.nfrags;
+
+ esph = esp.esph;
+ esph->spi = x->id.spi;
+
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ return esp_output_tail(x, skb, &esp);
+}
+
+int esp_input_done2(struct sk_buff *skb, int err)
{
const struct iphdr *iph;
struct xfrm_state *x = xfrm_input_state(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
struct crypto_aead *aead = x->data;
int alen = crypto_aead_authsize(aead);
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
@@ -478,7 +505,8 @@ static int esp_input_done2(struct sk_buff *skb, int err)
u8 nexthdr[2];
int padlen;
- kfree(ESP_SKB_CB(skb)->tmp);
+ if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
+ kfree(ESP_SKB_CB(skb)->tmp);
if (unlikely(err))
goto out;
@@ -549,6 +577,7 @@ static int esp_input_done2(struct sk_buff *skb, int err)
out:
return err;
}
+EXPORT_SYMBOL_GPL(esp_input_done2);
static void esp_input_done(struct crypto_async_request *base, int err)
{
@@ -751,13 +780,17 @@ static int esp_init_aead(struct xfrm_state *x)
char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
+ u32 mask = 0;
err = -ENAMETOOLONG;
if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
goto error;
- aead = crypto_alloc_aead(aead_name, 0, 0);
+ if (x->xso.offload_handle)
+ mask |= CRYPTO_ALG_ASYNC;
+
+ aead = crypto_alloc_aead(aead_name, 0, mask);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
@@ -787,6 +820,7 @@ static int esp_init_authenc(struct xfrm_state *x)
char authenc_name[CRYPTO_MAX_ALG_NAME];
unsigned int keylen;
int err;
+ u32 mask = 0;
err = -EINVAL;
if (!x->ealg)
@@ -812,7 +846,10 @@ static int esp_init_authenc(struct xfrm_state *x)
goto error;
}
- aead = crypto_alloc_aead(authenc_name, 0, 0);
+ if (x->xso.offload_handle)
+ mask |= CRYPTO_ALG_ASYNC;
+
+ aead = crypto_alloc_aead(authenc_name, 0, mask);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
@@ -931,7 +968,7 @@ static const struct xfrm_type esp_type =
.destructor = esp_destroy,
.get_mtu = esp4_get_mtu,
.input = esp_input,
- .output = esp_output
+ .output = esp_output,
};
static struct xfrm4_protocol esp4_protocol = {
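
esp_output() is split so the GSO path in esp4_offload.c can reuse the
head/tail halves; the state they share travels in struct esp_info. A sketch of
the fields this file touches (the authoritative definition lives in a header
outside this diff):

    struct esp_info {
            struct ip_esp_hdr *esph; /* where the ESP header sits */
            __be64 seqno;            /* full 64-bit sequence number */
            int tfclen;              /* TFC padding length */
            int tailen;              /* tfclen + plen + ICV length */
            int plen;                /* ESP trailer padding */
            int clen;                /* padded payload length */
            int nfrags;              /* result of esp_output_head() */
            u8 proto;                /* inner protocol for the trailer */
            bool inplace;            /* encrypt in place vs. new page frags */
    };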
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 1de442632406..e0666016a764 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -43,27 +43,31 @@ static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
goto out;
- err = secpath_set(skb);
- if (err)
- goto out;
+ xo = xfrm_offload(skb);
+ if (!xo || !(xo->flags & CRYPTO_DONE)) {
+ err = secpath_set(skb);
+ if (err)
+ goto out;
- if (skb->sp->len == XFRM_MAX_DEPTH)
- goto out;
+ if (skb->sp->len == XFRM_MAX_DEPTH)
+ goto out;
- x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
- (xfrm_address_t *)&ip_hdr(skb)->daddr,
- spi, IPPROTO_ESP, AF_INET);
- if (!x)
- goto out;
+ x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
+ (xfrm_address_t *)&ip_hdr(skb)->daddr,
+ spi, IPPROTO_ESP, AF_INET);
+ if (!x)
+ goto out;
- skb->sp->xvec[skb->sp->len++] = x;
- skb->sp->olen++;
+ skb->sp->xvec[skb->sp->len++] = x;
+ skb->sp->olen++;
- xo = xfrm_offload(skb);
- if (!xo) {
- xfrm_state_put(x);
- goto out;
+ xo = xfrm_offload(skb);
+ if (!xo) {
+ xfrm_state_put(x);
+ goto out;
+ }
}
+
xo->flags |= XFRM_GRO;
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
@@ -84,19 +88,214 @@ out:
return NULL;
}
+static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ip_esp_hdr *esph;
+ struct iphdr *iph = ip_hdr(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ int proto = iph->protocol;
+
+ skb_push(skb, -skb_network_offset(skb));
+ esph = ip_esp_hdr(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
+
+ esph->spi = x->id.spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ xo->proto = proto;
+}
+
+static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ __u32 seq;
+ int err = 0;
+ struct sk_buff *skb2;
+ struct xfrm_state *x;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ netdev_features_t esp_features = features;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ if (!xo)
+ goto out;
+
+ seq = xo->seq.low;
+
+ x = skb->sp->xvec[skb->sp->len - 1];
+ aead = x->data;
+ esph = ip_esp_hdr(skb);
+
+ if (esph->spi != x->id.spi)
+ goto out;
+
+ if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ goto out;
+
+ __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
+
+ skb->encap_hdr_csum = 1;
+
+ if (!(features & NETIF_F_HW_ESP))
+ esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
+
+ segs = x->outer_mode->gso_segment(x, skb, esp_features);
+ if (IS_ERR_OR_NULL(segs))
+ goto out;
+
+ __skb_pull(skb, skb->data - skb_mac_header(skb));
+
+ skb2 = segs;
+ do {
+ struct sk_buff *nskb = skb2->next;
+
+ xo = xfrm_offload(skb2);
+ xo->flags |= XFRM_GSO_SEGMENT;
+ xo->seq.low = seq;
+ xo->seq.hi = xfrm_replay_seqhi(x, seq);
+
+ if (!(features & NETIF_F_HW_ESP))
+ xo->flags |= CRYPTO_FALLBACK;
+
+ x->outer_mode->xmit(x, skb2);
+
+ err = x->type_offload->xmit(x, skb2, esp_features);
+ if (err) {
+ kfree_skb_list(segs);
+ return ERR_PTR(err);
+ }
+
+ if (!skb_is_gso(skb2))
+ seq++;
+ else
+ seq += skb_shinfo(skb2)->gso_segs;
+
+ skb_push(skb2, skb2->mac_len);
+ skb2 = nskb;
+ } while (skb2);
+
+out:
+ return segs;
+}
+
+static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct crypto_aead *aead = x->data;
+
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
+ return -EINVAL;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return esp_input_done2(skb, 0);
+}
+
+static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features)
+{
+ int err;
+ int alen;
+ int blksize;
+ struct xfrm_offload *xo;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct esp_info esp;
+ bool hw_offload = true;
+
+ esp.inplace = true;
+
+ xo = xfrm_offload(skb);
+
+ if (!xo)
+ return -EINVAL;
+
+ if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
+ (x->xso.dev != skb->dev)) {
+ xo->flags |= CRYPTO_FALLBACK;
+ hw_offload = false;
+ }
+
+ esp.proto = xo->proto;
+
+ /* skb is pure payload to encrypt */
+
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+
+ esp.tfclen = 0;
+ /* XXX: Add support for tfc padding here. */
+
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
+ esp.plen = esp.clen - skb->len - esp.tfclen;
+ esp.tailen = esp.tfclen + esp.plen + alen;
+
+ esp.esph = ip_esp_hdr(skb);
+
+
+ if (!hw_offload || (hw_offload && !skb_is_gso(skb))) {
+ esp.nfrags = esp_output_head(x, skb, &esp);
+ if (esp.nfrags < 0)
+ return esp.nfrags;
+ }
+
+ esph = esp.esph;
+ esph->spi = x->id.spi;
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ if (xo->flags & XFRM_GSO_SEGMENT) {
+ esph->seq_no = htonl(xo->seq.low);
+ } else {
+ ip_hdr(skb)->tot_len = htons(skb->len);
+ ip_send_check(ip_hdr(skb));
+ }
+
+ if (hw_offload)
+ return 0;
+
+ esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
+
+ err = esp_output_tail(x, skb, &esp);
+ if (err < 0)
+ return err;
+
+ secpath_reset(skb);
+
+ return 0;
+}
+
static const struct net_offload esp4_offload = {
.callbacks = {
.gro_receive = esp4_gro_receive,
+ .gso_segment = esp4_gso_segment,
},
};
+static const struct xfrm_type_offload esp_type_offload = {
+ .description = "ESP4 OFFLOAD",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .input_tail = esp_input_tail,
+ .xmit = esp_xmit,
+ .encap = esp4_gso_encap,
+};
+
static int __init esp4_offload_init(void)
{
+ if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type offload\n", __func__);
+ return -EAGAIN;
+ }
+
return inet_add_offload(&esp4_offload, IPPROTO_ESP);
}
static void __exit esp4_offload_exit(void)
{
+ if (xfrm_unregister_type_offload(&esp_type_offload, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type offload\n", __func__);
+
inet_del_offload(&esp4_offload, IPPROTO_ESP);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8f2133ffc2ff..39bd1edee676 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -632,7 +632,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
int err, remaining;
struct rtmsg *rtm;
- err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
+ err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy,
+ NULL);
if (err < 0)
goto errout;
@@ -709,7 +710,8 @@ errout:
return err;
}
-static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct fib_config cfg;
@@ -731,7 +733,8 @@ errout:
return err;
}
-static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct fib_config cfg;
@@ -1127,7 +1130,8 @@ static void fib_disable_ip(struct net_device *dev, unsigned long event,
{
if (fib_sync_down_dev(dev, event, force))
fib_flush(dev_net(dev));
- rt_cache_flush(dev_net(dev));
+ else
+ rt_cache_flush(dev_net(dev));
arp_ifdown(dev);
}
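
The extra struct netlink_ext_ack * argument on the rtnetlink doit handlers
lets them return a textual reason along with the errno. A hedged example of
how such a handler can use it (the message and condition are illustrative, not
taken from this hunk):

    if (!cfg.fc_dst_len && cfg.fc_dst) {
            NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length");
            return -EINVAL;
    }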
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
new file mode 100644
index 000000000000..e0714d975947
--- /dev/null
+++ b/net/ipv4/fib_notifier.c
@@ -0,0 +1,86 @@
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <net/net_namespace.h>
+#include <net/netns/ipv4.h>
+#include <net/ip_fib.h>
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
+
+int call_fib_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return nb->notifier_call(nb, event_type, info);
+}
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ net->ipv4.fib_seq++;
+ info->net = net;
+ return atomic_notifier_call_chain(&fib_chain, event_type, info);
+}
+
+static unsigned int fib_seq_sum(void)
+{
+ unsigned int fib_seq = 0;
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net)
+ fib_seq += net->ipv4.fib_seq;
+ rtnl_unlock();
+
+ return fib_seq;
+}
+
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
+{
+ atomic_notifier_chain_register(&fib_chain, nb);
+ if (fib_seq == fib_seq_sum())
+ return true;
+ atomic_notifier_chain_unregister(&fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb))
+{
+ int retries = 0;
+
+ do {
+ unsigned int fib_seq = fib_seq_sum();
+ struct net *net;
+
+ /* Mutex semantics guarantee that every change done to
+ * FIB tries before we read the change sequence counter
+ * is now visible to us.
+ */
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ fib_rules_notify(net, nb);
+ fib_notify(net, nb);
+ }
+ rcu_read_unlock();
+
+ if (fib_dump_is_consistent(nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
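
The registration dance above replays the whole FIB to a new listener and
retries when the sequence counter shows a concurrent change. A sketch of a
consumer (a switchdev-style driver) built from the API in this file; the my_*
names are hypothetical:

    static int my_fib_event(struct notifier_block *nb, unsigned long event,
                            void *ptr)
    {
            struct fib_notifier_info *info = ptr;

            switch (event) {
            case FIB_EVENT_ENTRY_ADD:
            case FIB_EVENT_ENTRY_DEL:
                    /* container_of(info, struct fib_entry_notifier_info, info) */
                    break;
            case FIB_EVENT_RULE_ADD:
            case FIB_EVENT_RULE_DEL:
                    /* custom rules usually force a fallback to software */
                    break;
            }
            return NOTIFY_DONE;
    }

    static struct notifier_block my_fib_nb = {
            .notifier_call = my_fib_event,
    };

    /* cb runs only if no consistent dump was obtained after the retries */
    err = register_fib_notifier(&my_fib_nb, my_fib_abort_cb);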
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 2e50062f642d..778ecf977eb2 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -47,6 +47,27 @@ struct fib4_rule {
#endif
};
+static bool fib4_rule_matchall(const struct fib_rule *rule)
+{
+ struct fib4_rule *r = container_of(rule, struct fib4_rule, common);
+
+ if (r->dst_len || r->src_len || r->tos)
+ return false;
+ return fib_rule_matchall(rule);
+}
+
+bool fib4_rule_default(const struct fib_rule *rule)
+{
+ if (!fib4_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL ||
+ rule->l3mdev)
+ return false;
+ if (rule->table != RT_TABLE_LOCAL && rule->table != RT_TABLE_MAIN &&
+ rule->table != RT_TABLE_DEFAULT)
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(fib4_rule_default);
+
int __fib_lookup(struct net *net, struct flowi4 *flp,
struct fib_result *res, unsigned int flags)
{
@@ -164,12 +185,36 @@ static struct fib_table *fib_empty_table(struct net *net)
return NULL;
}
+static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_rule *rule)
+{
+ struct fib_rule_notifier_info info = {
+ .rule = rule,
+ };
+
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
static int call_fib_rule_notifiers(struct net *net,
- enum fib_event_type event_type)
+ enum fib_event_type event_type,
+ struct fib_rule *rule)
+{
+ struct fib_rule_notifier_info info = {
+ .rule = rule,
+ };
+
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
+/* Called with rcu_read_lock() */
+void fib_rules_notify(struct net *net, struct notifier_block *nb)
{
- struct fib_notifier_info info;
+ struct fib_rules_ops *ops = net->ipv4.rules_ops;
+ struct fib_rule *rule;
- return call_fib_notifiers(net, event_type, &info);
+ list_for_each_entry_rcu(rule, &ops->rules_list, list)
+ call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule);
}
static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
@@ -228,7 +273,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule);
err = 0;
errout:
@@ -250,7 +295,7 @@ static int fib4_rule_delete(struct fib_rule *rule)
net->ipv4.fib_num_tclassid_users--;
#endif
net->ipv4.fib_has_custom_rules = true;
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule);
errout:
return err;
}
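
fib4_rule_default() gives offloading drivers a cheap test for "this rule is
one of the three standard table lookups"; anything else is policy routing that
hardware generally cannot reproduce. Hedged usage sketch, with a hypothetical
driver helper:

    /* in a FIB_EVENT_RULE_ADD handler of an offloading driver */
    if (!fib4_rule_default(rule))
            my_driver_abort_fib_offload(priv);   /* hypothetical */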
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 317026a39cfa..da449ddb8cc1 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-u32 fib_multipath_secret __read_mostly;
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
@@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
} endfor_nexthops(fi);
-
- net_get_random_once(&fib_multipath_secret,
- sizeof(fib_multipath_secret));
}
static inline void fib_add_weight(struct fib_info *fi,
@@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
#endif
void fib_select_path(struct net *net, struct fib_result *res,
- struct flowi4 *fl4, int mp_hash)
+ struct flowi4 *fl4, const struct sk_buff *skb)
{
bool oif_check;
@@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi->fib_nhs > 1 && oif_check) {
- if (mp_hash < 0)
- mp_hash = get_hash_from_flowi4(fl4) >> 1;
+ int h = fib_multipath_hash(res->fi, fl4, skb);
- fib_select_multipath(res, mp_hash);
+ fib_select_multipath(res, h);
}
else
#endif
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2f0d8233950f..1201409ba1dc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -84,43 +84,6 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
-static unsigned int fib_seq_sum(void)
-{
- unsigned int fib_seq = 0;
- struct net *net;
-
- rtnl_lock();
- for_each_net(net)
- fib_seq += net->ipv4.fib_seq;
- rtnl_unlock();
-
- return fib_seq;
-}
-
-static ATOMIC_NOTIFIER_HEAD(fib_chain);
-
-static int call_fib_notifier(struct notifier_block *nb, struct net *net,
- enum fib_event_type event_type,
- struct fib_notifier_info *info)
-{
- info->net = net;
- return nb->notifier_call(nb, event_type, info);
-}
-
-static void fib_rules_notify(struct net *net, struct notifier_block *nb,
- enum fib_event_type event_type)
-{
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- struct fib_notifier_info info;
-
- if (net->ipv4.fib_has_custom_rules)
- call_fib_notifier(nb, net, event_type, &info);
-#endif
-}
-
-static void fib_notify(struct net *net, struct notifier_block *nb,
- enum fib_event_type event_type);
-
static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_info *fi,
@@ -137,62 +100,6 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
return call_fib_notifier(nb, net, event_type, &info.info);
}
-static bool fib_dump_is_consistent(struct notifier_block *nb,
- void (*cb)(struct notifier_block *nb),
- unsigned int fib_seq)
-{
- atomic_notifier_chain_register(&fib_chain, nb);
- if (fib_seq == fib_seq_sum())
- return true;
- atomic_notifier_chain_unregister(&fib_chain, nb);
- if (cb)
- cb(nb);
- return false;
-}
-
-#define FIB_DUMP_MAX_RETRIES 5
-int register_fib_notifier(struct notifier_block *nb,
- void (*cb)(struct notifier_block *nb))
-{
- int retries = 0;
-
- do {
- unsigned int fib_seq = fib_seq_sum();
- struct net *net;
-
- /* Mutex semantics guarantee that every change done to
- * FIB tries before we read the change sequence counter
- * is now visible to us.
- */
- rcu_read_lock();
- for_each_net_rcu(net) {
- fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
- fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
- }
- rcu_read_unlock();
-
- if (fib_dump_is_consistent(nb, cb, fib_seq))
- return 0;
- } while (++retries < FIB_DUMP_MAX_RETRIES);
-
- return -EBUSY;
-}
-EXPORT_SYMBOL(register_fib_notifier);
-
-int unregister_fib_notifier(struct notifier_block *nb)
-{
- return atomic_notifier_chain_unregister(&fib_chain, nb);
-}
-EXPORT_SYMBOL(unregister_fib_notifier);
-
-int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
- struct fib_notifier_info *info)
-{
- net->ipv4.fib_seq++;
- info->net = net;
- return atomic_notifier_call_chain(&fib_chain, event_type, info);
-}
-
static int call_fib_entry_notifiers(struct net *net,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_info *fi,
@@ -1995,8 +1902,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
}
static void fib_leaf_notify(struct net *net, struct key_vector *l,
- struct fib_table *tb, struct notifier_block *nb,
- enum fib_event_type event_type)
+ struct fib_table *tb, struct notifier_block *nb)
{
struct fib_alias *fa;
@@ -2012,22 +1918,21 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
if (tb->tb_id != fa->tb_id)
continue;
- call_fib_entry_notifier(nb, net, event_type, l->key,
+ call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key,
KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
fa->fa_type, fa->tb_id);
}
}
static void fib_table_notify(struct net *net, struct fib_table *tb,
- struct notifier_block *nb,
- enum fib_event_type event_type)
+ struct notifier_block *nb)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
t_key key = 0;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
- fib_leaf_notify(net, l, tb, nb, event_type);
+ fib_leaf_notify(net, l, tb, nb);
key = l->key + 1;
/* stop in case of wrap around */
@@ -2036,8 +1941,7 @@ static void fib_table_notify(struct net *net, struct fib_table *tb,
}
}
-static void fib_notify(struct net *net, struct notifier_block *nb,
- enum fib_event_type event_type)
+void fib_notify(struct net *net, struct notifier_block *nb)
{
unsigned int h;
@@ -2046,7 +1950,7 @@ static void fib_notify(struct net *net, struct notifier_block *nb,
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist)
- fib_table_notify(net, tb, nb, event_type);
+ fib_table_notify(net, tb, nb);
}
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index fc310db2708b..43318b5f5647 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -464,22 +464,6 @@ out_bh_enable:
local_bh_enable();
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-/* Source and destination is swapped. See ip_multipath_icmp_hash */
-static int icmp_multipath_hash_skb(const struct sk_buff *skb)
-{
- const struct iphdr *iph = ip_hdr(skb);
-
- return fib_multipath_hash(iph->daddr, iph->saddr);
-}
-
-#else
-
-#define icmp_multipath_hash_skb(skb) (-1)
-
-#endif
-
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
@@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
- rt = __ip_route_output_key_hash(net, fl4,
- icmp_multipath_hash_skb(skb_in));
+ rt = __ip_route_output_key_hash(net, fl4, skb_in);
if (IS_ERR(rt))
return rt;
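
The ICMP-private hash helper is gone; the skb is handed down so a common
helper can compute the multipath hash, swapping source and destination for
ICMP errors internally. Assumed signature, introduced elsewhere in this series
and used from fib_semantics.c below:

    int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
                           const struct sk_buff *skb);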
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index c9c1cb635d9a..e90c80a548ad 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -829,7 +829,8 @@ out:
static int ipgre_netlink_parms(struct net_device *dev,
struct nlattr *data[],
struct nlattr *tb[],
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm *parms,
+ __u32 *fwmark)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -886,6 +887,9 @@ static int ipgre_netlink_parms(struct net_device *dev,
t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
}
+ if (data[IFLA_GRE_FWMARK])
+ *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+
return 0;
}
@@ -957,6 +961,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ __u32 fwmark = 0;
int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
@@ -967,31 +972,32 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
return err;
}
- err = ipgre_netlink_parms(dev, data, tb, &p);
+ err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
return err;
- return ip_tunnel_newlink(dev, tb, &p);
+ return ip_tunnel_newlink(dev, tb, &p, fwmark);
}
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
+ struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ __u32 fwmark = t->fwmark;
int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
- struct ip_tunnel *t = netdev_priv(dev);
err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
- err = ipgre_netlink_parms(dev, data, tb, &p);
+ err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
return err;
- return ip_tunnel_changelink(dev, tb, &p);
+ return ip_tunnel_changelink(dev, tb, &p, fwmark);
}
static size_t ipgre_get_size(const struct net_device *dev)
@@ -1029,6 +1035,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(0) +
/* IFLA_GRE_IGNORE_DF */
nla_total_size(1) +
+ /* IFLA_GRE_FWMARK */
+ nla_total_size(4) +
0;
}
@@ -1049,7 +1057,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
nla_put_u8(skb, IFLA_GRE_PMTUDISC,
- !!(p->iph.frag_off & htons(IP_DF))))
+ !!(p->iph.frag_off & htons(IP_DF))) ||
+ nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
goto nla_put_failure;
if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
@@ -1093,6 +1102,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
[IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
[IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 },
+ [IFLA_GRE_FWMARK] = { .type = NLA_U32 },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d6feabb03516..fa2dc8f692c6 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -313,6 +313,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct net_device *dev = skb->dev;
+ void (*edemux)(struct sk_buff *skb);
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -329,8 +330,8 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
int protocol = iph->protocol;
ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot && ipprot->early_demux) {
- ipprot->early_demux(skb);
+ if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
+ edemux(skb);
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 1d46d05efb0f..ec4fe3d4b5c9 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -330,7 +330,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
sent to multicast group to reach destination designated router.
*/
struct ip_ra_chain __rcu *ip_ra_chain;
-static DEFINE_SPINLOCK(ip_ra_lock);
static void ip_ra_destroy_rcu(struct rcu_head *head)
@@ -352,21 +351,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
- spin_lock_bh(&ip_ra_lock);
for (rap = &ip_ra_chain;
- (ra = rcu_dereference_protected(*rap,
- lockdep_is_held(&ip_ra_lock))) != NULL;
+ (ra = rtnl_dereference(*rap)) != NULL;
rap = &ra->next) {
if (ra->sk == sk) {
if (on) {
- spin_unlock_bh(&ip_ra_lock);
kfree(new_ra);
return -EADDRINUSE;
}
/* don't let ip_call_ra_chain() use sk again */
ra->sk = NULL;
RCU_INIT_POINTER(*rap, ra->next);
- spin_unlock_bh(&ip_ra_lock);
if (ra->destructor)
ra->destructor(sk);
@@ -380,17 +375,14 @@ int ip_ra_control(struct sock *sk, unsigned char on,
return 0;
}
}
- if (!new_ra) {
- spin_unlock_bh(&ip_ra_lock);
+ if (!new_ra)
return -ENOBUFS;
- }
new_ra->sk = sk;
new_ra->destructor = destructor;
RCU_INIT_POINTER(new_ra->next, ra);
rcu_assign_pointer(*rap, new_ra);
sock_hold(sk);
- spin_unlock_bh(&ip_ra_lock);
return 0;
}
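
With ip_ra_lock gone, writers to ip_ra_chain are serialized by the RTNL mutex
alone (hence rtnl_dereference() above) while readers keep using RCU. A hedged
sketch of the resulting locking contract:

    /* writer side (ip_ra_control): must run under rtnl_lock() */
    ASSERT_RTNL();
    ra = rtnl_dereference(*rap);

    /* reader side (ip_call_ra_chain): RCU only */
    rcu_read_lock();
    for (ra = rcu_dereference(ip_ra_chain); ra;
         ra = rcu_dereference(ra->next)) {
            /* deliver skb to ra->sk */
    }
    rcu_read_unlock();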
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 823abaef006b..b878ecbc0608 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -293,7 +293,8 @@ failed:
static inline void init_tunnel_flow(struct flowi4 *fl4,
int proto,
__be32 daddr, __be32 saddr,
- __be32 key, __u8 tos, int oif)
+ __be32 key, __u8 tos, int oif,
+ __u32 mark)
{
memset(fl4, 0, sizeof(*fl4));
fl4->flowi4_oif = oif;
@@ -302,6 +303,7 @@ static inline void init_tunnel_flow(struct flowi4 *fl4,
fl4->flowi4_tos = tos;
fl4->flowi4_proto = proto;
fl4->fl4_gre_key = key;
+ fl4->flowi4_mark = mark;
}
static int ip_tunnel_bind_dev(struct net_device *dev)
@@ -322,7 +324,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
iph->saddr, tunnel->parms.o_key,
- RT_TOS(iph->tos), tunnel->parms.link);
+ RT_TOS(iph->tos), tunnel->parms.link,
+ tunnel->fwmark);
rt = ip_route_output_key(tunnel->net, &fl4);
if (!IS_ERR(rt)) {
@@ -578,7 +581,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
}
init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
- RT_TOS(tos), tunnel->parms.link);
+ RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
goto tx_error;
rt = ip_route_output_key(tunnel->net, &fl4);
@@ -707,7 +710,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
- tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
+ tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
+ tunnel->fwmark);
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
goto tx_error;
@@ -795,7 +799,8 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
struct ip_tunnel *t,
struct net_device *dev,
struct ip_tunnel_parm *p,
- bool set_mtu)
+ bool set_mtu,
+ __u32 fwmark)
{
ip_tunnel_del(itn, t);
t->parms.iph.saddr = p->iph.saddr;
@@ -812,10 +817,11 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
t->parms.iph.tos = p->iph.tos;
t->parms.iph.frag_off = p->iph.frag_off;
- if (t->parms.link != p->link) {
+ if (t->parms.link != p->link || t->fwmark != fwmark) {
int mtu;
t->parms.link = p->link;
+ t->fwmark = fwmark;
mtu = ip_tunnel_bind_dev(dev);
if (set_mtu)
dev->mtu = mtu;
@@ -893,7 +899,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
if (t) {
err = 0;
- ip_tunnel_update(itn, t, dev, p, true);
+ ip_tunnel_update(itn, t, dev, p, true, 0);
} else {
err = -ENOENT;
}
@@ -1066,7 +1072,7 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
- struct ip_tunnel_parm *p)
+ struct ip_tunnel_parm *p, __u32 fwmark)
{
struct ip_tunnel *nt;
struct net *net = dev_net(dev);
@@ -1087,6 +1093,7 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
nt->net = net;
nt->parms = *p;
+ nt->fwmark = fwmark;
err = register_netdevice(dev);
if (err)
goto out;
@@ -1105,7 +1112,7 @@ out:
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
- struct ip_tunnel_parm *p)
+ struct ip_tunnel_parm *p, __u32 fwmark)
{
struct ip_tunnel *t;
struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -1137,7 +1144,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
}
}
- ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
+ ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
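
Every outer route lookup now seeds flowi4_mark from the tunnel's configured
fwmark, so ip-rule based policy routing can steer the encapsulated packets.
The effect, condensed from the hunks above:

    struct flowi4 fl4;

    init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                     tunnel->parms.o_key, RT_TOS(tos),
                     tunnel->parms.link, tunnel->fwmark);
    rt = ip_route_output_key(tunnel->net, &fl4); /* honors fl4.flowi4_mark */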
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index a31f47ccaad9..baf196eaf1d8 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -235,7 +235,7 @@ static int ip_tun_build_state(struct nlattr *attr,
struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
int err;
- err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy);
+ err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, NULL);
if (err < 0)
return err;
@@ -332,7 +332,8 @@ static int ip6_tun_build_state(struct nlattr *attr,
struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
int err;
- err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy);
+ err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy,
+ NULL);
if (err < 0)
return err;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 8b14f1404c8f..40977413fd48 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -471,7 +471,8 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
}
static void vti_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm *parms,
+ __u32 *fwmark)
{
memset(parms, 0, sizeof(*parms));
@@ -497,24 +498,29 @@ static void vti_netlink_parms(struct nlattr *data[],
if (data[IFLA_VTI_REMOTE])
parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);
+ if (data[IFLA_VTI_FWMARK])
+ *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
}
static int vti_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct ip_tunnel_parm parms;
+ __u32 fwmark = 0;
- vti_netlink_parms(data, &parms);
- return ip_tunnel_newlink(dev, tb, &parms);
+ vti_netlink_parms(data, &parms, &fwmark);
+ return ip_tunnel_newlink(dev, tb, &parms, fwmark);
}
static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
+ struct ip_tunnel *t = netdev_priv(dev);
+ __u32 fwmark = t->fwmark;
struct ip_tunnel_parm p;
- vti_netlink_parms(data, &p);
- return ip_tunnel_changelink(dev, tb, &p);
+ vti_netlink_parms(data, &p, &fwmark);
+ return ip_tunnel_changelink(dev, tb, &p, fwmark);
}
static size_t vti_get_size(const struct net_device *dev)
@@ -530,6 +536,8 @@ static size_t vti_get_size(const struct net_device *dev)
nla_total_size(4) +
/* IFLA_VTI_REMOTE */
nla_total_size(4) +
+ /* IFLA_VTI_FWMARK */
+ nla_total_size(4) +
0;
}
@@ -543,6 +551,7 @@ static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr);
nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr);
+ nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark);
return 0;
}
@@ -553,6 +562,7 @@ static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
[IFLA_VTI_OKEY] = { .type = NLA_U32 },
[IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
[IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_VTI_FWMARK] = { .type = NLA_U32 },
};
static struct rtnl_link_ops vti_link_ops __read_mostly = {
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index dfb2ab2dd3c8..c3b12b1c7162 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -57,6 +57,7 @@
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/arp.h>
+#include <net/dsa.h>
#include <net/ip.h>
#include <net/ipconfig.h>
#include <net/route.h>
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 00d4229b6954..1e441c6f2160 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -390,7 +390,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
}
static void ipip_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms, bool *collect_md)
+ struct ip_tunnel_parm *parms, bool *collect_md,
+ __u32 *fwmark)
{
memset(parms, 0, sizeof(*parms));
@@ -428,6 +429,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
if (data[IFLA_IPTUN_COLLECT_METADATA])
*collect_md = true;
+
+ if (data[IFLA_IPTUN_FWMARK])
+ *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
}
/* This function returns true when ENCAP attributes are present in the nl msg */
@@ -470,6 +474,7 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ __u32 fwmark = 0;
if (ipip_netlink_encap_parms(data, &ipencap)) {
int err = ip_tunnel_encap_setup(t, &ipencap);
@@ -478,26 +483,27 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
return err;
}
- ipip_netlink_parms(data, &p, &t->collect_md);
- return ip_tunnel_newlink(dev, tb, &p);
+ ipip_netlink_parms(data, &p, &t->collect_md, &fwmark);
+ return ip_tunnel_newlink(dev, tb, &p, fwmark);
}
static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
+ struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
bool collect_md;
+ __u32 fwmark = t->fwmark;
if (ipip_netlink_encap_parms(data, &ipencap)) {
- struct ip_tunnel *t = netdev_priv(dev);
int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
- ipip_netlink_parms(data, &p, &collect_md);
+ ipip_netlink_parms(data, &p, &collect_md, &fwmark);
if (collect_md)
return -EINVAL;
@@ -505,7 +511,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
(!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
return -EINVAL;
- return ip_tunnel_changelink(dev, tb, &p);
+ return ip_tunnel_changelink(dev, tb, &p, fwmark);
}
static size_t ipip_get_size(const struct net_device *dev)
@@ -535,6 +541,8 @@ static size_t ipip_get_size(const struct net_device *dev)
nla_total_size(2) +
/* IFLA_IPTUN_COLLECT_METADATA */
nla_total_size(0) +
+ /* IFLA_IPTUN_FWMARK */
+ nla_total_size(4) +
0;
}
@@ -550,7 +558,8 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
- !!(parm->iph.frag_off & htons(IP_DF))))
+ !!(parm->iph.frag_off & htons(IP_DF))) ||
+ nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark))
goto nla_put_failure;
if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
@@ -585,6 +594,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
[IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG },
+ [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 },
};
static struct rtnl_link_ops ipip_link_ops __read_mostly = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index b036e85e093b..3a02d52ed50e 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -631,7 +631,7 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
in_dev = __in_dev_get_rtnl(dev);
if (in_dev) {
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
- inet_netconf_notify_devconf(dev_net(dev),
+ inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
dev->ifindex, &in_dev->cnf);
ip_rt_multicast_event(in_dev);
@@ -820,8 +820,8 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return -EADDRNOTAVAIL;
}
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
- inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex,
- &in_dev->cnf);
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
ip_rt_multicast_event(in_dev);
/* Fill in the VIF structures */
@@ -1282,7 +1282,8 @@ static void mrtsock_destruct(struct sock *sk)
ipmr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
- inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_MC_FORWARDING,
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
@@ -1343,7 +1344,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
if (ret == 0) {
rcu_assign_pointer(mrt->mroute_sk, sk);
IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
- inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_MC_FORWARDING,
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
}
@@ -2421,7 +2423,8 @@ static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */
static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
struct mfcctl *mfcc, int *mrtsock,
- struct mr_table **mrtret)
+ struct mr_table **mrtret,
+ struct netlink_ext_ack *extack)
{
struct net_device *dev = NULL;
u32 tblid = RT_TABLE_DEFAULT;
@@ -2430,7 +2433,8 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
struct rtmsg *rtm;
int ret, rem;
- ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy);
+ ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy,
+ extack);
if (ret < 0)
goto out;
rtm = nlmsg_data(nlh);
@@ -2489,7 +2493,8 @@ out:
}
/* takes care of both newroute and delroute */
-static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
int ret, mrtsock, parent;
@@ -2498,7 +2503,7 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh)
mrtsock = 0;
tbl = NULL;
- ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl);
+ ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack);
if (ret < 0)
return ret;
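As with the nla_parse_nested() and nlmsg_validate() call sites above, the extra argument here is the new struct netlink_ext_ack, which lets validation hand a descriptive error back to the requester instead of only an errno. A toy sketch of the idea, with invented types standing in for the real extack machinery:

#include <stdio.h>

/* Toy stand-in for struct netlink_ext_ack: carry a message back to
 * the requester alongside the errno. */
struct ext_ack { const char *msg; };

static int validate(int len, struct ext_ack *extack)
{
	if (len < 12) {
		if (extack)
			extack->msg = "message too short";
		return -22;	/* -EINVAL */
	}
	return 0;
}

int main(void)
{
	struct ext_ack ack = { 0 };
	int err = validate(4, &ack);

	if (err)
		fprintf(stderr, "err %d: %s\n", err, ack.msg ? ack.msg : "");
	return 0;
}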
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 6241a81fd7f5..0bc3c3d73e61 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -309,8 +309,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
*/
for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
- struct arpt_entry *e
- = (struct arpt_entry *)(entry0 + pos);
+ struct arpt_entry *e = entry0 + pos;
if (!(valid_hooks & (1 << hook)))
continue;
@@ -354,14 +353,12 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
if (pos == oldpos)
goto next;
- e = (struct arpt_entry *)
- (entry0 + pos);
+ e = entry0 + pos;
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
- e = (struct arpt_entry *)
- (entry0 + pos + size);
+ e = entry0 + pos + size;
if (pos + size >= newinfo->size)
return 0;
e->counters.pcnt = pos;
@@ -376,16 +373,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
if (!xt_find_jump_offset(offsets, newpos,
newinfo->number))
return 0;
- e = (struct arpt_entry *)
- (entry0 + newpos);
+ e = entry0 + newpos;
} else {
/* ... this is a fallthru */
newpos = pos + e->next_offset;
if (newpos >= newinfo->size)
return 0;
}
- e = (struct arpt_entry *)
- (entry0 + newpos);
+ e = entry0 + newpos;
e->counters.pcnt = pos;
pos = newpos;
}
@@ -562,8 +557,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
XT_ERROR_TARGET) == 0)
++newinfo->stacksize;
}
- if (ret != 0)
- goto out_free;
ret = -EINVAL;
if (i != repl->num_entries)
@@ -683,7 +676,7 @@ static int copy_entries_to_user(unsigned int total_size,
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
const struct xt_entry_target *t;
- e = (struct arpt_entry *)(loc_cpu_entry + off);
+ e = loc_cpu_entry + off;
if (copy_to_user(userptr + off, e, sizeof(*e))) {
ret = -EFAULT;
goto free_counters;
@@ -1130,7 +1123,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
int h;
origsize = *size;
- de = (struct arpt_entry *)*dstptr;
+ de = *dstptr;
memcpy(de, e, sizeof(struct arpt_entry));
memcpy(&de->counters, &e->counters, sizeof(e->counters));
@@ -1324,7 +1317,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
int ret;
origsize = *size;
- ce = (struct compat_arpt_entry __user *)*dstptr;
+ ce = *dstptr;
if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 ||
copy_to_user(&ce->counters, &counters[i],
sizeof(counters[i])) != 0)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 384b85713e06..2a55a40211cb 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -382,7 +382,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
to 0 as we leave), and comefrom to save source hook bitmask */
for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
- struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos);
+ struct ipt_entry *e = entry0 + pos;
if (!(valid_hooks & (1 << hook)))
continue;
@@ -424,14 +424,12 @@ mark_source_chains(const struct xt_table_info *newinfo,
if (pos == oldpos)
goto next;
- e = (struct ipt_entry *)
- (entry0 + pos);
+ e = entry0 + pos;
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
- e = (struct ipt_entry *)
- (entry0 + pos + size);
+ e = entry0 + pos + size;
if (pos + size >= newinfo->size)
return 0;
e->counters.pcnt = pos;
@@ -446,16 +444,14 @@ mark_source_chains(const struct xt_table_info *newinfo,
if (!xt_find_jump_offset(offsets, newpos,
newinfo->number))
return 0;
- e = (struct ipt_entry *)
- (entry0 + newpos);
+ e = entry0 + newpos;
} else {
/* ... this is a fallthru */
newpos = pos + e->next_offset;
if (newpos >= newinfo->size)
return 0;
}
- e = (struct ipt_entry *)
- (entry0 + newpos);
+ e = entry0 + newpos;
e->counters.pcnt = pos;
pos = newpos;
}
@@ -834,7 +830,7 @@ copy_entries_to_user(unsigned int total_size,
const struct xt_entry_match *m;
const struct xt_entry_target *t;
- e = (struct ipt_entry *)(loc_cpu_entry + off);
+ e = loc_cpu_entry + off;
if (copy_to_user(userptr + off, e, sizeof(*e))) {
ret = -EFAULT;
goto free_counters;
@@ -1229,7 +1225,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
int ret = 0;
origsize = *size;
- ce = (struct compat_ipt_entry __user *)*dstptr;
+ ce = *dstptr;
if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
copy_to_user(&ce->counters, &counters[i],
sizeof(counters[i])) != 0)
@@ -1366,7 +1362,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
struct xt_entry_match *ematch;
origsize = *size;
- de = (struct ipt_entry *)*dstptr;
+ de = *dstptr;
memcpy(de, e, sizeof(struct ipt_entry));
memcpy(&de->counters, &e->counters, sizeof(e->counters));
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 9b8841316e7b..038f293c2376 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -22,6 +22,7 @@
#include <linux/icmp.h>
#include <linux/if_arp.h>
#include <linux/seq_file.h>
+#include <linux/refcount.h>
#include <linux/netfilter_arp.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
@@ -40,8 +41,8 @@ MODULE_DESCRIPTION("Xtables: CLUSTERIP target");
struct clusterip_config {
struct list_head list; /* list of all configs */
- atomic_t refcount; /* reference count */
- atomic_t entries; /* number of entries/rules
+ refcount_t refcount; /* reference count */
+ refcount_t entries; /* number of entries/rules
* referencing us */
__be32 clusterip; /* the IP address */
@@ -77,7 +78,7 @@ struct clusterip_net {
static inline void
clusterip_config_get(struct clusterip_config *c)
{
- atomic_inc(&c->refcount);
+ refcount_inc(&c->refcount);
}
@@ -89,7 +90,7 @@ static void clusterip_config_rcu_free(struct rcu_head *head)
static inline void
clusterip_config_put(struct clusterip_config *c)
{
- if (atomic_dec_and_test(&c->refcount))
+ if (refcount_dec_and_test(&c->refcount))
call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
}
@@ -103,7 +104,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
local_bh_disable();
- if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
+ if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
list_del_rcu(&c->list);
spin_unlock(&cn->lock);
local_bh_enable();
@@ -149,10 +150,10 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
c = NULL;
else
#endif
- if (unlikely(!atomic_inc_not_zero(&c->refcount)))
+ if (unlikely(!refcount_inc_not_zero(&c->refcount)))
c = NULL;
else if (entry)
- atomic_inc(&c->entries);
+ refcount_inc(&c->entries);
}
rcu_read_unlock_bh();
@@ -188,8 +189,8 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
clusterip_config_init_nodelist(c, i);
c->hash_mode = i->hash_mode;
c->hash_initval = i->hash_initval;
- atomic_set(&c->refcount, 1);
- atomic_set(&c->entries, 1);
+ refcount_set(&c->refcount, 1);
+ refcount_set(&c->entries, 1);
spin_lock_bh(&cn->lock);
if (__clusterip_config_find(net, ip)) {
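The conversion to refcount_t matters most in clusterip_config_find_get(): a lookup must fail rather than revive a config whose count already dropped to zero, and refcount_t additionally saturates and warns on overflow, which plain atomic_t never did. A rough userspace analogue of the inc-not-zero idiom using C11 atomics (not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace sketch of refcount_inc_not_zero(): take a reference only
 * while the object is still live, as a racing lookup must. */
static bool ref_inc_not_zero(atomic_uint *r)
{
	unsigned int old = atomic_load(r);

	do {
		if (old == 0)
			return false;	/* already dying: don't resurrect */
	} while (!atomic_compare_exchange_weak(r, &old, old + 1));
	return true;
}

static bool ref_dec_and_test(atomic_uint *r)
{
	return atomic_fetch_sub(r, 1) == 1;
}

int main(void)
{
	atomic_uint ref = 1;

	printf("lookup: %d\n", ref_inc_not_zero(&ref));	/* 1: got a ref */
	ref_dec_and_test(&ref);
	if (ref_dec_and_test(&ref))
		printf("last ref dropped\n");
	printf("lookup: %d\n", ref_inc_not_zero(&ref));	/* 0: refused */
	return 0;
}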
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 3240a2614e82..af2b69b6895f 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -293,12 +293,16 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_ECN);
synproxy_send_client_synack(net, skb, th, &opts);
- return NF_DROP;
-
+ consume_skb(skb);
+ return NF_STOLEN;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
/* ACK from client */
- synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq));
- return NF_DROP;
+ if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) {
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
}
return XT_CONTINUE;
@@ -367,10 +371,13 @@ static unsigned int ipv4_synproxy_hook(void *priv,
* number match the one of first SYN.
*/
if (synproxy_recv_client_ack(net, skb, th, &opts,
- ntohl(th->seq) + 1))
+ ntohl(th->seq) + 1)) {
this_cpu_inc(snet->stats->cookie_retrans);
-
- return NF_DROP;
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
}
synproxy->isn = ntohl(th->ack_seq);
@@ -409,19 +416,56 @@ static unsigned int ipv4_synproxy_hook(void *priv,
return NF_ACCEPT;
}
+static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
+ {
+ .hook = ipv4_synproxy_hook,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv4_synproxy_hook,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
static int synproxy_tg4_check(const struct xt_tgchk_param *par)
{
+ struct synproxy_net *snet = synproxy_pernet(par->net);
const struct ipt_entry *e = par->entryinfo;
+ int err;
if (e->ip.proto != IPPROTO_TCP ||
e->ip.invflags & XT_INV_PROTO)
return -EINVAL;
- return nf_ct_netns_get(par->net, par->family);
+ err = nf_ct_netns_get(par->net, par->family);
+ if (err)
+ return err;
+
+ if (snet->hook_ref4 == 0) {
+ err = nf_register_net_hooks(par->net, ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
+ }
+ }
+
+ snet->hook_ref4++;
+ return err;
}
static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
{
+ struct synproxy_net *snet = synproxy_pernet(par->net);
+
+ snet->hook_ref4--;
+ if (snet->hook_ref4 == 0)
+ nf_unregister_net_hooks(par->net, ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
nf_ct_netns_put(par->net, par->family);
}
@@ -436,46 +480,14 @@ static struct xt_target synproxy_tg4_reg __read_mostly = {
.me = THIS_MODULE,
};
-static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
- {
- .hook = ipv4_synproxy_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
- {
- .hook = ipv4_synproxy_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
-};
-
static int __init synproxy_tg4_init(void)
{
- int err;
-
- err = nf_register_hooks(ipv4_synproxy_ops,
- ARRAY_SIZE(ipv4_synproxy_ops));
- if (err < 0)
- goto err1;
-
- err = xt_register_target(&synproxy_tg4_reg);
- if (err < 0)
- goto err2;
-
- return 0;
-
-err2:
- nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
-err1:
- return err;
+ return xt_register_target(&synproxy_tg4_reg);
}
static void __exit synproxy_tg4_exit(void)
{
xt_unregister_target(&synproxy_tg4_reg);
- nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
}
module_init(synproxy_tg4_init);
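The net effect: the SYNPROXY hooks are no longer registered for the lifetime of the module but only while at least one rule exists, counted per netns by the new hook_ref4 field (checkentry/destroy are serialized by the xtables mutex, so a plain integer suffices). The register-on-first-use, unregister-on-last idiom, reduced to a standalone sketch with invented names:

#include <stdio.h>

static int hook_ref;	/* per-netns (hook_ref4) in the real code */

static int register_hooks(void)
{
	puts("hooks registered");
	return 0;
}

static void unregister_hooks(void)
{
	puts("hooks unregistered");
}

static int rule_add(void)
{
	if (hook_ref == 0) {		/* first rule: install hooks */
		int err = register_hooks();

		if (err)
			return err;
	}
	hook_ref++;
	return 0;
}

static void rule_del(void)
{
	if (--hook_ref == 0)		/* last rule: tear them down */
		unregister_hooks();
}

int main(void)
{
	rule_add();	/* registers */
	rule_add();	/* only counts */
	rule_del();
	rule_del();	/* unregisters */
	return 0;
}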
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index f0dbff05fc28..39895b9ddeb9 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -69,8 +69,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Avoid counting cloned packets towards the original connection. */
nf_reset(skb);
- nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
- nf_conntrack_get(skb_nfct(skb));
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
#endif
/*
* If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 6f5e8d01b876..feedd759ca80 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -264,13 +264,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
if (!ct)
return NF_ACCEPT;
- /* Don't try to NAT if this packet is not conntracked */
- if (nf_ct_is_untracked(ct))
- return NF_ACCEPT;
-
- nat = nf_ct_nat_ext_add(ct);
- if (nat == NULL)
- return NF_ACCEPT;
+ nat = nfct_nat(ct);
switch (ctinfo) {
case IP_CT_RELATED:
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index ea91058b5f6f..dc1dea15c1b4 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -37,7 +37,6 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
ct = nf_ct_get(skb, &ctinfo);
- nat = nfct_nat(ct);
NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY));
@@ -56,7 +55,9 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
return NF_DROP;
}
- nat->masq_index = out->ifindex;
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat)
+ nat->masq_index = out->ifindex;
/* Transfer from original range. */
memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index b3ca21b2ba9b..8a69363b4884 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -49,9 +49,14 @@ static void pptp_nat_expected(struct nf_conn *ct,
const struct nf_ct_pptp_master *ct_pptp_info;
const struct nf_nat_pptp *nat_pptp_info;
struct nf_nat_range range;
+ struct nf_conn_nat *nat;
+ nat = nf_ct_nat_ext_add(ct);
+ if (WARN_ON_ONCE(!nat))
+ return;
+
+ nat_pptp_info = &nat->help.nat_pptp_info;
ct_pptp_info = nfct_help_data(master);
- nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
/* And here goes the grand finale of corrosion... */
if (exp->dir == IP_CT_DIR_ORIGINAL) {
@@ -120,13 +125,17 @@ pptp_outbound_pkt(struct sk_buff *skb,
{
struct nf_ct_pptp_master *ct_pptp_info;
+ struct nf_conn_nat *nat = nfct_nat(ct);
struct nf_nat_pptp *nat_pptp_info;
u_int16_t msg;
__be16 new_callid;
unsigned int cid_off;
+ if (WARN_ON_ONCE(!nat))
+ return NF_DROP;
+
+ nat_pptp_info = &nat->help.nat_pptp_info;
ct_pptp_info = nfct_help_data(ct);
- nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
new_callid = ct_pptp_info->pns_call_id;
@@ -177,11 +186,11 @@ pptp_outbound_pkt(struct sk_buff *skb,
ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
/* mangle packet */
- if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
- cid_off + sizeof(struct pptp_pkt_hdr) +
- sizeof(struct PptpControlHeader),
- sizeof(new_callid), (char *)&new_callid,
- sizeof(new_callid)) == 0)
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+ cid_off + sizeof(struct pptp_pkt_hdr) +
+ sizeof(struct PptpControlHeader),
+ sizeof(new_callid), (char *)&new_callid,
+ sizeof(new_callid)))
return NF_DROP;
return NF_ACCEPT;
}
@@ -191,11 +200,15 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
struct nf_conntrack_expect *expect_reply)
{
const struct nf_conn *ct = expect_orig->master;
+ struct nf_conn_nat *nat = nfct_nat(ct);
struct nf_ct_pptp_master *ct_pptp_info;
struct nf_nat_pptp *nat_pptp_info;
+ if (WARN_ON_ONCE(!nat))
+ return;
+
+ nat_pptp_info = &nat->help.nat_pptp_info;
ct_pptp_info = nfct_help_data(ct);
- nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
/* save original PAC call ID in nat_info */
nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
@@ -223,11 +236,15 @@ pptp_inbound_pkt(struct sk_buff *skb,
union pptp_ctrl_union *pptpReq)
{
const struct nf_nat_pptp *nat_pptp_info;
+ struct nf_conn_nat *nat = nfct_nat(ct);
u_int16_t msg;
__be16 new_pcid;
unsigned int pcid_off;
- nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+ if (WARN_ON_ONCE(!nat))
+ return NF_DROP;
+
+ nat_pptp_info = &nat->help.nat_pptp_info;
new_pcid = nat_pptp_info->pns_call_id;
switch (msg = ntohs(ctlh->messageType)) {
@@ -271,11 +288,11 @@ pptp_inbound_pkt(struct sk_buff *skb,
pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
- if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
- pcid_off + sizeof(struct pptp_pkt_hdr) +
- sizeof(struct PptpControlHeader),
- sizeof(new_pcid), (char *)&new_pcid,
- sizeof(new_pcid)) == 0)
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+ pcid_off + sizeof(struct pptp_pkt_hdr) +
+ sizeof(struct PptpControlHeader),
+ sizeof(new_pcid), (char *)&new_pcid,
+ sizeof(new_pcid)))
return NF_DROP;
return NF_ACCEPT;
}
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 53e49f5011d3..d5b1e0b3f687 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -827,8 +827,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
return 1;
}
-static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
- struct snmp_request *request)
+static unsigned char noinline_for_stack
+snmp_request_decode(struct asn1_ctx *ctx, struct snmp_request *request)
{
unsigned int cls, con, tag;
unsigned char *end;
@@ -920,10 +920,10 @@ static inline void mangle_address(unsigned char *begin,
}
}
-static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
- struct snmp_v1_trap *trap,
- const struct oct1_map *map,
- __sum16 *check)
+static unsigned char noinline_for_stack
+snmp_trap_decode(struct asn1_ctx *ctx, struct snmp_v1_trap *trap,
+ const struct oct1_map *map,
+ __sum16 *check)
{
unsigned int cls, con, tag, len;
unsigned char *end;
@@ -998,18 +998,6 @@ err_id_free:
*
*****************************************************************************/
-static void hex_dump(const unsigned char *buf, size_t len)
-{
- size_t i;
-
- for (i = 0; i < len; i++) {
- if (i && !(i % 16))
- printk("\n");
- printk("%02x ", *(buf + i));
- }
- printk("\n");
-}
-
/*
* Parse and mangle SNMP message according to mapping.
* (And this is the fucking 'basic' method).
@@ -1026,7 +1014,8 @@ static int snmp_parse_mangle(unsigned char *msg,
struct snmp_object *obj;
if (debug > 1)
- hex_dump(msg, len);
+ print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 1,
+ msg, len, 0);
asn1_open(&ctx, msg, len);
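print_hex_dump() replaces the open-coded hex_dump() with the shared helper, here printing at KERN_DEBUG, 16 single-byte groups per row, no ASCII column. A rough userspace approximation of the output format it produces (the real helper also takes the level, prefix and grouping arguments shown in the hunk):

#include <stdio.h>
#include <stddef.h>

/* Approximates print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE,
 * 16, 1, buf, len, 0): 16 single-byte groups per row, no ASCII. */
static void hex_dump16(const unsigned char *buf, size_t len)
{
	for (size_t i = 0; i < len; i++)
		printf("%02x%c", buf[i], (i % 16 == 15) ? '\n' : ' ');
	if (len % 16)
		putchar('\n');
}

int main(void)
{
	unsigned char msg[20];

	for (size_t i = 0; i < sizeof(msg); i++)
		msg[i] = (unsigned char)i;
	hex_dump16(msg, sizeof(msg));
	return 0;
}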
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 146d86105183..7cd8d0d918f8 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -104,7 +104,6 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
{
struct sk_buff *nskb;
- const struct iphdr *oiph;
struct iphdr *niph;
const struct tcphdr *oth;
struct tcphdr _oth;
@@ -116,8 +115,6 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
return;
- oiph = ip_hdr(oldskb);
-
nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
LL_MAX_HEADER, GFP_ATOMIC);
if (!nskb)
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index a83d558e1aae..e9293bdebba0 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -139,7 +139,7 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
* SNAT-ted connection.
*/
ct = nf_ct_get(skb, &ctinfo);
- if (ct && !nf_ct_is_untracked(ct) &&
+ if (ct &&
((iph->protocol != IPPROTO_ICMP &&
ctinfo == IP_CT_ESTABLISHED_REPLY) ||
(iph->protocol == IPPROTO_ICMP &&
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 2981291910dd..de3681df2ce7 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -90,7 +90,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
- nft_fib_store_result(dest, priv->result, pkt,
+ nft_fib_store_result(dest, priv, pkt,
nft_in(pkt)->ifindex);
return;
}
@@ -99,7 +99,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (ipv4_is_zeronet(iph->saddr)) {
if (ipv4_is_lbcast(iph->daddr) ||
ipv4_is_local_multicast(iph->daddr)) {
- nft_fib_store_result(dest, priv->result, pkt,
+ nft_fib_store_result(dest, priv, pkt,
get_ifindex(pkt->skb->dev));
return;
}
@@ -212,7 +212,7 @@ nft_fib4_select_ops(const struct nft_ctx *ctx,
static struct nft_expr_type nft_fib4_type __read_mostly = {
.name = "fib",
- .select_ops = &nft_fib4_select_ops,
+ .select_ops = nft_fib4_select_ops,
.policy = nft_fib_policy,
.maxattr = NFTA_FIB_MAX,
.family = NFPROTO_IPV4,
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 69cf49e8356d..fa44e752a9a3 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -199,7 +199,6 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
- SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
@@ -282,6 +281,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
+ SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE),
SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 4b7c0ec65251..32a691b7ce2c 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,7 +28,7 @@
#include <linux/spinlock.h>
#include <net/protocol.h>
-const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
EXPORT_SYMBOL(inet_offloads);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d9724889ff09..655d9eebe43e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1250,15 +1250,11 @@ static void set_class_tag(struct rtable *rt, u32 tag)
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
- unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
+ unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
+ unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
+ ip_rt_min_advmss);
- if (advmss == 0) {
- advmss = max_t(unsigned int, dst->dev->mtu - 40,
- ip_rt_min_advmss);
- if (advmss > 65535 - 40)
- advmss = 65535 - 40;
- }
- return advmss;
+ return min(advmss, IPV4_MAX_PMTU - header_size);
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
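The rewritten ipv4_default_advmss() drops the cached-metric branch and clamps in both directions: never below ip_rt_min_advmss, never above the 65535-byte IPv4 limit minus the 40 bytes of TCP/IP headers that the old code hard-coded. The arithmetic, as a standalone check (256 stands in for the usual ip_rt_min_advmss default):

#include <stdio.h>

#define IPV4_MAX_PMTU 65535u	/* maximum IPv4 datagram size */

static unsigned int default_advmss(unsigned int mtu, unsigned int min_advmss)
{
	unsigned int header_size = 20 + 20;	/* tcphdr + iphdr */
	unsigned int advmss = mtu - header_size;

	if (advmss < min_advmss)
		advmss = min_advmss;			/* max_t() */
	if (advmss > IPV4_MAX_PMTU - header_size)
		advmss = IPV4_MAX_PMTU - header_size;	/* min() */
	return advmss;
}

int main(void)
{
	printf("%u\n", default_advmss(1500, 256));	/* 1460 */
	printf("%u\n", default_advmss(100, 256));	/* raised to 256 */
	printf("%u\n", default_advmss(70000, 256));	/* capped at 65495 */
	return 0;
}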
@@ -1734,45 +1730,97 @@ out:
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
/* To make ICMP packets follow the right flow, the multipath hash is
- * calculated from the inner IP addresses in reverse order.
+ * calculated from the inner IP addresses.
*/
-static int ip_multipath_icmp_hash(struct sk_buff *skb)
+static void ip_multipath_l3_keys(const struct sk_buff *skb,
+ struct flow_keys *hash_keys)
{
const struct iphdr *outer_iph = ip_hdr(skb);
- struct icmphdr _icmph;
+ const struct iphdr *inner_iph;
const struct icmphdr *icmph;
struct iphdr _inner_iph;
- const struct iphdr *inner_iph;
+ struct icmphdr _icmph;
+
+ hash_keys->addrs.v4addrs.src = outer_iph->saddr;
+ hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
+ if (likely(outer_iph->protocol != IPPROTO_ICMP))
+ return;
if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
- goto standard_hash;
+ return;
icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
&_icmph);
if (!icmph)
- goto standard_hash;
+ return;
if (icmph->type != ICMP_DEST_UNREACH &&
icmph->type != ICMP_REDIRECT &&
icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB) {
- goto standard_hash;
- }
+ icmph->type != ICMP_PARAMETERPROB)
+ return;
inner_iph = skb_header_pointer(skb,
outer_iph->ihl * 4 + sizeof(_icmph),
sizeof(_inner_iph), &_inner_iph);
if (!inner_iph)
- goto standard_hash;
+ return;
+ hash_keys->addrs.v4addrs.src = inner_iph->saddr;
+ hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
+}
- return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+/* if skb is set it will be used and fl4 can be NULL */
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+ const struct sk_buff *skb)
+{
+ struct net *net = fi->fib_net;
+ struct flow_keys hash_keys;
+ u32 mhash;
-standard_hash:
- return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
-}
+ switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
+ case 0:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (skb) {
+ ip_multipath_l3_keys(skb, &hash_keys);
+ } else {
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ }
+ break;
+ case 1:
+ /* skb is currently provided only when forwarding */
+ if (skb) {
+ unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+ struct flow_keys keys;
+
+ /* short-circuit if we already have L4 hash present */
+ if (skb->l4_hash)
+ return skb_get_hash_raw(skb) >> 1;
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, flag);
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ hash_keys.ports.src = keys.ports.src;
+ hash_keys.ports.dst = keys.ports.dst;
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ } else {
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ hash_keys.ports.src = fl4->fl4_sport;
+ hash_keys.ports.dst = fl4->fl4_dport;
+ hash_keys.basic.ip_proto = fl4->flowi4_proto;
+ }
+ break;
+ }
+ mhash = flow_hash_from_keys(&hash_keys);
+ return mhash >> 1;
+}
+EXPORT_SYMBOL_GPL(fib_multipath_hash);
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
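fib_multipath_hash() now chooses its keys by the fib_multipath_hash_policy sysctl: policy 0 hashes L3 addresses only (with ICMP errors steered by the inner header), policy 1 adds L4 ports and the protocol. A toy model of the key selection and the final right shift; any mixer stands in for flow_hash_from_keys(), which is what the kernel actually uses:

#include <stdio.h>
#include <stdint.h>

struct keys {
	uint32_t saddr, daddr;
	uint16_t sport, dport;
	uint8_t proto;
};

/* Illustrative mixer only; not the kernel's hash. */
static uint32_t mix(uint32_t h, uint32_t v)
{
	h ^= v;
	h *= 0x9e3779b1u;
	return h ^ (h >> 16);
}

static uint32_t multipath_hash(const struct keys *k, int policy)
{
	uint32_t h = 0;

	h = mix(h, k->saddr);
	h = mix(h, k->daddr);
	if (policy == 1) {	/* L4 policy adds ports and protocol */
		h = mix(h, ((uint32_t)k->sport << 16) | k->dport);
		h = mix(h, k->proto);
	}
	return h >> 1;		/* as in the patch: mhash >> 1 */
}

int main(void)
{
	struct keys k = { 0x0a000001, 0x0a000002, 40000, 443, 6 };

	printf("L3  : %08x\n", (unsigned)multipath_hash(&k, 0));
	printf("L3+4: %08x\n", (unsigned)multipath_hash(&k, 1));
	return 0;
}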
static int ip_mkroute_input(struct sk_buff *skb,
@@ -1782,12 +1830,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1) {
- int h;
+ int h = fib_multipath_hash(res->fi, NULL, skb);
- if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
- h = ip_multipath_icmp_hash(skb);
- else
- h = fib_multipath_hash(saddr, daddr);
fib_select_multipath(res, h);
}
#endif
@@ -2203,7 +2247,7 @@ add:
*/
struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
- int mp_hash)
+ const struct sk_buff *skb)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
@@ -2366,7 +2410,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
goto make_route;
}
- fib_select_path(net, &res, fl4, mp_hash);
+ fib_select_path(net, &res, fl4, skb);
dev_out = FIB_RES_DEV(res);
fl4->flowi4_oif = dev_out->ifindex;
@@ -2586,7 +2630,8 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(in_skb->sk);
struct rtmsg *rtm;
@@ -2602,7 +2647,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
u32 table_id = RT_TABLE_MAIN;
kuid_t uid;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
+ extack);
if (err < 0)
goto errout;
@@ -2620,10 +2666,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
skb_reset_mac_header(skb);
skb_reset_network_header(skb);
- /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
- ip_hdr(skb)->protocol = IPPROTO_UDP;
- skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
-
src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2633,6 +2675,15 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
else
uid = (iif ? INVALID_UID : current_uid());
+ /* Bugfix: need to give ip_route_input enough of an IP header to
+ * not gag.
+ */
+ ip_hdr(skb)->protocol = IPPROTO_UDP;
+ ip_hdr(skb)->saddr = src;
+ ip_hdr(skb)->daddr = dst;
+
+ skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
fl4.saddr = src;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d6880a6149ee..86957e9cd6c6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -24,6 +24,7 @@
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
#include <net/ping.h>
+#include <net/protocol.h>
static int zero;
static int one = 1;
@@ -294,6 +295,74 @@ bad_key:
return ret;
}
+static void proc_configure_early_demux(int enabled, int protocol)
+{
+ struct net_protocol *ipprot;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_protocol *ip6prot;
+#endif
+
+ rcu_read_lock();
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot)
+ ipprot->early_demux = enabled ? ipprot->early_demux_handler :
+ NULL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ ip6prot = rcu_dereference(inet6_protos[protocol]);
+ if (ip6prot)
+ ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
+ NULL;
+#endif
+ rcu_read_unlock();
+}
+
+static int proc_tcp_early_demux(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = 0;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (write && !ret) {
+ int enabled = init_net.ipv4.sysctl_tcp_early_demux;
+
+ proc_configure_early_demux(enabled, IPPROTO_TCP);
+ }
+
+ return ret;
+}
+
+static int proc_udp_early_demux(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = 0;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (write && !ret) {
+ int enabled = init_net.ipv4.sysctl_udp_early_demux;
+
+ proc_configure_early_demux(enabled, IPPROTO_UDP);
+ }
+
+ return ret;
+}
+
+static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
+ int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ tcp_fastopen_active_timeout_reset();
+ return ret;
+}
+
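Both handlers share one shape: run proc_dointvec(), and only after a successful write flip the protocol's live early_demux pointer between the saved early_demux_handler and NULL. The control flow, minus the RCU and per-netns plumbing, as an illustrative sketch:

#include <stdio.h>

static void tcp_demux(void)
{
	puts("tcp early demux ran");
}

struct proto {
	void (*early_demux)(void);		/* what the fast path calls */
	void (*early_demux_handler)(void);	/* saved to re-enable later */
};

static struct proto tcp = { tcp_demux, tcp_demux };

/* Mirrors proc_configure_early_demux(): disable by nulling the live
 * pointer, re-enable from the saved handler copy. */
static void configure_early_demux(struct proto *p, int enabled)
{
	p->early_demux = enabled ? p->early_demux_handler : NULL;
}

int main(void)
{
	configure_early_demux(&tcp, 0);
	printf("demux %s\n", tcp.early_demux ? "on" : "off");	/* off */
	configure_early_demux(&tcp, 1);
	if (tcp.early_demux)
		tcp.early_demux();	/* runs again */
	return 0;
}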
static struct ctl_table ipv4_table[] = {
{
.procname = "tcp_timestamps",
@@ -344,6 +413,14 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_tcp_fastopen_key,
},
{
+ .procname = "tcp_fastopen_blackhole_timeout_sec",
+ .data = &sysctl_tcp_fastopen_blackhole_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_tfo_blackhole_detect_timeout,
+ .extra1 = &zero,
+ },
+ {
.procname = "tcp_abort_on_overflow",
.data = &sysctl_tcp_abort_on_overflow,
.maxlen = sizeof(int),
@@ -750,6 +827,20 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "udp_early_demux",
+ .data = &init_net.ipv4.sysctl_udp_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_udp_early_demux
+ },
+ {
+ .procname = "tcp_early_demux",
+ .data = &init_net.ipv4.sysctl_tcp_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_tcp_early_demux
+ },
+ {
.procname = "ip_default_ttl",
.data = &init_net.ipv4.sysctl_ip_default_ttl,
.maxlen = sizeof(int),
@@ -981,13 +1072,6 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_tw_recycle",
- .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_max_syn_backlog",
.data = &init_net.ipv4.sysctl_max_syn_backlog,
.maxlen = sizeof(int),
@@ -1004,6 +1088,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+ {
+ .procname = "fib_multipath_hash_policy",
+ .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif
{
.procname = "ip_unprivileged_port_start",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 40ba4249a586..1e4c76d2b827 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -533,7 +533,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
- } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
/* Active TCP fastopen socket with defer_connect
* Return POLLOUT so application can call write()
* in order for kernel to generate SYN+data
@@ -2296,6 +2296,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
tcp_write_queue_purge(sk);
+ tcp_fastopen_active_disable_ofo_check(sk);
skb_rbtree_purge(&tp->out_of_order_queue);
inet->inet_dport = 0;
@@ -2394,7 +2395,7 @@ static int tcp_repair_options_est(struct tcp_sock *tp,
u16 snd_wscale = opt.opt_val & 0xFFFF;
u16 rcv_wscale = opt.opt_val >> 16;
- if (snd_wscale > 14 || rcv_wscale > 14)
+ if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
return -EFBIG;
tp->rx_opt.snd_wscale = snd_wscale;
@@ -2471,7 +2472,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
/* Values greater than interface MTU won't take effect. However
* at the point when this call is done we typically don't yet
* know which interface is going to be used */
- if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
+ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
err = -EINVAL;
break;
}
@@ -2852,7 +2853,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
info->tcpi_advmss = tp->advmss;
- info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
+ info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
info->tcpi_rcv_space = tp->rcvq_space.space;
info->tcpi_total_retrans = tp->total_retrans;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index c99230efcd52..0683ba447d77 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -72,7 +72,7 @@ MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
module_param(hystart, int, 0644);
MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
module_param(hystart_detect, int, 0644);
-MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
+MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
" 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 8ea4e9787f82..4af82b914dd4 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -341,6 +341,13 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
cookie->len = -1;
return false;
}
+
+ /* Firewall blackhole issue check */
+ if (tcp_fastopen_active_should_disable(sk)) {
+ cookie->len = -1;
+ return false;
+ }
+
if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
cookie->len = -1;
return true;
@@ -380,3 +387,98 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
return false;
}
EXPORT_SYMBOL(tcp_fastopen_defer_connect);
+
+/*
+ * The following code block deals with middlebox issues with TFO:
+ * middlebox firewalls can potentially cause the server's data to be
+ * blackholed after a successful 3WHS using TFO.
+ * The proposed solution is to disable active TFO globally under the
+ * following circumstances:
+ * 1. client side TFO socket receives out of order FIN
+ * 2. client side TFO socket receives out of order RST
+ * We disable active side TFO globally for 1hr at first. Then if it
+ * happens again, we disable it for 2h, then 4h, 8h, ...
+ * And we reset the timeout back to 1hr when we see a successful active
+ * TFO connection with data exchanges.
+ */
+
+/* Default to 1hr */
+unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60;
+static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0);
+static unsigned long tfo_active_disable_stamp __read_mostly;
+
+/* Disable active TFO, bump tfo_active_disable_times and record
+ * the current jiffies stamp
+ */
+void tcp_fastopen_active_disable(struct sock *sk)
+{
+ atomic_inc(&tfo_active_disable_times);
+ tfo_active_disable_stamp = jiffies;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE);
+}
+
+/* Reset tfo_active_disable_times to 0 */
+void tcp_fastopen_active_timeout_reset(void)
+{
+ atomic_set(&tfo_active_disable_times, 0);
+}
+
+/* Calculate timeout for tfo active disable
+ * Return true if we are still in the active TFO disable period
+ * Return false if timeout already expired and we should use active TFO
+ */
+bool tcp_fastopen_active_should_disable(struct sock *sk)
+{
+ int tfo_da_times = atomic_read(&tfo_active_disable_times);
+ int multiplier;
+ unsigned long timeout;
+
+ if (!tfo_da_times)
+ return false;
+
+ /* Limit timeout to max: 2^6 * initial timeout */
+ multiplier = 1 << min(tfo_da_times - 1, 6);
+ timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ;
+ if (time_before(jiffies, tfo_active_disable_stamp + timeout))
+ return true;
+
+ /* Mark check bit so we can check for successful active TFO
+ * condition and reset tfo_active_disable_times
+ */
+ tcp_sk(sk)->syn_fastopen_ch = 1;
+ return false;
+}
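So the disable window doubles with each blackhole event, capped at 64 times the configured base. The timeout computation, verifiable standalone:

#include <stdio.h>

/* Mirrors the timeout math above: 1x, 2x, 4x ... the base timeout,
 * capped at 2^6 = 64x. */
static unsigned long tfo_disable_timeout(int times, unsigned long base_sec)
{
	int exp = times - 1;

	if (exp > 6)
		exp = 6;	/* min(tfo_da_times - 1, 6) */
	return (1UL << exp) * base_sec;
}

int main(void)
{
	unsigned long base = 60 * 60;	/* default: one hour */

	for (int t = 1; t <= 8; t++)
		printf("event %d -> %lus\n", t, tfo_disable_timeout(t, base));
	/* events 7 and 8 both print 230400s: the 64x cap */
	return 0;
}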
+
+/* Disable active TFO if FIN is the only packet in the ofo queue
+ * and no data is received.
+ * Also check if we can reset tfo_active_disable_times if data is
+ * received successfully on a marked active TFO socket opened on
+ * a non-loopback interface.
+ */
+void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct rb_node *p;
+ struct sk_buff *skb;
+ struct dst_entry *dst;
+
+ if (!tp->syn_fastopen)
+ return;
+
+ if (!tp->data_segs_in) {
+ p = rb_first(&tp->out_of_order_queue);
+ if (p && !rb_next(p)) {
+ skb = rb_entry(p, struct sk_buff, rbnode);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ tcp_fastopen_active_disable(sk);
+ return;
+ }
+ }
+ } else if (tp->syn_fastopen_ch &&
+ atomic_read(&tfo_active_disable_times)) {
+ dst = sk_dst_get(sk);
+ if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
+ tcp_fastopen_active_timeout_reset();
+ dst_release(dst);
+ }
+}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 659d1baefb2b..9739962bfb3f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -442,7 +442,8 @@ void tcp_init_buffer_space(struct sock *sk)
tcp_sndbuf_expand(sk);
tp->rcvq_space.space = tp->rcv_wnd;
- tp->rcvq_space.time = tcp_time_stamp;
+ skb_mstamp_get(&tp->tcp_mstamp);
+ tp->rcvq_space.time = tp->tcp_mstamp;
tp->rcvq_space.seq = tp->copied_seq;
maxwin = tcp_full_space(sk);
@@ -518,7 +519,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
*/
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
- u32 new_sample = tp->rcv_rtt_est.rtt;
+ u32 new_sample = tp->rcv_rtt_est.rtt_us;
long m = sample;
if (m == 0)
@@ -548,21 +549,23 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
new_sample = m << 3;
}
- if (tp->rcv_rtt_est.rtt != new_sample)
- tp->rcv_rtt_est.rtt = new_sample;
+ tp->rcv_rtt_est.rtt_us = new_sample;
}
static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
- if (tp->rcv_rtt_est.time == 0)
+ u32 delta_us;
+
+ if (tp->rcv_rtt_est.time.v64 == 0)
goto new_measure;
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
- tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
+ delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time);
+ tcp_rcv_rtt_update(tp, delta_us, 1);
new_measure:
tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
- tp->rcv_rtt_est.time = tcp_time_stamp;
+ tp->rcv_rtt_est.time = tp->tcp_mstamp;
}
static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
@@ -572,7 +575,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
if (tp->rx_opt.rcv_tsecr &&
(TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
- tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
+ tcp_rcv_rtt_update(tp,
+ jiffies_to_usecs(tcp_time_stamp -
+ tp->rx_opt.rcv_tsecr),
+ 0);
}
/*
@@ -585,8 +591,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
int time;
int copied;
- time = tcp_time_stamp - tp->rcvq_space.time;
- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
+ time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time);
+ if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
return;
/* Number of bytes copied to user in last RTT */
@@ -642,7 +648,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
- tp->rcvq_space.time = tcp_time_stamp;
+ tp->rcvq_space.time = tp->tcp_mstamp;
}
/* There is something which you must keep in mind when you analyze the
@@ -1131,7 +1137,6 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
- struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
struct rate_sample *rate;
int flag;
};
@@ -1214,8 +1219,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
- tcp_rack_advance(tp, sacked, end_seq,
- xmit_time, &state->ack_time);
+ tcp_rack_advance(tp, sacked, end_seq, xmit_time);
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
@@ -2760,8 +2764,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
return false;
}
-static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
- const struct skb_mstamp *ack_time)
+static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2769,7 +2772,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
u32 prior_retrans = tp->retrans_out;
- tcp_rack_mark_lost(sk, ack_time);
+ tcp_rack_mark_lost(sk);
if (prior_retrans > tp->retrans_out)
*ack_flag |= FLAG_LOST_RETRANS;
}
@@ -2788,8 +2791,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- bool is_dupack, int *ack_flag, int *rexmit,
- const struct skb_mstamp *ack_time)
+ bool is_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2857,11 +2859,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_try_keep_open(sk);
return;
}
- tcp_rack_identify_loss(sk, ack_flag, ack_time);
+ tcp_rack_identify_loss(sk, ack_flag);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
- tcp_rack_identify_loss(sk, ack_flag, ack_time);
+ tcp_rack_identify_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
return;
@@ -2877,7 +2879,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
- tcp_rack_identify_loss(sk, ack_flag, ack_time);
+ tcp_rack_identify_loss(sk, ack_flag);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
@@ -3059,8 +3061,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt;
- struct skb_mstamp *now = &sack->ack_time;
struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp *now = &tp->tcp_mstamp;
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
bool fully_acked = true;
@@ -3120,8 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, sacked, scb->end_seq,
- &skb->skb_mstamp,
- &sack->ack_time);
+ &skb->skb_mstamp);
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3576,8 +3577,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
- skb_mstamp_get(&sack_state.ack_time);
-
if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -3647,8 +3646,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
- &sack_state.ack_time);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
}
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
@@ -3660,8 +3658,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_schedule_loss_probe(sk);
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
- tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
- sack_state.rate);
+ tcp_rate_gen(sk, delivered, lost, sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);
return 1;
@@ -3669,8 +3666,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
- &sack_state.ack_time);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3691,11 +3687,9 @@ old_ack:
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
- skb_mstamp_get(&sack_state.ack_time);
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
- &sack_state.ack_time);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
tcp_xmit_recovery(sk, rexmit);
}
@@ -3768,11 +3762,12 @@ void tcp_parse_options(const struct sk_buff *skb,
!estab && sysctl_tcp_window_scaling) {
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
- if (snd_wscale > 14) {
- net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
+ if (snd_wscale > TCP_MAX_WSCALE) {
+ net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
__func__,
- snd_wscale);
- snd_wscale = 14;
+ snd_wscale,
+ TCP_MAX_WSCALE);
+ snd_wscale = TCP_MAX_WSCALE;
}
opt_rx->snd_wscale = snd_wscale;
}
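TCP_MAX_WSCALE replaces the magic 14, which is the RFC 7323 limit on the window scale exponent. A standalone check of the clamp and the resulting maximum window:

#include <stdio.h>
#include <stdint.h>

#define TCP_MAX_WSCALE 14	/* RFC 7323 limit on the scale exponent */

static uint32_t effective_window(uint16_t wnd, uint8_t snd_wscale)
{
	if (snd_wscale > TCP_MAX_WSCALE)
		snd_wscale = TCP_MAX_WSCALE;	/* as the parser now does */
	return (uint32_t)wnd << snd_wscale;
}

int main(void)
{
	printf("%u\n", (unsigned)effective_window(65535, 14));	/* ~1 GiB */
	printf("%u\n", (unsigned)effective_window(65535, 20));	/* clamped */
	return 0;
}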
@@ -4007,10 +4002,10 @@ void tcp_reset(struct sock *sk)
/* This barrier is coupled with smp_rmb() in tcp_poll() */
smp_wmb();
+ tcp_done(sk);
+
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_error_report(sk);
-
- tcp_done(sk);
}
/*
@@ -5299,8 +5294,16 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (rst_seq_match)
tcp_reset(sk);
- else
+ else {
+ /* Disable TFO if RST is out-of-order
+ * and no data has been received
+ * for current active TFO socket
+ */
+ if (tp->syn_fastopen && !tp->data_segs_in &&
+ sk->sk_state == TCP_ESTABLISHED)
+ tcp_fastopen_active_disable(sk);
tcp_send_challenge_ack(sk, skb);
+ }
goto discard;
}
@@ -5353,6 +5356,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
+ skb_mstamp_get(&tp->tcp_mstamp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
@@ -5579,10 +5583,6 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
else
tp->pred_flags = 0;
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- }
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -5651,6 +5651,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
+ bool fastopen_fail;
tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
@@ -5754,10 +5755,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_finish_connect(sk, skb);
- if ((tp->syn_fastopen || tp->syn_data) &&
- tcp_rcv_fastopen_synack(sk, skb, &foc))
- return -1;
+ fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
+ tcp_rcv_fastopen_synack(sk, skb, &foc);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+ if (fastopen_fail)
+ return -1;
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
icsk->icsk_ack.pingpong) {
@@ -5911,6 +5917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_SYN_SENT:
tp->rx_opt.saw_tstamp = 0;
+ skb_mstamp_get(&tp->tcp_mstamp);
queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
@@ -5922,6 +5929,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 0;
}
+ skb_mstamp_get(&tp->tcp_mstamp);
tp->rx_opt.saw_tstamp = 0;
req = tp->fastopen_rsk;
if (req) {
@@ -6041,9 +6049,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
}
- if (tp->linger2 < 0 ||
- (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+ if (tp->linger2 < 0) {
+ tcp_done(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ return 1;
+ }
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+		/* Received an out-of-order FIN after close() */
+ if (tp->syn_fastopen && th->fin)
+ tcp_fastopen_active_disable(sk);
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
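[Editor's note] The rewrite above splits one compound condition into two early-return branches so the new TFO check can attach to the data/FIN case alone. A sketch, using a hypothetical struct sample with a stand-in flag seg_has_data_or_fin in place of the kernel's end_seq/rcv_nxt test, showing the split preserves the original predicate:

#include <assert.h>
#include <stdbool.h>

/* Hypothetical stand-ins for the fields the kernel code tests. */
struct sample { int linger2; bool seg_has_data_or_fin; };

static bool old_predicate(const struct sample *s)
{
	return s->linger2 < 0 || s->seg_has_data_or_fin;
}

static bool new_predicate(const struct sample *s)
{
	if (s->linger2 < 0)
		return true;	/* first branch */
	if (s->seg_has_data_or_fin)
		return true;	/* second branch, where TFO is now disabled */
	return false;
}

int main(void)
{
	struct sample cases[] = {
		{ -1, false }, { -1, true }, { 0, false }, { 0, true },
	};
	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		assert(old_predicate(&cases[i]) == new_predicate(&cases[i]));
	return 0;
}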
@@ -6333,36 +6348,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_free;
if (isn && tmp_opt.tstamp_ok)
- af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+ af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off);
if (!want_cookie && !isn) {
- /* VJ's idea. We save last timestamp seen
- * from the destination in peer table, when entering
- * state TIME-WAIT, and check against it before
- * accepting new connection request.
- *
- * If "isn" is not zero, this request hit alive
- * timewait bucket, so that all the necessary checks
- * are made in the function processing timewait state.
- */
- if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
- bool strict;
-
- dst = af_ops->route_req(sk, &fl, req, &strict);
-
- if (dst && strict &&
- !tcp_peer_is_proven(req, dst, true,
- tmp_opt.saw_tstamp)) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
- goto drop_and_release;
- }
- }
/* Kill the following clause, if you dislike this way. */
- else if (!net->ipv4.sysctl_tcp_syncookies &&
- (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
- (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
- !tcp_peer_is_proven(req, dst, false,
- tmp_opt.saw_tstamp)) {
+ if (!net->ipv4.sysctl_tcp_syncookies &&
+ (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
+ !tcp_peer_is_proven(req, dst)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
@@ -6375,10 +6368,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_release;
}
- isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+ isn = af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off);
}
if (!dst) {
- dst = af_ops->route_req(sk, &fl, req, NULL);
+ dst = af_ops->route_req(sk, &fl, req);
if (!dst)
goto drop_and_free;
}
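[Editor's note] With tw_recycle gone, the remaining heuristic in tcp_conn_request() drops unproven peers once fewer than a quarter of the SYN backlog slots remain and syncookies are off. A small sketch of that arithmetic, assuming a backlog limit of 1024 for the example:

#include <stdio.h>

/* Mirrors the pressure test above: reject unproven peers once free
 * slots drop below a quarter of max_syn_backlog.
 */
static int backlog_under_pressure(int max_syn_backlog, int queue_len)
{
	return max_syn_backlog - queue_len < (max_syn_backlog >> 2);
}

int main(void)
{
	int max = 1024;	/* assumed sysctl_max_syn_backlog for the example */

	printf("len=100 -> pressure=%d\n", backlog_under_pressure(max, 100));	/* 0: 924 >= 256 */
	printf("len=800 -> pressure=%d\n", backlog_under_pressure(max, 800));	/* 1: 224 < 256 */
	return 0;
}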
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 575e19dcc017..cbbafe546c0f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -94,12 +94,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
+static u32 tcp_v4_init_seq_and_tsoff(const struct sk_buff *skb, u32 *tsoff)
{
- return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
- ip_hdr(skb)->saddr,
- tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source, tsoff);
+ return secure_tcp_seq_and_tsoff(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source, tsoff);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -198,10 +198,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->write_seq = 0;
}
- if (tcp_death_row->sysctl_tw_recycle &&
- !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
- tcp_fetch_timewait_stamp(sk, &rt->dst);
-
inet->inet_dport = usin->sin_port;
sk_daddr_set(sk, daddr);
@@ -236,11 +232,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
rt = NULL;
if (likely(!tp->repair)) {
- seq = secure_tcp_sequence_number(inet->inet_saddr,
- inet->inet_daddr,
- inet->inet_sport,
- usin->sin_port,
- &tp->tsoffset);
+ seq = secure_tcp_seq_and_tsoff(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port,
+ &tp->tsoffset);
if (!tp->write_seq)
tp->write_seq = seq;
}
@@ -1217,19 +1213,9 @@ static void tcp_v4_init_req(struct request_sock *req,
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
struct flowi *fl,
- const struct request_sock *req,
- bool *strict)
+ const struct request_sock *req)
{
- struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
-
- if (strict) {
- if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
- *strict = true;
- else
- *strict = false;
- }
-
- return dst;
+ return inet_csk_route_req(sk, &fl->u.ip4, req);
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
@@ -1253,7 +1239,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.cookie_init_seq = cookie_v4_init_sequence,
#endif
.route_req = tcp_v4_route_req,
- .init_seq = tcp_v4_init_sequence,
+ .init_seq_tsoff = tcp_v4_init_seq_and_tsoff,
.send_synack = tcp_v4_send_synack,
};
@@ -1423,8 +1409,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
if (!nsk)
goto discard;
if (nsk != sk) {
- sock_rps_save_rxhash(nsk, skb);
- sk_mark_napi_id(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
@@ -1871,6 +1855,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
	/* Clean up the write buffer. */
tcp_write_queue_purge(sk);
+ /* Check if we want to disable active TFO */
+ tcp_fastopen_active_disable_ofo_check(sk);
+
/* Cleans up our, hopefully empty, out_of_order_queue. */
skb_rbtree_purge(&tp->out_of_order_queue);
@@ -2466,7 +2453,6 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_tw_reuse = 0;
cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 046fd3910873..d6fb6c067af4 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -264,13 +264,15 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct tcp_sock *tp = tcp_sk(sk);
struct lp *lp = inet_csk_ca(sk);
+ u32 delta;
if (sample->rtt_us > 0)
tcp_lp_rtt_sample(sk, sample->rtt_us);
/* calc inference */
- if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
- lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr);
+ delta = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+ if ((s32)delta > 0)
+ lp->inference = 3 * delta;
/* test if within inference */
if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference))
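[Editor's note] The tcp_lp fix matters because tcp_time_stamp and rcv_tsecr are 32-bit values that wrap; a direct `a > b` comparison misorders samples across the wrap, while computing the unsigned delta and sign-testing it as s32 does not. A minimal demonstration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t now = 5;		/* tcp_time_stamp just after wrap */
	uint32_t echoed = 0xfffffffbU;	/* rcv_tsecr taken just before wrap */
	uint32_t delta = now - echoed;	/* well-defined unsigned wraparound: 10 */

	/* Old test: wrongly concludes "now" is not after "echoed". */
	printf("now > echoed       : %d\n", now > echoed);		/* 0 */
	/* New test: the signed view of the delta recovers the order. */
	printf("(int32_t)delta > 0 : %d\n", (int32_t)delta > 0);	/* 1 */
	return 0;
}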
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0f46e5fe31ad..9d0d4f39e42b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -45,8 +45,6 @@ struct tcp_metrics_block {
struct inetpeer_addr tcpm_saddr;
struct inetpeer_addr tcpm_daddr;
unsigned long tcpm_stamp;
- u32 tcpm_ts;
- u32 tcpm_ts_stamp;
u32 tcpm_lock;
u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
struct tcp_fastopen_metrics tcpm_fastopen;
@@ -123,8 +121,6 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
- tm->tcpm_ts = 0;
- tm->tcpm_ts_stamp = 0;
if (fastopen_clear) {
tm->tcpm_fastopen.mss = 0;
tm->tcpm_fastopen.syn_loss = 0;
@@ -273,48 +269,6 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
return tm;
}
-static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
-{
- struct tcp_metrics_block *tm;
- struct inetpeer_addr saddr, daddr;
- unsigned int hash;
- struct net *net;
-
- if (tw->tw_family == AF_INET) {
- inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
- inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
- hash = ipv4_addr_hash(tw->tw_daddr);
- }
-#if IS_ENABLED(CONFIG_IPV6)
- else if (tw->tw_family == AF_INET6) {
- if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
- inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
- inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
- hash = ipv4_addr_hash(tw->tw_daddr);
- } else {
- inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr);
- inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr);
- hash = ipv6_addr_hash(&tw->tw_v6_daddr);
- }
- }
-#endif
- else
- return NULL;
-
- net = twsk_net(tw);
- hash ^= net_hash_mix(net);
- hash = hash_32(hash, tcp_metrics_hash_log);
-
- for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
- tm = rcu_dereference(tm->tcpm_next)) {
- if (addr_same(&tm->tcpm_saddr, &saddr) &&
- addr_same(&tm->tcpm_daddr, &daddr) &&
- net_eq(tm_net(tm), net))
- break;
- }
- return tm;
-}
-
static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
struct dst_entry *dst,
bool create)
@@ -573,8 +527,7 @@ reset:
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
- bool paws_check, bool timestamps)
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
{
struct tcp_metrics_block *tm;
bool ret;
@@ -584,94 +537,10 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
rcu_read_lock();
tm = __tcp_get_metrics_req(req, dst);
- if (paws_check) {
- if (tm &&
- (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
- ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
- !timestamps))
- ret = false;
- else
- ret = true;
- } else {
- if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
- ret = true;
- else
- ret = false;
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
-{
- struct tcp_metrics_block *tm;
-
- rcu_read_lock();
- tm = tcp_get_metrics(sk, dst, true);
- if (tm) {
- struct tcp_sock *tp = tcp_sk(sk);
-
- if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
- tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
- tp->rx_opt.ts_recent = tm->tcpm_ts;
- }
- }
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
-
-/* VJ's idea. Save last timestamp seen from this destination and hold
- * it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter
- * synchronized state.
- */
-bool tcp_remember_stamp(struct sock *sk)
-{
- struct dst_entry *dst = __sk_dst_get(sk);
- bool ret = false;
-
- if (dst) {
- struct tcp_metrics_block *tm;
-
- rcu_read_lock();
- tm = tcp_get_metrics(sk, dst, true);
- if (tm) {
- struct tcp_sock *tp = tcp_sk(sk);
-
- if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
- ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
- tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
- tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
- tm->tcpm_ts = tp->rx_opt.ts_recent;
- }
- ret = true;
- }
- rcu_read_unlock();
- }
- return ret;
-}
-
-bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
-{
- struct tcp_metrics_block *tm;
- bool ret = false;
-
- rcu_read_lock();
- tm = __tcp_get_metrics_tw(tw);
- if (tm) {
- const struct tcp_timewait_sock *tcptw;
- struct sock *sk = (struct sock *) tw;
-
- tcptw = tcp_twsk(sk);
- if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
- ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
- tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
- tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
- tm->tcpm_ts = tcptw->tw_ts_recent;
- }
+ if (tm && tcp_metric_get(tm, TCP_METRIC_RTT))
ret = true;
- }
+ else
+ ret = false;
rcu_read_unlock();
return ret;
@@ -791,14 +660,6 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
jiffies - tm->tcpm_stamp,
TCP_METRICS_ATTR_PAD) < 0)
goto nla_put_failure;
- if (tm->tcpm_ts_stamp) {
- if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
- (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
- goto nla_put_failure;
- if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
- tm->tcpm_ts) < 0)
- goto nla_put_failure;
- }
{
int n = 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 65c0f3d13eca..8f6373b0cd77 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -26,6 +26,7 @@
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
+#include <net/busy_poll.h>
int sysctl_tcp_abort_on_overflow __read_mostly;
@@ -94,7 +95,6 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_options_received tmp_opt;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
bool paws_reject = false;
- struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
@@ -149,12 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
}
- if (tcp_death_row->sysctl_tw_recycle &&
- tcptw->tw_ts_recent_stamp &&
- tcp_tw_remember_stamp(tw))
- inet_twsk_reschedule(tw, tw->tw_timeout);
- else
- inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
return TCP_TW_ACK;
}
@@ -259,12 +254,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
- bool recycle_ok = false;
struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
- if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
- recycle_ok = tcp_remember_stamp(sk);
-
tw = inet_twsk_alloc(sk, tcp_death_row, state);
if (tw) {
@@ -317,13 +308,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (timeo < rto)
timeo = rto;
- if (recycle_ok) {
- tw->tw_timeout = rto;
- } else {
- tw->tw_timeout = TCP_TIMEWAIT_LEN;
- if (state == TCP_TIME_WAIT)
- timeo = TCP_TIMEWAIT_LEN;
- }
+ tw->tw_timeout = TCP_TIMEWAIT_LEN;
+ if (state == TCP_TIME_WAIT)
+ timeo = TCP_TIMEWAIT_LEN;
inet_twsk_schedule(tw, timeo);
/* Linkage updates. */
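[Editor's note] The `rto` compared against `timeo` above is derived earlier in tcp_time_wait(), outside this hunk, as roughly 3.5x the retransmission timeout using shifts. A quick standalone check of that shift arithmetic (the formula is an assumption based on the surrounding kernel code, not shown in this diff):

#include <assert.h>

int main(void)
{
	unsigned int icsk_rto = 200;	/* example RTO in jiffies */
	/* (rto << 2) - (rto >> 1) == 4*rto - rto/2 == 3.5*rto */
	unsigned int rto = (icsk_rto << 2) - (icsk_rto >> 1);

	assert(rto == 700);
	return 0;
}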
@@ -813,6 +800,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
int ret = 0;
int state = child->sk_state;
+	/* Record the NAPI ID of the child socket */
+ sk_mark_napi_id(child, skb);
+
tcp_segs_in(tcp_sk(child), skb);
if (!sock_owned_by_user(child)) {
ret = tcp_rcv_state_process(child, skb);
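[Editor's note] Recording the NAPI ID in tcp_child_process() (rather than in tcp_v4_do_rcv(); see the tcp_ipv4.c hunk above) tags passively opened sockets before the first read. Userspace can observe the tag via SO_INCOMING_NAPI_ID; a hedged sketch, assuming a Linux with that option available (it lands in the same 4.12 timeframe as this series) and a connected TCP socket fd:

#include <stdio.h>
#include <sys/socket.h>

/* Print the NAPI ID the kernel recorded for a connected socket.
 * Returns 0 on success, -1 on error.
 */
static int print_napi_id(int fd)
{
	unsigned int napi_id = 0;
	socklen_t len = sizeof(napi_id);

	if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len) < 0) {
		perror("getsockopt(SO_INCOMING_NAPI_ID)");
		return -1;
	}
	printf("napi_id=%u\n", napi_id);	/* 0 means no NAPI ID recorded */
	return 0;
}

int main(void)
{
	return print_napi_id(0);	/* replace 0 with a real socket fd */
}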
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a85d863c4419..60111a0fc201 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -212,12 +212,12 @@ void tcp_select_initial_window(int __space, __u32 mss,
/* If no clamp set the clamp to the max possible scaled window */
if (*window_clamp == 0)
- (*window_clamp) = (65535 << 14);
+ (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
space = min(*window_clamp, space);
/* Quantize space offering to a multiple of mss if possible. */
if (space > mss)
- space = (space / mss) * mss;
+ space = rounddown(space, mss);
/* NOTE: offering an initial window larger than 32767
* will break some buggy TCP stacks. If the admin tells us
@@ -234,13 +234,11 @@ void tcp_select_initial_window(int __space, __u32 mss,
(*rcv_wscale) = 0;
if (wscale_ok) {
- /* Set window scaling on max possible window
- * See RFC1323 for an explanation of the limit to 14
- */
+ /* Set window scaling on max possible window */
space = max_t(u32, space, sysctl_tcp_rmem[2]);
space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
- while (space > 65535 && (*rcv_wscale) < 14) {
+ while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
space >>= 1;
(*rcv_wscale)++;
}
@@ -253,7 +251,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
}
/* Set the clamp no higher than max representable value */
- (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
+ (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
}
EXPORT_SYMBOL(tcp_select_initial_window);
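[Editor's note] The rewritten tcp_select_initial_window() hunks replace magic numbers with U16_MAX and TCP_MAX_WSCALE and open-coded division with rounddown(). A compact sketch of the scale-selection loop, with both macros and rounddown() re-defined locally (mirroring the kernel headers) so it compiles standalone:

#include <stdio.h>
#include <stdint.h>

#define U16_MAX 0xffffU
#define TCP_MAX_WSCALE 14U

/* rounddown(x, y) for the multiple-of-mss quantization. */
static uint32_t rounddown(uint32_t x, uint32_t y)
{
	return x - (x % y);
}

/* Pick the smallest rcv_wscale that makes `space` representable in a
 * 16-bit window field, as the loop in tcp_select_initial_window() does.
 */
static unsigned int pick_wscale(uint32_t space)
{
	unsigned int wscale = 0;

	while (space > U16_MAX && wscale < TCP_MAX_WSCALE) {
		space >>= 1;
		wscale++;
	}
	return wscale;
}

int main(void)
{
	printf("wscale for 6MB buffer: %u\n", pick_wscale(6 * 1024 * 1024));	/* 7 */
	printf("rounddown(100000, 1460) = %u\n", rounddown(100000, 1460));	/* 99280 */
	return 0;
}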
@@ -2566,7 +2564,6 @@ u32 __tcp_select_window(struct sock *sk)
/* Don't do rounding if we are using window scaling, since the
* scaled window will not line up with the MSS boundary anyway.
*/
- window = tp->rcv_wnd;
if (tp->rx_opt.rcv_wscale) {
window = free_space;
@@ -2574,10 +2571,9 @@ u32 __tcp_select_window(struct sock *sk)
* Important case: prevent zero window announcement if
* 1<<rcv_wscale > mss.
*/
- if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
- window = (((window >> tp->rx_opt.rcv_wscale) + 1)
- << tp->rx_opt.rcv_wscale);
+ window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
} else {
+ window = tp->rcv_wnd;
/* Get the largest window that is a nice multiple of mss.
* Window clamp already applied above.
* If our current window offering is within 1 mss of the
@@ -2587,7 +2583,7 @@ u32 __tcp_select_window(struct sock *sk)
* is too small.
*/
if (window <= free_space - mss || window > free_space)
- window = (free_space / mss) * mss;
+ window = rounddown(free_space, mss);
else if (mss == full_space &&
free_space > window + (full_space >> 1))
window = free_space;
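[Editor's note] The __tcp_select_window() change swaps an open-coded round-up for ALIGN(). A small exhaustive check that ALIGN(w, 1 << wscale), with the macro written as in linux/kernel.h for power-of-two alignments, matches the old shift sequence:

#include <assert.h>
#include <stdint.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))	/* a must be a power of 2 */

static uint32_t old_round_up(uint32_t window, unsigned int wscale)
{
	if (((window >> wscale) << wscale) != window)
		window = ((window >> wscale) + 1) << wscale;
	return window;
}

int main(void)
{
	unsigned int wscale = 7;	/* example rcv_wscale */

	for (uint32_t w = 0; w < 100000; w++)
		assert(old_round_up(w, wscale) == ALIGN(w, 1U << wscale));
	return 0;
}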
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index 9be1581a5a08..c6a9fa894646 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
/* Update the connection delivery information and generate a rate sample. */
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
- struct skb_mstamp *now, struct rate_sample *rs)
+ struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 snd_us, ack_us;
@@ -120,7 +120,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
* to carry current time, flags, stats like "tcp_sacktag_state".
*/
if (delivered)
- tp->delivered_mstamp = *now;
+ tp->delivered_mstamp = tp->tcp_mstamp;
rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
rs->losses = lost; /* freshly marked lost */
@@ -138,7 +138,8 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
* longer phase.
*/
snd_us = rs->interval_us; /* send phase */
- ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */
+ ack_us = skb_mstamp_us_delta(&tp->tcp_mstamp,
+ &rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);
/* Normally we expect interval_us >= min-rtt.
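[Editor's note] tcp_rate_gen() now reads the ACK time from tp->tcp_mstamp instead of a caller-supplied timestamp; the sampled rate itself is unchanged, taking the longer of the send and ACK phases as the interval. A toy computation under assumed numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t snd_us = 4000;		/* assumed send-phase duration */
	uint32_t ack_us = 6000;		/* assumed ack-phase duration */
	uint32_t delivered = 10;	/* packets newly ACKed/SACKed */
	uint32_t mss = 1460;

	/* Use the longer phase so bursts don't inflate the estimate. */
	uint32_t interval_us = snd_us > ack_us ? snd_us : ack_us;
	uint64_t rate_bps = (uint64_t)delivered * mss * 8 * 1000000 / interval_us;

	printf("interval=%u us, rate ~= %llu bit/s\n",
	       interval_us, (unsigned long long)rate_bps);	/* ~19.5 Mbit/s */
	return 0;
}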
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index d8acbd9f477a..362b8c75bfab 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -45,8 +45,7 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
* or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
* make us enter the CA_Recovery state.
*/
-static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
- u32 *reo_timeout)
+static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -79,7 +78,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
* A packet is lost if its elapsed time is beyond
* the recent RTT plus the reordering window.
*/
- u32 elapsed = skb_mstamp_us_delta(now,
+ u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp,
&skb->skb_mstamp);
s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
@@ -105,7 +104,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
}
}
-void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
+void tcp_rack_mark_lost(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout;
@@ -115,7 +114,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
- tcp_rack_detect_loss(sk, now, &timeout);
+ tcp_rack_detect_loss(sk, &timeout);
if (timeout) {
timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
@@ -128,8 +127,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
* draft-cheng-tcpm-rack-00.txt
*/
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
- const struct skb_mstamp *xmit_time,
- const struct skb_mstamp *ack_time)
+ const struct skb_mstamp *xmit_time)
{
u32 rtt_us;
@@ -138,7 +136,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
end_seq, tp->rack.end_seq))
return;
- rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
+ rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time);
if (sacked & TCPCB_RETRANS) {
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior
@@ -165,12 +163,11 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
void tcp_rack_reo_timeout(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct skb_mstamp now;
u32 timeout, prior_inflight;
- skb_mstamp_get(&now);
prior_inflight = tcp_packets_in_flight(tp);
- tcp_rack_detect_loss(sk, &now, &timeout);
+ skb_mstamp_get(&tp->tcp_mstamp);
+ tcp_rack_detect_loss(sk, &timeout);
if (prior_inflight != tcp_packets_in_flight(tp)) {
if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
tcp_enter_recovery(sk, false);
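[Editor's note] tcp_rack_detect_loss() now reads "now" from tp->tcp_mstamp, set once per incoming ACK (see the skb_mstamp_get() calls added in tcp_input.c). The loss test itself is unchanged: a packet is lost once its elapsed time exceeds the recent RTT plus the reordering window. A sketch with hypothetical sample values:

#include <stdio.h>
#include <stdint.h>

/* Simplified form of the test in tcp_rack_detect_loss(). */
static int rack_lost(uint32_t elapsed_us, uint32_t rtt_us, uint32_t reo_wnd_us)
{
	return elapsed_us > rtt_us + reo_wnd_us;
}

int main(void)
{
	uint32_t rtt_us = 20000, reo_wnd_us = 5000;	/* assumed: 20 ms RTT, 5 ms window */

	printf("elapsed 18 ms: lost=%d\n", rack_lost(18000, rtt_us, reo_wnd_us));	/* 0 */
	printf("elapsed 30 ms: lost=%d\n", rack_lost(30000, rtt_us, reo_wnd_us));	/* 1 */
	return 0;
}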
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b2ab411c6d37..14672543cf0b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -201,11 +201,10 @@ static int tcp_write_timeout(struct sock *sk)
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) {
/* Some middle-boxes may black-hole Fast Open _after_
* the handshake. Therefore we conservatively disable
- * Fast Open on this path on recurring timeouts with
- * few or zero bytes acked after Fast Open.
+ * Fast Open on this path on recurring timeouts after
+ * successful Fast Open.
*/
- if (tp->syn_data_acked &&
- tp->bytes_acked <= tp->rx_opt.mss_clamp) {
+ if (tp->syn_data_acked) {
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
NET_INC_STATS(sock_net(sk),
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index fed66dc0e0f5..9775453b8d17 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -265,8 +265,8 @@ static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr,
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
info->vegas.tcpv_enabled = 1;
info->vegas.tcpv_rttcnt = 0;
- info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt),
- info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
+ info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt);
+ info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
*attr = INET_DIAG_VEGASINFO;
return sizeof(struct tcpvegas_info);
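[Editor's note] The westwood hunk replaces trailing commas with semicolons. The comma operator made the two assignments one expression statement, which is harmless here because '=' binds tighter than ',', but it is a classic trap once the code is rearranged. A tiny illustration:

#include <assert.h>

int main(void)
{
	int a, b;

	a = 1, b = 2;	/* parses as (a = 1), (b = 2): both assignments happen */
	assert(a == 1 && b == 2);

	int c = (1, 2);	/* value of a comma expression is its last operand */
	assert(c == 2);
	return 0;
}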
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index 4acc0508c5eb..3d36644890bb 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -12,6 +12,7 @@
#include <net/dst.h>
#include <net/ip.h>
#include <net/xfrm.h>
+#include <net/protocol.h>
/* Add encapsulation header.
*
@@ -23,6 +24,8 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
struct iphdr *iph = ip_hdr(skb);
int ihl = iph->ihl * 4;
+ skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+
skb_set_network_header(skb, -x->props.header_len);
skb->mac_header = skb->network_header +
offsetof(struct iphdr, protocol);
@@ -56,9 +59,40 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
return 0;
}
+static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ const struct net_offload *ops;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ skb->transport_header += x->props.header_len;
+ ops = rcu_dereference(inet_offloads[xo->proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
+static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ skb_reset_mac_len(skb);
+ pskb_pull(skb, skb->mac_len + sizeof(struct iphdr) + x->props.header_len);
+
+ if (xo->flags & XFRM_GSO_SEGMENT) {
+ skb_reset_transport_header(skb);
+ skb->transport_header -= x->props.header_len;
+ }
+}
+
static struct xfrm_mode xfrm4_transport_mode = {
.input = xfrm4_transport_input,
.output = xfrm4_transport_output,
+ .gso_segment = xfrm4_transport_gso_segment,
+ .xmit = xfrm4_transport_xmit,
.owner = THIS_MODULE,
.encap = XFRM_MODE_TRANSPORT,
};
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 35feda676464..e6265e2c274e 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -33,6 +33,9 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
struct iphdr *top_iph;
int flags;
+ skb_set_inner_network_header(skb, skb_network_offset(skb));
+ skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+
skb_set_network_header(skb, -x->props.header_len);
skb->mac_header = skb->network_header +
offsetof(struct iphdr, protocol);
@@ -96,11 +99,36 @@ out:
return err;
}
+static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ __skb_push(skb, skb->mac_len);
+ return skb_mac_gso_segment(skb, features);
+}
+
+static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ if (xo->flags & XFRM_GSO_SEGMENT) {
+ skb->network_header = skb->network_header - x->props.header_len;
+ skb->transport_header = skb->network_header +
+ sizeof(struct iphdr);
+ }
+
+ skb_reset_mac_len(skb);
+ pskb_pull(skb, skb->mac_len + x->props.header_len);
+}
+
static struct xfrm_mode xfrm4_tunnel_mode = {
.input2 = xfrm4_mode_tunnel_input,
.input = xfrm_prepare_input,
.output2 = xfrm4_mode_tunnel_output,
.output = xfrm4_prepare_output,
+ .gso_segment = xfrm4_mode_tunnel_gso_segment,
+ .xmit = xfrm4_mode_tunnel_xmit,
.owner = THIS_MODULE,
.encap = XFRM_MODE_TUNNEL,
.flags = XFRM_MODE_FLAG_TUNNEL,
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 7ee6518afa86..94b8702603bc 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -29,7 +29,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
goto out;
mtu = dst_mtu(skb_dst(skb));
- if (skb->len > mtu) {
+ if ((!skb_is_gso(skb) && skb->len > mtu) ||
+ (skb_is_gso(skb) && skb_gso_network_seglen(skb) > ip_skb_dst_mtu(skb->sk, skb))) {
skb->protocol = htons(ETH_P_IP);
if (skb->sk)